btVector3.cpp

00001 /*
00002  Copyright (c) 2011 Apple Inc.
00003  http://continuousphysics.com/Bullet/
00004  
00005  This software is provided 'as-is', without any express or implied warranty.
00006  In no event will the authors be held liable for any damages arising from the use of this software.
00007  Permission is granted to anyone to use this software for any purpose, 
00008  including commercial applications, and to alter it and redistribute it freely, 
00009  subject to the following restrictions:
00010  
00011  1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
00012  2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
00013  3. This notice may not be removed or altered from any source distribution.
00014  
00015  This source version has been altered.
00016  */
00017 
00018 #if defined (_WIN32) || defined (__i386__)
00019 #define BT_USE_SSE_IN_API
00020 #endif
00021 
00022 #include "btVector3.h"
00023 
00024 #if defined (BT_USE_SSE) || defined (BT_USE_NEON)
00025 
00026 #ifdef __APPLE__
00027 #include <stdint.h>
00028 typedef  float float4 __attribute__ ((vector_size(16)));
00029 #else
00030 #define float4 __m128
00031 #endif
00032 //typedef  uint32_t uint4 __attribute__ ((vector_size(16)));
00033 
00034 
00035 #if defined BT_USE_SSE || defined _WIN32
00036 
00037 #define LOG2_ARRAY_SIZE     6
00038 #define STACK_ARRAY_COUNT   (1UL << LOG2_ARRAY_SIZE)
00039 
00040 #include <emmintrin.h>
#include <string.h>     // memset() used by the DEBUG-only stack_array initialisation below
00041 
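// Illustrative scalar reference (a sketch, not part of the original Bullet source; it is
// compiled out). It shows what _maxdot_large()/_mindot_large() below compute: each input
// point is a 16-byte xyzw float4, only x, y and z take part in the dot product, the index
// of the first point attaining the extreme dot product is returned, and the extreme value
// itself is written to *dotResult.
#if 0
static long maxdot_reference( const float *vv, const float *vec, unsigned long count, float *dotResult )
{
    long  best    = -1L;
    float bestDot = -BT_INFINITY;
    for( unsigned long i = 0; i < count; i++ )
    {
        const float *p = vv + 4*i;                          // points are stored xyzw, stride 16 bytes
        float d = p[0]*vec[0] + p[1]*vec[1] + p[2]*vec[2];  // w is deliberately ignored
        if( d > bestDot ) { bestDot = d; best = (long) i; } // '>' keeps the first occurrence on ties
    }
    *dotResult = bestDot;
    return best;
}
#endif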
00042 long _maxdot_large( const float *vv, const float *vec, unsigned long count, float *dotResult );
00043 long _maxdot_large( const float *vv, const float *vec, unsigned long count, float *dotResult )
00044 {
00045     const float4 *vertices = (const float4*) vv;
00046     static const unsigned char indexTable[16] = {-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };
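    // indexTable maps the 4-bit result of _mm_movemask_ps( _mm_cmpeq_ps( ... ) ) to the
    // lowest lane that compared equal (e.g. mask 0b0100 -> lane 2). Entry 0 is never read,
    // because the table is only consulted once a non-zero mask has been found.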
00047     float4 dotMax = btAssign128( -BT_INFINITY,  -BT_INFINITY,  -BT_INFINITY,  -BT_INFINITY );
00048     float4 vvec = _mm_loadu_ps( vec );
00049     float4 vHi = btCastiTo128f(_mm_shuffle_epi32( btCastfTo128i( vvec), 0xaa ));          
00050     float4 vLo = _mm_movelh_ps( vvec, vvec );                               
00051     
00052     long maxIndex = -1L;
00053     
00054     size_t segment = 0;
00055     float4 stack_array[ STACK_ARRAY_COUNT ];
00056     
00057 #if DEBUG
00058     memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
00059 #endif
00060     
00061     size_t index;
00062     float4 max;
00063     // Faster loop without cleanup code for full tiles
00064     for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 ) 
00065     {
00066         max = dotMax;
00067         
00068         for( index = 0; index < STACK_ARRAY_COUNT; index+= 4 )   
00069         { // do four dot products at a time. Carefully avoid touching the w element.
00070             float4 v0 = vertices[0];
00071             float4 v1 = vertices[1];
00072             float4 v2 = vertices[2];
00073             float4 v3 = vertices[3];            vertices += 4;
00074             
00075             float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
00076             float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
00077             float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
00078             float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
00079             
00080             lo0 = lo0*vLo;
00081             lo1 = lo1*vLo;
00082             float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
00083             float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
00084             float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
00085             z = z*vHi;
00086             x = x+y;
00087             x = x+z;
00088             stack_array[index] = x;
00089             max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
00090             
00091             v0 = vertices[0];
00092             v1 = vertices[1];
00093             v2 = vertices[2];
00094             v3 = vertices[3];            vertices += 4;
00095             
00096             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
00097             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
00098             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
00099             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
00100             
00101             lo0 = lo0*vLo;
00102             lo1 = lo1*vLo;
00103             z = _mm_shuffle_ps(hi0, hi1, 0x88);
00104             x = _mm_shuffle_ps(lo0, lo1, 0x88);
00105             y = _mm_shuffle_ps(lo0, lo1, 0xdd);
00106             z = z*vHi;
00107             x = x+y;
00108             x = x+z;
00109             stack_array[index+1] = x;
00110             max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
00111             
00112             v0 = vertices[0];
00113             v1 = vertices[1];
00114             v2 = vertices[2];
00115             v3 = vertices[3];            vertices += 4;
00116             
00117             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
00118             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
00119             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
00120             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
00121             
00122             lo0 = lo0*vLo;
00123             lo1 = lo1*vLo;
00124             z = _mm_shuffle_ps(hi0, hi1, 0x88);
00125             x = _mm_shuffle_ps(lo0, lo1, 0x88);
00126             y = _mm_shuffle_ps(lo0, lo1, 0xdd);
00127             z = z*vHi;
00128             x = x+y;
00129             x = x+z;
00130             stack_array[index+2] = x;
00131             max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
00132             
00133             v0 = vertices[0];
00134             v1 = vertices[1];
00135             v2 = vertices[2];
00136             v3 = vertices[3];            vertices += 4;
00137             
00138             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
00139             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
00140             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
00141             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
00142             
00143             lo0 = lo0*vLo;
00144             lo1 = lo1*vLo;
00145             z = _mm_shuffle_ps(hi0, hi1, 0x88);
00146             x = _mm_shuffle_ps(lo0, lo1, 0x88);
00147             y = _mm_shuffle_ps(lo0, lo1, 0xdd);
00148             z = z*vHi;
00149             x = x+y;
00150             x = x+z;
00151             stack_array[index+3] = x;
00152             max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
00153             
00154             // It is too costly to keep the index of the max here. We will look for it again later.  We save a lot of work this way.
00155         }
00156         
00157         // If we found a new max
00158         if( 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(max, dotMax)))
00159         { 
00160             // copy the new max across all lanes of our max accumulator
00161             max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0x4e));
00162             max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0xb1));
00163             
00164             dotMax = max;
00165             
00166             // find first occurrence of that max  
00167             size_t test;
00168             for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], max))); index++ )   // local_count must be a multiple of 4
00169             {}
00170             // record where it is.
00171             maxIndex = 4*index + segment + indexTable[test];
00172         }
00173     }
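    // A note on the lane shuffles used above: _mm_movelh_ps/_mm_movehl_ps gather the xy and
    // z? halves of pairs of points, and _mm_shuffle_ps with 0x88 / 0xdd then picks the even /
    // odd lanes, yielding x0x1x2x3, y0y1y2y3 and z0z1z2z3 for four points at once (a partial
    // 4x4 transpose that never touches w). The operand order of _mm_max_ps( x, max ) matters:
    // MAXPS returns its second source operand when the first is a NaN, so a NaN dot product
    // can never overwrite the running max.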
00174     
00175     // account for work we've already done
00176     count -= segment;
00177     
00178     // Deal with the last < STACK_ARRAY_COUNT vectors
00179     max = dotMax;
00180     index = 0;
00181     
00182     
00183     if( btUnlikely( count > 16) )
00184     {
00185         for( ; index + 4 <= count / 4; index+=4 )   
00186         { // do four dot products at a time. Carefully avoid touching the w element.
00187             float4 v0 = vertices[0];
00188             float4 v1 = vertices[1];
00189             float4 v2 = vertices[2];
00190             float4 v3 = vertices[3];            vertices += 4;
00191             
00192             float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
00193             float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
00194             float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
00195             float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
00196             
00197             lo0 = lo0*vLo;
00198             lo1 = lo1*vLo;
00199             float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
00200             float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
00201             float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
00202             z = z*vHi;
00203             x = x+y;
00204             x = x+z;
00205             stack_array[index] = x;
00206             max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
00207             
00208             v0 = vertices[0];
00209             v1 = vertices[1];
00210             v2 = vertices[2];
00211             v3 = vertices[3];            vertices += 4;
00212             
00213             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
00214             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
00215             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
00216             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
00217             
00218             lo0 = lo0*vLo;
00219             lo1 = lo1*vLo;
00220             z = _mm_shuffle_ps(hi0, hi1, 0x88);
00221             x = _mm_shuffle_ps(lo0, lo1, 0x88);
00222             y = _mm_shuffle_ps(lo0, lo1, 0xdd);
00223             z = z*vHi;
00224             x = x+y;
00225             x = x+z;
00226             stack_array[index+1] = x;
00227             max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
00228             
00229             v0 = vertices[0];
00230             v1 = vertices[1];
00231             v2 = vertices[2];
00232             v3 = vertices[3];            vertices += 4;
00233             
00234             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
00235             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
00236             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
00237             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
00238             
00239             lo0 = lo0*vLo;
00240             lo1 = lo1*vLo;
00241             z = _mm_shuffle_ps(hi0, hi1, 0x88);
00242             x = _mm_shuffle_ps(lo0, lo1, 0x88);
00243             y = _mm_shuffle_ps(lo0, lo1, 0xdd);
00244             z = z*vHi;
00245             x = x+y;
00246             x = x+z;
00247             stack_array[index+2] = x;
00248             max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
00249             
00250             v0 = vertices[0];
00251             v1 = vertices[1];
00252             v2 = vertices[2];
00253             v3 = vertices[3];            vertices += 4;
00254             
00255             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
00256             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
00257             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
00258             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
00259             
00260             lo0 = lo0*vLo;
00261             lo1 = lo1*vLo;
00262             z = _mm_shuffle_ps(hi0, hi1, 0x88);
00263             x = _mm_shuffle_ps(lo0, lo1, 0x88);
00264             y = _mm_shuffle_ps(lo0, lo1, 0xdd);
00265             z = z*vHi;
00266             x = x+y;
00267             x = x+z;
00268             stack_array[index+3] = x;
00269             max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
00270             
00271             // It is too costly to keep the index of the max here. We will look for it again later.  We save a lot of work this way.
00272         }
00273     }
00274     
00275     size_t localCount = (count & -4L) - 4*index;
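    // localCount = the remaining points rounded down to a multiple of four (count & -4L),
    // minus the 4*index points already consumed by the loop above; the final 'count & 3'
    // stragglers are handled separately below.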
00276     if( localCount )
00277     {
00278 #ifdef __APPLE__
00279         float4 t0, t1, t2, t3, t4;
00280         float4 * sap = &stack_array[index + localCount / 4];
00281         vertices += localCount;      // counter the offset
00282         size_t byteIndex = -(localCount) * sizeof(float);
00283         //AT&T Code style assembly
00284         asm volatile
00285         (   ".align 4                                                                   \n\
00286              0: movaps  %[max], %[t2]                            // move max out of the way to avoid propagating NaNs in max \n\
00287           movaps  (%[vertices], %[byteIndex], 4),    %[t0]    // vertices[0]      \n\
00288           movaps  16(%[vertices], %[byteIndex], 4),  %[t1]    // vertices[1]      \n\
00289           movaps  %[t0], %[max]                               // vertices[0]      \n\
00290           movlhps %[t1], %[max]                               // x0y0x1y1         \n\
00291          movaps  32(%[vertices], %[byteIndex], 4),  %[t3]    // vertices[2]      \n\
00292          movaps  48(%[vertices], %[byteIndex], 4),  %[t4]    // vertices[3]      \n\
00293           mulps   %[vLo], %[max]                              // x0y0x1y1 * vLo   \n\
00294          movhlps %[t0], %[t1]                                // z0w0z1w1         \n\
00295          movaps  %[t3], %[t0]                                // vertices[2]      \n\
00296          movlhps %[t4], %[t0]                                // x2y2x3y3         \n\
00297          mulps   %[vLo], %[t0]                               // x2y2x3y3 * vLo   \n\
00298           movhlps %[t3], %[t4]                                // z2w2z3w3         \n\
00299           shufps  $0x88, %[t4], %[t1]                         // z0z1z2z3         \n\
00300           mulps   %[vHi], %[t1]                               // z0z1z2z3 * vHi   \n\
00301          movaps  %[max], %[t3]                               // x0y0x1y1 * vLo   \n\
00302          shufps  $0x88, %[t0], %[max]                        // x0x1x2x3 * vLo.x \n\
00303          shufps  $0xdd, %[t0], %[t3]                         // y0y1y2y3 * vLo.y \n\
00304          addps   %[t3], %[max]                               // x + y            \n\
00305          addps   %[t1], %[max]                               // x + y + z        \n\
00306          movaps  %[max], (%[sap], %[byteIndex])              // record result for later scrutiny \n\
00307          maxps   %[t2], %[max]                               // record max, restore max   \n\
00308          add     $16, %[byteIndex]                           // advance loop counter\n\
00309          jnz     0b                                          \n\
00310      "
00311          : [max] "+x" (max), [t0] "=&x" (t0), [t1] "=&x" (t1), [t2] "=&x" (t2), [t3] "=&x" (t3), [t4] "=&x" (t4), [byteIndex] "+r" (byteIndex)
00312          : [vLo] "x" (vLo), [vHi] "x" (vHi), [vertices] "r" (vertices), [sap] "r" (sap)
00313          : "memory", "cc"
00314          );
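        // The asm loop walks byteIndex from -(localCount * sizeof(float)) up to 0 in steps of
        // 16, so the 'add' that advances the counter also sets the flags tested by 'jnz', and
        // the addressing lines up with the 'vertices'/'sap' pointers pre-biased just above.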
00315         index += localCount/4;
00316 #else
00317         {
00318             for( unsigned int i=0; i<localCount/4; i++,index++)   
00319             { // do four dot products at a time. Carefully avoid touching the w element.
00320                 float4 v0 = vertices[0];
00321                 float4 v1 = vertices[1];
00322                 float4 v2 = vertices[2];
00323                 float4 v3 = vertices[3];            
00324                 vertices += 4;
00325                 
00326                 float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
00327                 float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
00328                 float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
00329                 float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
00330                 
00331                 lo0 = lo0*vLo;
00332                 lo1 = lo1*vLo;
00333                 float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
00334                 float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
00335                 float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
00336                 z = z*vHi;
00337                 x = x+y;
00338                 x = x+z;
00339                 stack_array[index] = x;
00340                 max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
00341             }
00342         }
00343 #endif //__APPLE__
00344     }
00345 
00346     // process the last few points
00347     if( count & 3 )
00348     {
00349         float4 v0, v1, v2, x, y, z;
00350         switch( count & 3 )
00351         {
00352             case 3:
00353             {
00354                 v0 = vertices[0];
00355                 v1 = vertices[1];
00356                 v2 = vertices[2];
00357                 
00358                 // Calculate 3 dot products, transpose, duplicate v2
00359                 float4 lo0 = _mm_movelh_ps( v0, v1);        // xyxy.lo
00360                 float4 hi0 = _mm_movehl_ps( v1, v0);        // z?z?.lo
00361                 lo0 = lo0*vLo;
00362                 z = _mm_shuffle_ps(hi0, v2,  0xa8 );           // z0z1z2z2
00363                 z = z*vHi;
00364                 float4 lo1 = _mm_movelh_ps(v2, v2);          // xyxy
00365                 lo1 = lo1*vLo;
00366                 x = _mm_shuffle_ps(lo0, lo1, 0x88);
00367                 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
00368             }
00369                 break;
00370             case 2:
00371             {
00372                 v0 = vertices[0];
00373                 v1 = vertices[1];
00374                 float4 xy = _mm_movelh_ps(v0, v1);
00375                 z = _mm_movehl_ps(v1, v0);
00376                 xy = xy*vLo;
00377                 z = _mm_shuffle_ps( z, z,  0xa8);
00378                 x = _mm_shuffle_ps( xy, xy, 0xa8);
00379                 y = _mm_shuffle_ps( xy, xy, 0xfd);
00380                 z = z*vHi;
00381             }
00382                 break;
00383             case 1:
00384             {
00385                 float4 xy = vertices[0];
00386                 z =  _mm_shuffle_ps( xy, xy, 0xaa);
00387                 xy = xy*vLo;
00388                 z = z*vHi;
00389                 x = _mm_shuffle_ps(xy, xy, 0);
00390                 y = _mm_shuffle_ps(xy, xy, 0x55);
00391             }
00392                 break;
00393         }
00394         x = x+y;
00395         x = x+z;
00396         stack_array[index] = x;
00397         max = _mm_max_ps( x, max );         // control the order here so that max is never NaN even if x is nan
00398         index++;
00399     }
00400     
00401     // if we found a new max. 
00402     if( 0 == segment || 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(max, dotMax)))
00403     { // we found a new max. Search for it
00404       // find max across the max vector, place in all elements of max -- big latency hit here
00405         max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0x4e));
00406         max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0xb1));
00407         
00408         // It is slightly faster to do this part in scalar code when count < 8. However, the common case for
00409         // this where it actually makes a difference is handled in the early out at the top of the function, 
00410         // so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced 
00411         // complexity, and removed it.
00412         
00413         dotMax = max;
00414         
00415         // scan for the first occurrence of max in the array  
00416         size_t test;
00417         for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], max))); index++ )   // local_count must be a multiple of 4
00418         {}
00419         maxIndex = 4*index + segment + indexTable[test];
00420     }
00421     
00422     _mm_store_ss( dotResult, dotMax);
00423     return maxIndex;
00424 }
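// Typical call site (a sketch; the btVector3::maxDot() wrapper in btVector3.h is expected to
// handle very small arrays itself and forward the large-array case here):
//
//     float dot;
//     long  i = _maxdot_large( (const float*) vertexArray, (const float*) &direction, numVerts, &dot );
//     // i indexes the supporting vertex along 'direction'; 'dot' receives its projection.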
00425 
00426 long _mindot_large( const float *vv, const float *vec, unsigned long count, float *dotResult );
00427 
00428 long _mindot_large( const float *vv, const float *vec, unsigned long count, float *dotResult )
00429 {
00430     const float4 *vertices = (const float4*) vv;
00431     static const unsigned char indexTable[16] = {-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };
00432     float4 dotmin = btAssign128( BT_INFINITY,  BT_INFINITY,  BT_INFINITY,  BT_INFINITY );
00433     float4 vvec = _mm_loadu_ps( vec );
00434     float4 vHi = btCastiTo128f(_mm_shuffle_epi32( btCastfTo128i( vvec), 0xaa ));          
00435     float4 vLo = _mm_movelh_ps( vvec, vvec );                               
00436     
00437     long minIndex = -1L;
00438 
00439     size_t segment = 0;
00440     float4 stack_array[ STACK_ARRAY_COUNT ];
00441     
00442 #if DEBUG
00443     memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
00444 #endif
00445     
00446     size_t index;
00447     float4 min;
00448     // Faster loop without cleanup code for full tiles
00449     for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 ) 
00450     {
00451         min = dotmin;
00452         
00453         for( index = 0; index < STACK_ARRAY_COUNT; index+= 4 )   
00454         { // do four dot products at a time. Carefully avoid touching the w element.
00455             float4 v0 = vertices[0];
00456             float4 v1 = vertices[1];
00457             float4 v2 = vertices[2];
00458             float4 v3 = vertices[3];            vertices += 4;
00459             
00460             float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
00461             float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
00462             float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
00463             float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
00464             
00465             lo0 = lo0*vLo;
00466             lo1 = lo1*vLo;
00467             float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
00468             float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
00469             float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
00470             z = z*vHi;
00471             x = x+y;
00472             x = x+z;
00473             stack_array[index] = x;
00474             min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
00475             
00476             v0 = vertices[0];
00477             v1 = vertices[1];
00478             v2 = vertices[2];
00479             v3 = vertices[3];            vertices += 4;
00480             
00481             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
00482             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
00483             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
00484             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
00485             
00486             lo0 = lo0*vLo;
00487             lo1 = lo1*vLo;
00488             z = _mm_shuffle_ps(hi0, hi1, 0x88);
00489             x = _mm_shuffle_ps(lo0, lo1, 0x88);
00490             y = _mm_shuffle_ps(lo0, lo1, 0xdd);
00491             z = z*vHi;
00492             x = x+y;
00493             x = x+z;
00494             stack_array[index+1] = x;
00495             min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
00496             
00497             v0 = vertices[0];
00498             v1 = vertices[1];
00499             v2 = vertices[2];
00500             v3 = vertices[3];            vertices += 4;
00501             
00502             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
00503             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
00504             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
00505             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
00506             
00507             lo0 = lo0*vLo;
00508             lo1 = lo1*vLo;
00509             z = _mm_shuffle_ps(hi0, hi1, 0x88);
00510             x = _mm_shuffle_ps(lo0, lo1, 0x88);
00511             y = _mm_shuffle_ps(lo0, lo1, 0xdd);
00512             z = z*vHi;
00513             x = x+y;
00514             x = x+z;
00515             stack_array[index+2] = x;
00516             min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
00517             
00518             v0 = vertices[0];
00519             v1 = vertices[1];
00520             v2 = vertices[2];
00521             v3 = vertices[3];            vertices += 4;
00522             
00523             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
00524             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
00525             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
00526             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
00527             
00528             lo0 = lo0*vLo;
00529             lo1 = lo1*vLo;
00530             z = _mm_shuffle_ps(hi0, hi1, 0x88);
00531             x = _mm_shuffle_ps(lo0, lo1, 0x88);
00532             y = _mm_shuffle_ps(lo0, lo1, 0xdd);
00533             z = z*vHi;
00534             x = x+y;
00535             x = x+z;
00536             stack_array[index+3] = x;
00537             min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
00538             
00539             // It is too costly to keep the index of the min here. We will look for it again later.  We save a lot of work this way.
00540         }
00541         
00542         // If we found a new min
00543         if( 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(min, dotmin)))
00544         { 
00545             // copy the new min across all lanes of our min accumulator
00546             min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0x4e));
00547             min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0xb1));
00548             
00549             dotmin = min;
00550             
00551             // find first occurrence of that min  
00552             size_t test;
00553             for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], min))); index++ )   // local_count must be a multiple of 4
00554             {}
00555             // record where it is.
00556             minIndex = 4*index + segment + indexTable[test];
00557         }
00558     }
00559     
00560     // account for work we've already done
00561     count -= segment;
00562     
00563     // Deal with the last < STACK_ARRAY_COUNT vectors
00564     min = dotmin;
00565     index = 0;
00566     
00567     
00568     if(btUnlikely( count > 16) )
00569     {
00570         for( ; index + 4 <= count / 4; index+=4 )   
00571         { // do four dot products at a time. Carefully avoid touching the w element.
00572             float4 v0 = vertices[0];
00573             float4 v1 = vertices[1];
00574             float4 v2 = vertices[2];
00575             float4 v3 = vertices[3];            vertices += 4;
00576             
00577             float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
00578             float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
00579             float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
00580             float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
00581             
00582             lo0 = lo0*vLo;
00583             lo1 = lo1*vLo;
00584             float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
00585             float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
00586             float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
00587             z = z*vHi;
00588             x = x+y;
00589             x = x+z;
00590             stack_array[index] = x;
00591             min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
00592             
00593             v0 = vertices[0];
00594             v1 = vertices[1];
00595             v2 = vertices[2];
00596             v3 = vertices[3];            vertices += 4;
00597             
00598             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
00599             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
00600             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
00601             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
00602             
00603             lo0 = lo0*vLo;
00604             lo1 = lo1*vLo;
00605             z = _mm_shuffle_ps(hi0, hi1, 0x88);
00606             x = _mm_shuffle_ps(lo0, lo1, 0x88);
00607             y = _mm_shuffle_ps(lo0, lo1, 0xdd);
00608             z = z*vHi;
00609             x = x+y;
00610             x = x+z;
00611             stack_array[index+1] = x;
00612             min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
00613             
00614             v0 = vertices[0];
00615             v1 = vertices[1];
00616             v2 = vertices[2];
00617             v3 = vertices[3];            vertices += 4;
00618             
00619             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
00620             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
00621             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
00622             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
00623             
00624             lo0 = lo0*vLo;
00625             lo1 = lo1*vLo;
00626             z = _mm_shuffle_ps(hi0, hi1, 0x88);
00627             x = _mm_shuffle_ps(lo0, lo1, 0x88);
00628             y = _mm_shuffle_ps(lo0, lo1, 0xdd);
00629             z = z*vHi;
00630             x = x+y;
00631             x = x+z;
00632             stack_array[index+2] = x;
00633             min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
00634             
00635             v0 = vertices[0];
00636             v1 = vertices[1];
00637             v2 = vertices[2];
00638             v3 = vertices[3];            vertices += 4;
00639             
00640             lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
00641             hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
00642             lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
00643             hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
00644             
00645             lo0 = lo0*vLo;
00646             lo1 = lo1*vLo;
00647             z = _mm_shuffle_ps(hi0, hi1, 0x88);
00648             x = _mm_shuffle_ps(lo0, lo1, 0x88);
00649             y = _mm_shuffle_ps(lo0, lo1, 0xdd);
00650             z = z*vHi;
00651             x = x+y;
00652             x = x+z;
00653             stack_array[index+3] = x;
00654             min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
00655             
00656             // It is too costly to keep the index of the min here. We will look for it again later.  We save a lot of work this way.
00657         }
00658     }
00659     
00660     size_t localCount = (count & -4L) - 4*index;
00661     if( localCount )
00662     {
00663         
00664         
00665 #ifdef __APPLE__
00666         vertices += localCount;      // counter the offset
00667         float4 t0, t1, t2, t3, t4;
00668         size_t byteIndex = -(localCount) * sizeof(float);
00669         float4 * sap = &stack_array[index + localCount / 4];
00670         
00671         asm volatile
00672         (   ".align 4                                                                   \n\
00673              0: movaps  %[min], %[t2]                            // move min out of the way to avoid propagating NaNs in min \n\
00674              movaps  (%[vertices], %[byteIndex], 4),    %[t0]    // vertices[0]      \n\
00675              movaps  16(%[vertices], %[byteIndex], 4),  %[t1]    // vertices[1]      \n\
00676              movaps  %[t0], %[min]                               // vertices[0]      \n\
00677              movlhps %[t1], %[min]                               // x0y0x1y1         \n\
00678              movaps  32(%[vertices], %[byteIndex], 4),  %[t3]    // vertices[2]      \n\
00679              movaps  48(%[vertices], %[byteIndex], 4),  %[t4]    // vertices[3]      \n\
00680              mulps   %[vLo], %[min]                              // x0y0x1y1 * vLo   \n\
00681              movhlps %[t0], %[t1]                                // z0w0z1w1         \n\
00682              movaps  %[t3], %[t0]                                // vertices[2]      \n\
00683              movlhps %[t4], %[t0]                                // x2y2x3y3         \n\
00684              movhlps %[t3], %[t4]                                // z2w2z3w3         \n\
00685              mulps   %[vLo], %[t0]                               // x2y2x3y3 * vLo   \n\
00686              shufps  $0x88, %[t4], %[t1]                         // z0z1z2z3         \n\
00687              mulps   %[vHi], %[t1]                               // z0z1z2z3 * vHi   \n\
00688              movaps  %[min], %[t3]                               // x0y0x1y1 * vLo   \n\
00689              shufps  $0x88, %[t0], %[min]                        // x0x1x2x3 * vLo.x \n\
00690              shufps  $0xdd, %[t0], %[t3]                         // y0y1y2y3 * vLo.y \n\
00691              addps   %[t3], %[min]                               // x + y            \n\
00692              addps   %[t1], %[min]                               // x + y + z        \n\
00693              movaps  %[min], (%[sap], %[byteIndex])              // record result for later scrutiny \n\
00694              minps   %[t2], %[min]                               // record min, restore min   \n\
00695              add     $16, %[byteIndex]                           // advance loop counter\n\
00696              jnz     0b                                          \n\
00697              "
00698          : [min] "+x" (min), [t0] "=&x" (t0), [t1] "=&x" (t1), [t2] "=&x" (t2), [t3] "=&x" (t3), [t4] "=&x" (t4), [byteIndex] "+r" (byteIndex)
00699          : [vLo] "x" (vLo), [vHi] "x" (vHi), [vertices] "r" (vertices), [sap] "r" (sap)
00700          : "memory", "cc"
00701          );
00702         index += localCount/4;
00703 #else
00704         {
00705             for( unsigned int i=0; i<localCount/4; i++,index++)   
00706             { // do four dot products at a time. Carefully avoid touching the w element.
00707                 float4 v0 = vertices[0];
00708                 float4 v1 = vertices[1];
00709                 float4 v2 = vertices[2];
00710                 float4 v3 = vertices[3];            
00711                 vertices += 4;
00712                 
00713                 float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
00714                 float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
00715                 float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
00716                 float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3
00717                 
00718                 lo0 = lo0*vLo;
00719                 lo1 = lo1*vLo;
00720                 float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
00721                 float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
00722                 float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
00723                 z = z*vHi;
00724                 x = x+y;
00725                 x = x+z;
00726                 stack_array[index] = x;
00727                 min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
00728             }
00729         }
00730 
00731 #endif
00732     }
00733     
00734     // process the last few points
00735     if( count & 3 )
00736     {
00737         float4 v0, v1, v2, x, y, z;
00738         switch( count & 3 )
00739         {
00740             case 3:
00741             {
00742                 v0 = vertices[0];
00743                 v1 = vertices[1];
00744                 v2 = vertices[2];
00745                 
00746                 // Calculate 3 dot products, transpose, duplicate v2
00747                 float4 lo0 = _mm_movelh_ps( v0, v1);        // xyxy.lo
00748                 float4 hi0 = _mm_movehl_ps( v1, v0);        // z?z?.lo
00749                 lo0 = lo0*vLo;
00750                 z = _mm_shuffle_ps(hi0, v2,  0xa8 );           // z0z1z2z2
00751                 z = z*vHi;
00752                 float4 lo1 = _mm_movelh_ps(v2, v2);          // xyxy
00753                 lo1 = lo1*vLo;
00754                 x = _mm_shuffle_ps(lo0, lo1, 0x88);
00755                 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
00756             }
00757                 break;
00758             case 2:
00759             {
00760                 v0 = vertices[0];
00761                 v1 = vertices[1];
00762                 float4 xy = _mm_movelh_ps(v0, v1);
00763                 z = _mm_movehl_ps(v1, v0);
00764                 xy = xy*vLo;
00765                 z = _mm_shuffle_ps( z, z,  0xa8);
00766                 x = _mm_shuffle_ps( xy, xy, 0xa8);
00767                 y = _mm_shuffle_ps( xy, xy, 0xfd);
00768                 z = z*vHi;
00769             }
00770                 break;
00771             case 1:
00772             {
00773                 float4 xy = vertices[0];
00774                 z =  _mm_shuffle_ps( xy, xy, 0xaa);
00775                 xy = xy*vLo;
00776                 z = z*vHi;
00777                 x = _mm_shuffle_ps(xy, xy, 0);
00778                 y = _mm_shuffle_ps(xy, xy, 0x55);
00779             }
00780                 break;
00781         }
00782         x = x+y;
00783         x = x+z;
00784         stack_array[index] = x;
00785         min = _mm_min_ps( x, min );         // control the order here so that min is never NaN even if x is nan
00786         index++;
00787     }
00788     
00789     // if we found a new min. 
00790     if( 0 == segment || 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(min, dotmin)))
00791     { // we found a new min. Search for it
00792       // find min across the min vector, place in all elements of min -- big latency hit here
00793         min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0x4e));
00794         min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0xb1));
00795         
00796         // It is slightly faster to do this part in scalar code when count < 8. However, the common case for
00797         // this where it actually makes a difference is handled in the early out at the top of the function, 
00798         // so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced 
00799         // complexity, and removed it.
00800         
00801         dotmin = min;
00802         
00803         // scan for the first occurrence of min in the array  
00804         size_t test;
00805         for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], min))); index++ )   // local_count must be a multiple of 4
00806         {}
00807         minIndex = 4*index + segment + indexTable[test];
00808     }
00809     
00810     _mm_store_ss( dotResult, dotmin);
00811     return minIndex;
00812 }
00813 
00814 
00815 #elif defined BT_USE_NEON
00816 #define ARM_NEON_GCC_COMPATIBILITY  1
00817 #include <arm_neon.h>
00818 
00819 
00820 static long _maxdot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult );
00821 static long _maxdot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult );
00822 static long _maxdot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult );
00823 static long _mindot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult );
00824 static long _mindot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult );
00825 static long _mindot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult );
00826 
00827 long (*_maxdot_large)( const float *vv, const float *vec, unsigned long count, float *dotResult ) = _maxdot_large_sel;
00828 long (*_mindot_large)( const float *vv, const float *vec, unsigned long count, float *dotResult ) = _mindot_large_sel;
00829 
00830 extern "C" {int  _get_cpu_capabilities( void );}
00831 
00832 static long _maxdot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult )
00833 {
00834     if( _get_cpu_capabilities() & 0x2000 )
00835         _maxdot_large = _maxdot_large_v1;
00836     else
00837         _maxdot_large = _maxdot_large_v0;
00838     
00839     return _maxdot_large(vv, vec, count, dotResult);
00840 }
00841 
00842 static long _mindot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult )
00843 {
00844     if( _get_cpu_capabilities() & 0x2000 )
00845         _mindot_large = _mindot_large_v1;
00846     else
00847         _mindot_large = _mindot_large_v0;
00848     
00849     return _mindot_large(vv, vec, count, dotResult);
00850 }
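// Dispatch pattern: the first call lands in a *_sel stub, which queries the CPU once via
// _get_cpu_capabilities() and then overwrites the global function pointer with the v0 or v1
// kernel, so all later calls jump straight to the selected implementation. (The 0x2000
// capability bit is kept as-is from the original source; it is the platform-specific feature
// flag the selector keys off.)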
00851 
00852 
00853 
00854 #define vld1q_f32_aligned_postincrement( _ptr ) ({ float32x4_t _r; asm( "vld1.f32  {%0}, [%1, :128]!\n" : "=w" (_r), "+r" (_ptr) ); /*return*/ _r; })
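// The macro above is a GCC statement expression: a single "vld1.f32 {q}, [ptr, :128]!" both
// loads 16 aligned bytes and post-increments the pointer operand, and the loaded register is
// the value of the whole expression.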
00855 
00856 
00857 long _maxdot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult )
00858 {
00859     unsigned long i = 0;
00860     float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
00861     float32x2_t vLo = vget_low_f32(vvec);
00862     float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
00863     float32x2_t dotMaxLo = (float32x2_t) { -BT_INFINITY, -BT_INFINITY };
00864     float32x2_t dotMaxHi = (float32x2_t) { -BT_INFINITY, -BT_INFINITY };
00865     uint32x2_t indexLo = (uint32x2_t) {0, 1};
00866     uint32x2_t indexHi = (uint32x2_t) {2, 3};
00867     uint32x2_t iLo = (uint32x2_t) {-1, -1};
00868     uint32x2_t iHi = (uint32x2_t) {-1, -1};
00869     const uint32x2_t four = (uint32x2_t) {4,4};
00870 
00871     for( ; i+8 <= count; i+= 8 )
00872     {
00873         float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
00874         float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
00875         float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
00876         float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
00877         
00878         float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
00879         float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
00880         float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
00881         float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
00882         
00883         float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
00884         float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
00885         float32x2_t zLo = vmul_f32( z0.val[0], vHi);
00886         float32x2_t zHi = vmul_f32( z1.val[0], vHi);
00887         
00888         float32x2_t rLo = vpadd_f32( xy0, xy1);
00889         float32x2_t rHi = vpadd_f32( xy2, xy3);
00890         rLo = vadd_f32(rLo, zLo);
00891         rHi = vadd_f32(rHi, zHi);
00892         
00893         uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
00894         uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
00895         dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
00896         dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
00897         iLo = vbsl_u32(maskLo, indexLo, iLo);
00898         iHi = vbsl_u32(maskHi, indexHi, iHi);
00899         indexLo = vadd_u32(indexLo, four); 
00900         indexHi = vadd_u32(indexHi, four);
00901 
00902         v0 = vld1q_f32_aligned_postincrement( vv );
00903         v1 = vld1q_f32_aligned_postincrement( vv );
00904         v2 = vld1q_f32_aligned_postincrement( vv );
00905         v3 = vld1q_f32_aligned_postincrement( vv );
00906         
00907         xy0 = vmul_f32( vget_low_f32(v0), vLo);
00908         xy1 = vmul_f32( vget_low_f32(v1), vLo);
00909         xy2 = vmul_f32( vget_low_f32(v2), vLo);
00910         xy3 = vmul_f32( vget_low_f32(v3), vLo);
00911         
00912         z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
00913         z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
00914         zLo = vmul_f32( z0.val[0], vHi);
00915         zHi = vmul_f32( z1.val[0], vHi);
00916         
00917         rLo = vpadd_f32( xy0, xy1);
00918         rHi = vpadd_f32( xy2, xy3);
00919         rLo = vadd_f32(rLo, zLo);
00920         rHi = vadd_f32(rHi, zHi);
00921         
00922         maskLo = vcgt_f32( rLo, dotMaxLo );
00923         maskHi = vcgt_f32( rHi, dotMaxHi );
00924         dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
00925         dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
00926         iLo = vbsl_u32(maskLo, indexLo, iLo);
00927         iHi = vbsl_u32(maskHi, indexHi, iHi);
00928         indexLo = vadd_u32(indexLo, four);
00929         indexHi = vadd_u32(indexHi, four);
00930     }
00931 
00932     for( ; i+4 <= count; i+= 4 )
00933     {
00934         float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
00935         float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
00936         float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
00937         float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
00938         
00939         float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
00940         float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
00941         float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
00942         float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
00943         
00944         float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
00945         float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
00946         float32x2_t zLo = vmul_f32( z0.val[0], vHi);
00947         float32x2_t zHi = vmul_f32( z1.val[0], vHi);
00948         
00949         float32x2_t rLo = vpadd_f32( xy0, xy1);
00950         float32x2_t rHi = vpadd_f32( xy2, xy3);
00951         rLo = vadd_f32(rLo, zLo);
00952         rHi = vadd_f32(rHi, zHi);
00953         
00954         uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
00955         uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
00956         dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
00957         dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
00958         iLo = vbsl_u32(maskLo, indexLo, iLo);
00959         iHi = vbsl_u32(maskHi, indexHi, iHi);
00960         indexLo = vadd_u32(indexLo, four);
00961         indexHi = vadd_u32(indexHi, four);
00962     }
00963     
00964     switch( count & 3 )
00965     {
00966         case 3:
00967         {
00968             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
00969             float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
00970             float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
00971             
00972             float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
00973             float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
00974             float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
00975             
00976             float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
00977             float32x2_t zLo = vmul_f32( z0.val[0], vHi);
00978             float32x2_t zHi = vmul_f32( vdup_lane_f32(vget_high_f32(v2), 0), vHi);
00979             
00980             float32x2_t rLo = vpadd_f32( xy0, xy1);
00981             float32x2_t rHi = vpadd_f32( xy2, xy2);
00982             rLo = vadd_f32(rLo, zLo);
00983             rHi = vadd_f32(rHi, zHi);
00984             
00985             uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
00986             uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
00987             dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
00988             dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
00989             iLo = vbsl_u32(maskLo, indexLo, iLo);
00990             iHi = vbsl_u32(maskHi, indexHi, iHi);
00991         }
00992             break;
00993         case 2:
00994         {
00995             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
00996             float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
00997             
00998             float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
00999             float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
01000             
01001             float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
01002             float32x2_t zLo = vmul_f32( z0.val[0], vHi);
01003             
01004             float32x2_t rLo = vpadd_f32( xy0, xy1);
01005             rLo = vadd_f32(rLo, zLo);
01006             
01007             uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
01008             dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
01009             iLo = vbsl_u32(maskLo, indexLo, iLo);
01010         }
01011             break;
01012         case 1:
01013         {
01014             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
01015             float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
01016             float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
01017             float32x2_t zLo = vmul_f32( z0, vHi);
01018             float32x2_t rLo = vpadd_f32( xy0, xy0);
01019             rLo = vadd_f32(rLo, zLo);
01020             uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
01021             dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
01022             iLo = vbsl_u32(maskLo, indexLo, iLo);
01023         }
01024             break;
01025         
01026         default:
01027             break;
01028     }
01029     
01030     // select best answer between hi and lo results
01031     uint32x2_t mask = vcgt_f32( dotMaxHi, dotMaxLo );
01032     dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
01033     iLo = vbsl_u32(mask, iHi, iLo);
01034     
01035     // select best answer between even and odd results
01036     dotMaxHi = vdup_lane_f32(dotMaxLo, 1);
01037     iHi = vdup_lane_u32(iLo, 1);
01038     mask = vcgt_f32( dotMaxHi, dotMaxLo );
01039     dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
01040     iLo = vbsl_u32(mask, iHi, iLo);
01041     
01042     *dotResult = vget_lane_f32( dotMaxLo, 0);
01043     return vget_lane_u32(iLo, 0);
01044 }
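// _maxdot_large_v0 keeps two running 2-lane maxima (dotMaxLo/dotMaxHi) and the matching
// candidate indices (iLo/iHi), updated branch-free with vbsl on the compare mask; the
// epilogue then reduces in two steps (hi half vs. lo half, then odd lane vs. even lane),
// leaving the winning dot product and its index in lane 0.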
01045 
01046 
01047 long _maxdot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult )
01048 {
01049     float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
01050     float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
01051     float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
01052     const uint32x4_t four = (uint32x4_t){ 4, 4, 4, 4 };
01053     uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3};
01054     uint32x4_t index = (uint32x4_t) { -1, -1, -1, -1 };
01055     float32x4_t maxDot = (float32x4_t) { -BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY };
01056     
01057     unsigned long i = 0;
01058     for( ; i + 8 <= count; i += 8 )
01059     {
01060         float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
01061         float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
01062         float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
01063         float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
01064         
01065         // the next two lines should resolve to a single vswp d, d
01066         float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
01067         float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
01068         // the next two lines should resolve to a single vswp d, d
01069         float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
01070         float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
01071         
01072         xy0 = vmulq_f32(xy0, vLo);
01073         xy1 = vmulq_f32(xy1, vLo);
01074         
01075         float32x4x2_t zb = vuzpq_f32( z0, z1);
01076         float32x4_t z = vmulq_f32( zb.val[0], vHi);
01077         float32x4x2_t xy = vuzpq_f32( xy0, xy1);
01078         float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
01079         x = vaddq_f32(x, z);
01080         
01081         uint32x4_t mask = vcgtq_f32(x, maxDot);
01082         maxDot = vbslq_f32( mask, x, maxDot);
01083         index = vbslq_u32(mask, local_index, index);
01084         local_index = vaddq_u32(local_index, four);
01085 
01086         v0 = vld1q_f32_aligned_postincrement( vv );
01087         v1 = vld1q_f32_aligned_postincrement( vv );
01088         v2 = vld1q_f32_aligned_postincrement( vv );
01089         v3 = vld1q_f32_aligned_postincrement( vv );
01090         
01091         // the next two lines should resolve to a single vswp d, d
01092         xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
01093         xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
01094         // the next two lines should resolve to a single vswp d, d
01095         z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
01096         z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
01097         
01098         xy0 = vmulq_f32(xy0, vLo);
01099         xy1 = vmulq_f32(xy1, vLo);
01100         
01101         zb = vuzpq_f32( z0, z1);
01102         z = vmulq_f32( zb.val[0], vHi);
01103         xy = vuzpq_f32( xy0, xy1);
01104         x = vaddq_f32(xy.val[0], xy.val[1]);
01105         x = vaddq_f32(x, z);
01106         
01107         mask = vcgtq_f32(x, maxDot);
01108         maxDot = vbslq_f32( mask, x, maxDot);
01109         index = vbslq_u32(mask, local_index, index);
01110         local_index = vaddq_u32(local_index, four);
01111     }
01112 
01113     for( ; i + 4 <= count; i += 4 )
01114     {
01115         float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
01116         float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
01117         float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
01118         float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
01119 
01120         // the next two lines should resolve to a single vswp d, d
01121         float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
01122         float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
01123         // the next two lines should resolve to a single vswp d, d
01124         float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
01125         float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
01126         
01127         xy0 = vmulq_f32(xy0, vLo);
01128         xy1 = vmulq_f32(xy1, vLo);
01129         
01130         float32x4x2_t zb = vuzpq_f32( z0, z1);
01131         float32x4_t z = vmulq_f32( zb.val[0], vHi);
01132         float32x4x2_t xy = vuzpq_f32( xy0, xy1);
01133         float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
01134         x = vaddq_f32(x, z);
01135         
01136         uint32x4_t mask = vcgtq_f32(x, maxDot);
01137         maxDot = vbslq_f32( mask, x, maxDot);
01138         index = vbslq_u32(mask, local_index, index);
01139         local_index = vaddq_u32(local_index, four);
01140     }
01141     
01142     switch (count & 3) {
01143         case 3:
01144         {
01145             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
01146             float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
01147             float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
01148             
01149             // the next two lines should resolve to a single vswp d, d
01150             float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
01151             float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v2));
01152             // the next two lines should resolve to a single vswp d, d
01153             float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
01154             float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v2));
01155             
01156             xy0 = vmulq_f32(xy0, vLo);
01157             xy1 = vmulq_f32(xy1, vLo);
01158             
01159             float32x4x2_t zb = vuzpq_f32( z0, z1);
01160             float32x4_t z = vmulq_f32( zb.val[0], vHi);
01161             float32x4x2_t xy = vuzpq_f32( xy0, xy1);
01162             float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
01163             x = vaddq_f32(x, z);
01164             
01165             uint32x4_t mask = vcgtq_f32(x, maxDot);
01166             maxDot = vbslq_f32( mask, x, maxDot);
01167             index = vbslq_u32(mask, local_index, index);
01168             local_index = vaddq_u32(local_index, four);
01169         }
01170             break;
01171 
01172         case 2:
01173         {
01174             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
01175             float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
01176             
01177             // together with the z0 vcombine below, this should resolve to a single vswp d, d
01178             float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
01179             // (second half of the vswp noted above)
01180             float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
01181             
01182             xy0 = vmulq_f32(xy0, vLo);
01183             
01184             float32x4x2_t zb = vuzpq_f32( z0, z0);
01185             float32x4_t z = vmulq_f32( zb.val[0], vHi);
01186             float32x4x2_t xy = vuzpq_f32( xy0, xy0);
01187             float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
01188             x = vaddq_f32(x, z);
01189             
01190             uint32x4_t mask = vcgtq_f32(x, maxDot);
01191             maxDot = vbslq_f32( mask, x, maxDot);
01192             index = vbslq_u32(mask, local_index, index);
01193             local_index = vaddq_u32(local_index, four);
01194         }
01195             break;
01196 
01197         case 1:
01198         {
01199             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
01200             
01201             // duplicate x0y0 into both halves of the register
01202             float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v0));
01203             // broadcast z0 across all four lanes
01204             float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0); 
01205             
01206             xy0 = vmulq_f32(xy0, vLo);
01207             
01208             z = vmulq_f32( z, vHi);
01209             float32x4x2_t xy = vuzpq_f32( xy0, xy0);
01210             float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
01211             x = vaddq_f32(x, z);
01212             
01213             uint32x4_t mask = vcgtq_f32(x, maxDot);
01214             maxDot = vbslq_f32( mask, x, maxDot);
01215             index = vbslq_u32(mask, local_index, index);
01216             local_index = vaddq_u32(local_index, four);
01217         }
01218             break;
01219 
01220         default:
01221             break;
01222     }
01223     
01224     
01225     // select best answer between hi and lo results
01226     uint32x2_t mask = vcgt_f32( vget_high_f32(maxDot), vget_low_f32(maxDot));
01227     float32x2_t maxDot2 = vbsl_f32(mask, vget_high_f32(maxDot), vget_low_f32(maxDot));
01228     uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
01229     
01230     // select best answer between even and odd results
01231     float32x2_t maxDotO = vdup_lane_f32(maxDot2, 1);
01232     uint32x2_t indexHi = vdup_lane_u32(index2, 1);
01233     mask = vcgt_f32( maxDotO, maxDot2 );
01234     maxDot2 = vbsl_f32(mask, maxDotO, maxDot2);
01235     index2 = vbsl_u32(mask, indexHi, index2);
01236     
01237     *dotResult = vget_lane_f32( maxDot2, 0);
01238     return vget_lane_u32(index2, 0);
01239     
01240 }
01241 
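/*
 _mindot_large_v0: 64-bit (d-register) NEON variant of the minimum-dot-product search.
 It walks `count` 16-byte vertices in `vv`, forms dot(vertex.xyz, vec.xyz) for each, and
 keeps the running minimum and its index in two 2-lane accumulators (vertices 0 and 1 in
 the "lo" pair, vertices 2 and 3 in the "hi" pair). The first loop consumes eight vertices
 per iteration, the second handles remaining groups of four, the switch covers the last
 1-3 vertices, and the tail code reduces the lanes to a single winning value and index.
 */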
01242 long _mindot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult )
01243 {
01244     unsigned long i = 0;
01245     float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
01246     float32x2_t vLo = vget_low_f32(vvec);
01247     float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
01248     float32x2_t dotMinLo = (float32x2_t) { BT_INFINITY, BT_INFINITY };
01249     float32x2_t dotMinHi = (float32x2_t) { BT_INFINITY, BT_INFINITY };
01250     uint32x2_t indexLo = (uint32x2_t) {0, 1};
01251     uint32x2_t indexHi = (uint32x2_t) {2, 3};
01252     uint32x2_t iLo = (uint32x2_t) {-1, -1};
01253     uint32x2_t iHi = (uint32x2_t) {-1, -1};
01254     const uint32x2_t four = (uint32x2_t) {4,4};
01255     
01256     for( ; i+8 <= count; i+= 8 )
01257     {
01258         float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
01259         float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
01260         float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
01261         float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
01262         
01263         float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
01264         float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
01265         float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
01266         float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
01267         
01268         float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
01269         float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
01270         float32x2_t zLo = vmul_f32( z0.val[0], vHi);
01271         float32x2_t zHi = vmul_f32( z1.val[0], vHi);
01272         
01273         float32x2_t rLo = vpadd_f32( xy0, xy1);
01274         float32x2_t rHi = vpadd_f32( xy2, xy3);
01275         rLo = vadd_f32(rLo, zLo);
01276         rHi = vadd_f32(rHi, zHi);
01277         
01278         uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
01279         uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
01280         dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
01281         dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
01282         iLo = vbsl_u32(maskLo, indexLo, iLo);
01283         iHi = vbsl_u32(maskHi, indexHi, iHi);
01284         indexLo = vadd_u32(indexLo, four);
01285         indexHi = vadd_u32(indexHi, four);
01286         
01287         v0 = vld1q_f32_aligned_postincrement( vv );
01288         v1 = vld1q_f32_aligned_postincrement( vv );
01289         v2 = vld1q_f32_aligned_postincrement( vv );
01290         v3 = vld1q_f32_aligned_postincrement( vv );
01291         
01292         xy0 = vmul_f32( vget_low_f32(v0), vLo);
01293         xy1 = vmul_f32( vget_low_f32(v1), vLo);
01294         xy2 = vmul_f32( vget_low_f32(v2), vLo);
01295         xy3 = vmul_f32( vget_low_f32(v3), vLo);
01296         
01297         z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
01298         z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
01299         zLo = vmul_f32( z0.val[0], vHi);
01300         zHi = vmul_f32( z1.val[0], vHi);
01301         
01302         rLo = vpadd_f32( xy0, xy1);
01303         rHi = vpadd_f32( xy2, xy3);
01304         rLo = vadd_f32(rLo, zLo);
01305         rHi = vadd_f32(rHi, zHi);
01306         
01307         maskLo = vclt_f32( rLo, dotMinLo );
01308         maskHi = vclt_f32( rHi, dotMinHi );
01309         dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
01310         dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
01311         iLo = vbsl_u32(maskLo, indexLo, iLo);
01312         iHi = vbsl_u32(maskHi, indexHi, iHi);
01313         indexLo = vadd_u32(indexLo, four);
01314         indexHi = vadd_u32(indexHi, four);
01315     }
01316 
01317     for( ; i+4 <= count; i+= 4 )
01318     {
01319         float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
01320         float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
01321         float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
01322         float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
01323         
01324         float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
01325         float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
01326         float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
01327         float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
01328         
01329         float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
01330         float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
01331         float32x2_t zLo = vmul_f32( z0.val[0], vHi);
01332         float32x2_t zHi = vmul_f32( z1.val[0], vHi);
01333         
01334         float32x2_t rLo = vpadd_f32( xy0, xy1);
01335         float32x2_t rHi = vpadd_f32( xy2, xy3);
01336         rLo = vadd_f32(rLo, zLo);
01337         rHi = vadd_f32(rHi, zHi);
01338         
01339         uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
01340         uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
01341         dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
01342         dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
01343         iLo = vbsl_u32(maskLo, indexLo, iLo);
01344         iHi = vbsl_u32(maskHi, indexHi, iHi);
01345         indexLo = vadd_u32(indexLo, four);
01346         indexHi = vadd_u32(indexHi, four);
01347     }
01348     switch( count & 3 )
01349     {
01350         case 3:
01351         {
01352             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
01353             float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
01354             float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
01355             
01356             float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
01357             float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
01358             float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
01359             
01360             float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
01361             float32x2_t zLo = vmul_f32( z0.val[0], vHi);
01362             float32x2_t zHi = vmul_f32( vdup_lane_f32(vget_high_f32(v2), 0), vHi);
01363             
01364             float32x2_t rLo = vpadd_f32( xy0, xy1);
01365             float32x2_t rHi = vpadd_f32( xy2, xy2);
01366             rLo = vadd_f32(rLo, zLo);
01367             rHi = vadd_f32(rHi, zHi);
01368             
01369             uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
01370             uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
01371             dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
01372             dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
01373             iLo = vbsl_u32(maskLo, indexLo, iLo);
01374             iHi = vbsl_u32(maskHi, indexHi, iHi);
01375         }
01376             break;
01377         case 2:
01378         {
01379             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
01380             float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
01381             
01382             float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
01383             float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
01384             
01385             float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
01386             float32x2_t zLo = vmul_f32( z0.val[0], vHi);
01387             
01388             float32x2_t rLo = vpadd_f32( xy0, xy1);
01389             rLo = vadd_f32(rLo, zLo);
01390             
01391             uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
01392             dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
01393             iLo = vbsl_u32(maskLo, indexLo, iLo);
01394         }
01395             break;
01396         case 1:
01397         {
01398             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
01399             float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
01400             float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
01401             float32x2_t zLo = vmul_f32( z0, vHi);
01402             float32x2_t rLo = vpadd_f32( xy0, xy0);
01403             rLo = vadd_f32(rLo, zLo);
01404             uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
01405             dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
01406             iLo = vbsl_u32(maskLo, indexLo, iLo);
01407         }
01408             break;
01409             
01410         default:
01411             break;
01412     }
01413     
01414     // select best answer between hi and lo results
01415     uint32x2_t mask = vclt_f32( dotMinHi, dotMinLo );
01416     dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
01417     iLo = vbsl_u32(mask, iHi, iLo);
01418     
01419     // select best answer between even and odd results
01420     dotMinHi = vdup_lane_f32(dotMinLo, 1);
01421     iHi = vdup_lane_u32(iLo, 1);
01422     mask = vclt_f32( dotMinHi, dotMinLo );
01423     dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
01424     iLo = vbsl_u32(mask, iHi, iLo);
01425     
01426     *dotResult = vget_lane_f32( dotMinLo, 0);
01427     return vget_lane_u32(iLo, 0);
01428 }
01429 
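/*
 _mindot_large_v1: 128-bit (q-register) variant of the same search. Four running minima
 and their indices live in one float32x4_t / uint32x4_t pair; each step deinterleaves four
 vertices into xy and z vectors with vuzpq, forms four dot products at once, and updates
 the accumulators with a compare-and-select (vcltq / vbslq). In the remainder cases a
 vertex is repeated to fill the registers; the strict comparisons in the final reduction
 prevent the duplicate lane from supplying the returned index.
 */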
01430 long _mindot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult )
01431 {
01432     float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
01433     float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
01434     float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
01435     const uint32x4_t four = (uint32x4_t){ 4, 4, 4, 4 };
01436     uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3};
01437     uint32x4_t index = (uint32x4_t) { -1, -1, -1, -1 };
01438     float32x4_t minDot = (float32x4_t) { BT_INFINITY, BT_INFINITY, BT_INFINITY, BT_INFINITY };
01439     
01440     unsigned long i = 0;
01441     for( ; i + 8 <= count; i += 8 )
01442     {
01443         float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
01444         float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
01445         float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
01446         float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
01447         
01448         // the next two lines should resolve to a single vswp d, d
01449         float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
01450         float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
01451         // the next two lines should resolve to a single vswp d, d
01452         float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
01453         float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
01454         
01455         xy0 = vmulq_f32(xy0, vLo);
01456         xy1 = vmulq_f32(xy1, vLo);
01457         
01458         float32x4x2_t zb = vuzpq_f32( z0, z1);
01459         float32x4_t z = vmulq_f32( zb.val[0], vHi);
01460         float32x4x2_t xy = vuzpq_f32( xy0, xy1);
01461         float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
01462         x = vaddq_f32(x, z);
01463         
01464         uint32x4_t mask = vcltq_f32(x, minDot);
01465         minDot = vbslq_f32( mask, x, minDot);
01466         index = vbslq_u32(mask, local_index, index);
01467         local_index = vaddq_u32(local_index, four);
01468         
01469         v0 = vld1q_f32_aligned_postincrement( vv );
01470         v1 = vld1q_f32_aligned_postincrement( vv );
01471         v2 = vld1q_f32_aligned_postincrement( vv );
01472         v3 = vld1q_f32_aligned_postincrement( vv );
01473         
01474         // the next two lines should resolve to a single vswp d, d
01475         xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
01476         xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
01477         // the next two lines should resolve to a single vswp d, d
01478         z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
01479         z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
01480         
01481         xy0 = vmulq_f32(xy0, vLo);
01482         xy1 = vmulq_f32(xy1, vLo);
01483         
01484         zb = vuzpq_f32( z0, z1);
01485         z = vmulq_f32( zb.val[0], vHi);
01486         xy = vuzpq_f32( xy0, xy1);
01487         x = vaddq_f32(xy.val[0], xy.val[1]);
01488         x = vaddq_f32(x, z);
01489         
01490         mask = vcltq_f32(x, minDot);
01491         minDot = vbslq_f32( mask, x, minDot);
01492         index = vbslq_u32(mask, local_index, index);
01493         local_index = vaddq_u32(local_index, four);
01494     }
01495     
01496     for( ; i + 4 <= count; i += 4 )
01497     {
01498         float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
01499         float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
01500         float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
01501         float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
01502         
01503         // the next two lines should resolve to a single vswp d, d
01504         float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
01505         float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
01506         // the next two lines should resolve to a single vswp d, d
01507         float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
01508         float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
01509         
01510         xy0 = vmulq_f32(xy0, vLo);
01511         xy1 = vmulq_f32(xy1, vLo);
01512         
01513         float32x4x2_t zb = vuzpq_f32( z0, z1);
01514         float32x4_t z = vmulq_f32( zb.val[0], vHi);
01515         float32x4x2_t xy = vuzpq_f32( xy0, xy1);
01516         float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
01517         x = vaddq_f32(x, z);
01518         
01519         uint32x4_t mask = vcltq_f32(x, minDot);
01520         minDot = vbslq_f32( mask, x, minDot);
01521         index = vbslq_u32(mask, local_index, index);
01522         local_index = vaddq_u32(local_index, four);
01523     }
01524     
01525     switch (count & 3) {
01526         case 3:
01527         {
01528             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
01529             float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
01530             float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
01531             
01532             // the next two lines should resolve to a single vswp d, d
01533             float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
01534             float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v2));
01535             // the next two lines should resolve to a single vswp d, d
01536             float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
01537             float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v2));
01538             
01539             xy0 = vmulq_f32(xy0, vLo);
01540             xy1 = vmulq_f32(xy1, vLo);
01541             
01542             float32x4x2_t zb = vuzpq_f32( z0, z1);
01543             float32x4_t z = vmulq_f32( zb.val[0], vHi);
01544             float32x4x2_t xy = vuzpq_f32( xy0, xy1);
01545             float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
01546             x = vaddq_f32(x, z);
01547             
01548             uint32x4_t mask = vcltq_f32(x, minDot);
01549             minDot = vbslq_f32( mask, x, minDot);
01550             index = vbslq_u32(mask, local_index, index);
01551             local_index = vaddq_u32(local_index, four);
01552         }
01553             break;
01554             
01555         case 2:
01556         {
01557             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
01558             float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
01559             
01560             // together with the z0 vcombine below, this should resolve to a single vswp d, d
01561             float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
01562             // (second half of the vswp noted above)
01563             float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
01564             
01565             xy0 = vmulq_f32(xy0, vLo);
01566             
01567             float32x4x2_t zb = vuzpq_f32( z0, z0);
01568             float32x4_t z = vmulq_f32( zb.val[0], vHi);
01569             float32x4x2_t xy = vuzpq_f32( xy0, xy0);
01570             float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
01571             x = vaddq_f32(x, z);
01572             
01573             uint32x4_t mask = vcltq_f32(x, minDot);
01574             minDot = vbslq_f32( mask, x, minDot);
01575             index = vbslq_u32(mask, local_index, index);
01576             local_index = vaddq_u32(local_index, four);
01577         }
01578             break;
01579             
01580         case 1:
01581         {
01582             float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
01583             
01584             // duplicate x0y0 into both halves of the register
01585             float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v0));
01586             // broadcast z0 across all four lanes
01587             float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0); 
01588             
01589             xy0 = vmulq_f32(xy0, vLo);
01590             
01591             z = vmulq_f32( z, vHi);
01592             float32x4x2_t xy = vuzpq_f32( xy0, xy0);
01593             float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
01594             x = vaddq_f32(x, z);
01595             
01596             uint32x4_t mask = vcltq_f32(x, minDot);
01597             minDot = vbslq_f32( mask, x, minDot);
01598             index = vbslq_u32(mask, local_index, index);
01599             local_index = vaddq_u32(local_index, four);
01600         }
01601             break;
01602             
01603         default:
01604             break;
01605     }
01606     
01607     
01608     // select best answer between hi and lo results
01609     uint32x2_t mask = vclt_f32( vget_high_f32(minDot), vget_low_f32(minDot));
01610     float32x2_t minDot2 = vbsl_f32(mask, vget_high_f32(minDot), vget_low_f32(minDot));
01611     uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
01612     
01613     // select best answer between even and odd results
01614     float32x2_t minDotO = vdup_lane_f32(minDot2, 1);
01615     uint32x2_t indexHi = vdup_lane_u32(index2, 1);
01616     mask = vclt_f32( minDotO, minDot2 );
01617     minDot2 = vbsl_f32(mask, minDotO, minDot2);
01618     index2 = vbsl_u32(mask, indexHi, index2);
01619     
01620     *dotResult = vget_lane_f32( minDot2, 0);
01621     return vget_lane_u32(index2, 0);
01622     
01623 }
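/*
 For reference, a minimal scalar sketch of the computation the NEON paths above vectorize.
 This is an illustration only -- the helper name below is hypothetical and is not part of
 the original file. It assumes the same data layout the SIMD code assumes: `vv` holds
 `count` vertices of four floats each, with the fourth (w) component ignored.

     static long mindot_scalar_reference( const float *vv, const float *vec,
                                          unsigned long count, float *dotResult )
     {
         long minIndex = -1L;
         float minDot = BT_INFINITY;
         for( unsigned long i = 0; i < count; i++, vv += 4 )
         {
             // dot product of the vertex's x/y/z against the query vector
             float dot = vv[0]*vec[0] + vv[1]*vec[1] + vv[2]*vec[2];
             if( dot < minDot )
             {
                 minDot = dot;
                 minIndex = (long) i;
             }
         }
         *dotResult = minDot;
         return minIndex;
     }

 The _maxdot_ variants are identical except that they keep the running maximum
 (dot > maxDot) instead of the minimum.
 */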
01624 
01625 #else
01626     #error Unhandled __APPLE__ arch
01627 #endif
01628 
01629 #endif  /* __APPLE__ */
01630 
01631