00018 #if defined (_WIN32) || defined (__i386__)
00019 #define BT_USE_SSE_IN_API
00020 #endif
00021
00022 #include "btVector3.h"
00023
00024 #if defined (BT_USE_SSE) || defined (BT_USE_NEON)
00025
00026 #ifdef __APPLE__
00027 #include <stdint.h>
00028 typedef float float4 __attribute__ ((vector_size(16)));
00029 #else
00030 #define float4 __m128
00031 #endif
00032
00033
00034
00035 #if defined BT_USE_SSE || defined _WIN32
00036
00037 #define LOG2_ARRAY_SIZE 6
00038 #define STACK_ARRAY_COUNT (1UL << LOG2_ARRAY_SIZE)
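// Dot products are evaluated in segments of STACK_ARRAY_COUNT*4 vertices. Each group of four
// results is parked in a small stack buffer so that the index of the best vertex only has to be
// recovered (by re-scanning the buffer) when a segment actually improves on the running extreme.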
00039
00040 #include <emmintrin.h>
#include <string.h> // for the memset() used by the DEBUG-only stack_array initialisation below (may already be pulled in transitively)
00041
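// _maxdot_large: scan an array of 16-byte aligned vertices (x, y, z plus one float of padding each)
// and return the index of the vertex with the largest dot product against *vec; the winning value
// itself is written to *dotResult.
//
// Usage sketch (hypothetical caller, the way btVector3::maxDot is expected to invoke it):
//
//     float bestValue;
//     long  bestIndex = _maxdot_large( (const float *) vertexArray,   // count 16-byte aligned xyz_ entries
//                                      (const float *) &direction,    // the query direction
//                                      vertexCount, &bestValue );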
00042 long _maxdot_large( const float *vv, const float *vec, unsigned long count, float *dotResult );
00043 long _maxdot_large( const float *vv, const float *vec, unsigned long count, float *dotResult )
00044 {
00045 const float4 *vertices = (const float4*) vv;
00046 static const unsigned char indexTable[16] = {-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };
00047 float4 dotMax = btAssign128( -BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY );
00048 float4 vvec = _mm_loadu_ps( vec );
00049 float4 vHi = btCastiTo128f(_mm_shuffle_epi32( btCastfTo128i( vvec), 0xaa ));
00050 float4 vLo = _mm_movelh_ps( vvec, vvec );
00051
00052 long maxIndex = -1L;
00053
00054 size_t segment = 0;
00055 float4 stack_array[ STACK_ARRAY_COUNT ];
00056
00057 #if DEBUG
00058 memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
00059 #endif
00060
00061 size_t index;
00062 float4 max;
00063
00064 for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 )
00065 {
00066 max = dotMax;
00067
00068 for( index = 0; index < STACK_ARRAY_COUNT; index+= 4 )
00069 {
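            // Four vertices per step: movelh/movehl gather the xy and zw pairs, the shuffles below
            // rebuild x0x1x2x3, y0y1y2y3 and z0z1z2z3, and multiplying by the broadcast target
            // (vLo = x,y,x,y and vHi = z,z,z,z) and summing leaves four dot products in x.
            // The body is unrolled 4x, so each pass of this loop consumes 16 vertices, and every
            // result is also written to stack_array so the winning index can be recovered later.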
00070 float4 v0 = vertices[0];
00071 float4 v1 = vertices[1];
00072 float4 v2 = vertices[2];
00073 float4 v3 = vertices[3]; vertices += 4;
00074
00075 float4 lo0 = _mm_movelh_ps( v0, v1);
00076 float4 hi0 = _mm_movehl_ps( v1, v0);
00077 float4 lo1 = _mm_movelh_ps( v2, v3);
00078 float4 hi1 = _mm_movehl_ps( v3, v2);
00079
00080 lo0 = lo0*vLo;
00081 lo1 = lo1*vLo;
00082 float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
00083 float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
00084 float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
00085 z = z*vHi;
00086 x = x+y;
00087 x = x+z;
00088 stack_array[index] = x;
00089 max = _mm_max_ps( x, max );
00090
00091 v0 = vertices[0];
00092 v1 = vertices[1];
00093 v2 = vertices[2];
00094 v3 = vertices[3]; vertices += 4;
00095
00096 lo0 = _mm_movelh_ps( v0, v1);
00097 hi0 = _mm_movehl_ps( v1, v0);
00098 lo1 = _mm_movelh_ps( v2, v3);
00099 hi1 = _mm_movehl_ps( v3, v2);
00100
00101 lo0 = lo0*vLo;
00102 lo1 = lo1*vLo;
00103 z = _mm_shuffle_ps(hi0, hi1, 0x88);
00104 x = _mm_shuffle_ps(lo0, lo1, 0x88);
00105 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
00106 z = z*vHi;
00107 x = x+y;
00108 x = x+z;
00109 stack_array[index+1] = x;
00110 max = _mm_max_ps( x, max );
00111
00112 v0 = vertices[0];
00113 v1 = vertices[1];
00114 v2 = vertices[2];
00115 v3 = vertices[3]; vertices += 4;
00116
00117 lo0 = _mm_movelh_ps( v0, v1);
00118 hi0 = _mm_movehl_ps( v1, v0);
00119 lo1 = _mm_movelh_ps( v2, v3);
00120 hi1 = _mm_movehl_ps( v3, v2);
00121
00122 lo0 = lo0*vLo;
00123 lo1 = lo1*vLo;
00124 z = _mm_shuffle_ps(hi0, hi1, 0x88);
00125 x = _mm_shuffle_ps(lo0, lo1, 0x88);
00126 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
00127 z = z*vHi;
00128 x = x+y;
00129 x = x+z;
00130 stack_array[index+2] = x;
00131 max = _mm_max_ps( x, max );
00132
00133 v0 = vertices[0];
00134 v1 = vertices[1];
00135 v2 = vertices[2];
00136 v3 = vertices[3]; vertices += 4;
00137
00138 lo0 = _mm_movelh_ps( v0, v1);
00139 hi0 = _mm_movehl_ps( v1, v0);
00140 lo1 = _mm_movelh_ps( v2, v3);
00141 hi1 = _mm_movehl_ps( v3, v2);
00142
00143 lo0 = lo0*vLo;
00144 lo1 = lo1*vLo;
00145 z = _mm_shuffle_ps(hi0, hi1, 0x88);
00146 x = _mm_shuffle_ps(lo0, lo1, 0x88);
00147 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
00148 z = z*vHi;
00149 x = x+y;
00150 x = x+z;
00151 stack_array[index+3] = x;
00152 max = _mm_max_ps( x, max );
00153
00154
00155 }
00156
00157
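        // If any lane of max moved past dotMax, this segment produced a new global maximum:
        // broadcast the largest lane across max, then rescan the buffered results for its first
        // occurrence. indexTable maps the 4-bit compare mask to the lane within that group of four.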
00158 if( 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(max, dotMax)))
00159 {
00160
00161 max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0x4e));
00162 max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0xb1));
00163
00164 dotMax = max;
00165
00166
00167 size_t test;
00168 for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], max))); index++ )
00169 {}
00170
00171 maxIndex = 4*index + segment + indexTable[test];
00172 }
00173 }
00174
00175
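    // Tail handling: drop the fully processed segments from count and finish the remaining
    // (fewer than STACK_ARRAY_COUNT*4) vertices below, still buffering results in stack_array.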
00176 count -= segment;
00177
00178
00179 max = dotMax;
00180 index = 0;
00181
00182
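    // For longer tails, keep the 4x-unrolled body: each iteration below consumes 16 vertices.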
00183 if( btUnlikely( count > 16) )
00184 {
00185 for( ; index + 4 <= count / 4; index+=4 )
00186 {
00187 float4 v0 = vertices[0];
00188 float4 v1 = vertices[1];
00189 float4 v2 = vertices[2];
00190 float4 v3 = vertices[3]; vertices += 4;
00191
00192 float4 lo0 = _mm_movelh_ps( v0, v1);
00193 float4 hi0 = _mm_movehl_ps( v1, v0);
00194 float4 lo1 = _mm_movelh_ps( v2, v3);
00195 float4 hi1 = _mm_movehl_ps( v3, v2);
00196
00197 lo0 = lo0*vLo;
00198 lo1 = lo1*vLo;
00199 float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
00200 float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
00201 float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
00202 z = z*vHi;
00203 x = x+y;
00204 x = x+z;
00205 stack_array[index] = x;
00206 max = _mm_max_ps( x, max );
00207
00208 v0 = vertices[0];
00209 v1 = vertices[1];
00210 v2 = vertices[2];
00211 v3 = vertices[3]; vertices += 4;
00212
00213 lo0 = _mm_movelh_ps( v0, v1);
00214 hi0 = _mm_movehl_ps( v1, v0);
00215 lo1 = _mm_movelh_ps( v2, v3);
00216 hi1 = _mm_movehl_ps( v3, v2);
00217
00218 lo0 = lo0*vLo;
00219 lo1 = lo1*vLo;
00220 z = _mm_shuffle_ps(hi0, hi1, 0x88);
00221 x = _mm_shuffle_ps(lo0, lo1, 0x88);
00222 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
00223 z = z*vHi;
00224 x = x+y;
00225 x = x+z;
00226 stack_array[index+1] = x;
00227 max = _mm_max_ps( x, max );
00228
00229 v0 = vertices[0];
00230 v1 = vertices[1];
00231 v2 = vertices[2];
00232 v3 = vertices[3]; vertices += 4;
00233
00234 lo0 = _mm_movelh_ps( v0, v1);
00235 hi0 = _mm_movehl_ps( v1, v0);
00236 lo1 = _mm_movelh_ps( v2, v3);
00237 hi1 = _mm_movehl_ps( v3, v2);
00238
00239 lo0 = lo0*vLo;
00240 lo1 = lo1*vLo;
00241 z = _mm_shuffle_ps(hi0, hi1, 0x88);
00242 x = _mm_shuffle_ps(lo0, lo1, 0x88);
00243 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
00244 z = z*vHi;
00245 x = x+y;
00246 x = x+z;
00247 stack_array[index+2] = x;
00248 max = _mm_max_ps( x, max );
00249
00250 v0 = vertices[0];
00251 v1 = vertices[1];
00252 v2 = vertices[2];
00253 v3 = vertices[3]; vertices += 4;
00254
00255 lo0 = _mm_movelh_ps( v0, v1);
00256 hi0 = _mm_movehl_ps( v1, v0);
00257 lo1 = _mm_movelh_ps( v2, v3);
00258 hi1 = _mm_movehl_ps( v3, v2);
00259
00260 lo0 = lo0*vLo;
00261 lo1 = lo1*vLo;
00262 z = _mm_shuffle_ps(hi0, hi1, 0x88);
00263 x = _mm_shuffle_ps(lo0, lo1, 0x88);
00264 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
00265 z = z*vHi;
00266 x = x+y;
00267 x = x+z;
00268 stack_array[index+3] = x;
00269 max = _mm_max_ps( x, max );
00270
00271
00272 }
00273 }
00274
00275 size_t localCount = (count & -4L) - 4*index;
00276 if( localCount )
00277 {
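        // Finish the remaining whole groups of four vertices (localCount of them). On Apple targets
        // this is a hand-scheduled inline-assembly loop; elsewhere the equivalent intrinsic loop is used.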
00278 #ifdef __APPLE__
00279 float4 t0, t1, t2, t3, t4;
00280 float4 * sap = &stack_array[index + localCount / 4];
00281 vertices += localCount;
00282 size_t byteIndex = -(localCount) * sizeof(float);
00283
00284 asm volatile
00285 ( ".align 4 \n\
00286 0: movaps %[max], %[t2] // move max out of the way to avoid propagating NaNs in max \n\
00287 movaps (%[vertices], %[byteIndex], 4), %[t0] // vertices[0] \n\
00288 movaps 16(%[vertices], %[byteIndex], 4), %[t1] // vertices[1] \n\
00289 movaps %[t0], %[max] // vertices[0] \n\
00290 movlhps %[t1], %[max] // x0y0x1y1 \n\
00291 movaps 32(%[vertices], %[byteIndex], 4), %[t3] // vertices[2] \n\
00292 movaps 48(%[vertices], %[byteIndex], 4), %[t4] // vertices[3] \n\
00293 mulps %[vLo], %[max] // x0y0x1y1 * vLo \n\
00294 movhlps %[t0], %[t1] // z0w0z1w1 \n\
00295 movaps %[t3], %[t0] // vertices[2] \n\
00296 movlhps %[t4], %[t0] // x2y2x3y3 \n\
00297 mulps %[vLo], %[t0] // x2y2x3y3 * vLo \n\
00298 movhlps %[t3], %[t4] // z2w2z3w3 \n\
00299 shufps $0x88, %[t4], %[t1] // z0z1z2z3 \n\
00300 mulps %[vHi], %[t1] // z0z1z2z3 * vHi \n\
00301 movaps %[max], %[t3] // x0y0x1y1 * vLo \n\
00302 shufps $0x88, %[t0], %[max] // x0x1x2x3 * vLo.x \n\
00303 shufps $0xdd, %[t0], %[t3] // y0y1y2y3 * vLo.y \n\
00304 addps %[t3], %[max] // x + y \n\
00305 addps %[t1], %[max] // x + y + z \n\
00306 movaps %[max], (%[sap], %[byteIndex]) // record result for later scrutiny \n\
00307 maxps %[t2], %[max] // record max, restore max \n\
00308 add $16, %[byteIndex] // advance loop counter\n\
00309 jnz 0b \n\
00310 "
00311 : [max] "+x" (max), [t0] "=&x" (t0), [t1] "=&x" (t1), [t2] "=&x" (t2), [t3] "=&x" (t3), [t4] "=&x" (t4), [byteIndex] "+r" (byteIndex)
00312 : [vLo] "x" (vLo), [vHi] "x" (vHi), [vertices] "r" (vertices), [sap] "r" (sap)
00313 : "memory", "cc"
00314 );
00315 index += localCount/4;
00316 #else
00317 {
00318 for( unsigned int i=0; i<localCount/4; i++,index++)
00319 {
00320 float4 v0 = vertices[0];
00321 float4 v1 = vertices[1];
00322 float4 v2 = vertices[2];
00323 float4 v3 = vertices[3];
00324 vertices += 4;
00325
00326 float4 lo0 = _mm_movelh_ps( v0, v1);
00327 float4 hi0 = _mm_movehl_ps( v1, v0);
00328 float4 lo1 = _mm_movelh_ps( v2, v3);
00329 float4 hi1 = _mm_movehl_ps( v3, v2);
00330
00331 lo0 = lo0*vLo;
00332 lo1 = lo1*vLo;
00333 float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
00334 float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
00335 float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
00336 z = z*vHi;
00337 x = x+y;
00338 x = x+z;
00339 stack_array[index] = x;
00340 max = _mm_max_ps( x, max );
00341 }
00342 }
00343 #endif //__APPLE__
00344 }
00345
00346
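    // Last one to three vertices: compute their dot products with the final vertex replicated into
    // the unused lanes, so the 4-wide max/compare machinery below needs no special casing.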
00347 if( count & 3 )
00348 {
00349 float4 v0, v1, v2, x, y, z;
00350 switch( count & 3 )
00351 {
00352 case 3:
00353 {
00354 v0 = vertices[0];
00355 v1 = vertices[1];
00356 v2 = vertices[2];
00357
00358
00359 float4 lo0 = _mm_movelh_ps( v0, v1);
00360 float4 hi0 = _mm_movehl_ps( v1, v0);
00361 lo0 = lo0*vLo;
00362 z = _mm_shuffle_ps(hi0, v2, 0xa8 );
00363 z = z*vHi;
00364 float4 lo1 = _mm_movelh_ps(v2, v2);
00365 lo1 = lo1*vLo;
00366 x = _mm_shuffle_ps(lo0, lo1, 0x88);
00367 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
00368 }
00369 break;
00370 case 2:
00371 {
00372 v0 = vertices[0];
00373 v1 = vertices[1];
00374 float4 xy = _mm_movelh_ps(v0, v1);
00375 z = _mm_movehl_ps(v1, v0);
00376 xy = xy*vLo;
00377 z = _mm_shuffle_ps( z, z, 0xa8);
00378 x = _mm_shuffle_ps( xy, xy, 0xa8);
00379 y = _mm_shuffle_ps( xy, xy, 0xfd);
00380 z = z*vHi;
00381 }
00382 break;
00383 case 1:
00384 {
00385 float4 xy = vertices[0];
00386 z = _mm_shuffle_ps( xy, xy, 0xaa);
00387 xy = xy*vLo;
00388 z = z*vHi;
00389 x = _mm_shuffle_ps(xy, xy, 0);
00390 y = _mm_shuffle_ps(xy, xy, 0x55);
00391 }
00392 break;
00393 }
00394 x = x+y;
00395 x = x+z;
00396 stack_array[index] = x;
00397 max = _mm_max_ps( x, max );
00398 index++;
00399 }
00400
00401
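    // Final reduction: if no full segment was processed, or the tail beat dotMax, locate the new
    // maximum in stack_array exactly as in the segment loop above.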
00402 if( 0 == segment || 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(max, dotMax)))
00403 {
00404
00405 max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0x4e));
00406 max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0xb1));
00407
00408
00409
00410
00411
00412
00413 dotMax = max;
00414
00415
00416 size_t test;
00417 for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], max))); index++ )
00418 {}
00419 maxIndex = 4*index + segment + indexTable[test];
00420 }
00421
00422 _mm_store_ss( dotResult, dotMax);
00423 return maxIndex;
00424 }
00425
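// _mindot_large mirrors _maxdot_large: identical data layout and control flow, with _mm_min_ps in
// place of _mm_max_ps and +BT_INFINITY as the starting value.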
00426 long _mindot_large( const float *vv, const float *vec, unsigned long count, float *dotResult );
00427
00428 long _mindot_large( const float *vv, const float *vec, unsigned long count, float *dotResult )
00429 {
00430 const float4 *vertices = (const float4*) vv;
00431 static const unsigned char indexTable[16] = {-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };
00432 float4 dotmin = btAssign128( BT_INFINITY, BT_INFINITY, BT_INFINITY, BT_INFINITY );
00433 float4 vvec = _mm_loadu_ps( vec );
00434 float4 vHi = btCastiTo128f(_mm_shuffle_epi32( btCastfTo128i( vvec), 0xaa ));
00435 float4 vLo = _mm_movelh_ps( vvec, vvec );
00436
00437 long minIndex = -1L;
00438
00439 size_t segment = 0;
00440 float4 stack_array[ STACK_ARRAY_COUNT ];
00441
00442 #if DEBUG
00443 memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
00444 #endif
00445
00446 size_t index;
00447 float4 min;
00448
00449 for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 )
00450 {
00451 min = dotmin;
00452
00453 for( index = 0; index < STACK_ARRAY_COUNT; index+= 4 )
00454 {
00455 float4 v0 = vertices[0];
00456 float4 v1 = vertices[1];
00457 float4 v2 = vertices[2];
00458 float4 v3 = vertices[3]; vertices += 4;
00459
00460 float4 lo0 = _mm_movelh_ps( v0, v1);
00461 float4 hi0 = _mm_movehl_ps( v1, v0);
00462 float4 lo1 = _mm_movelh_ps( v2, v3);
00463 float4 hi1 = _mm_movehl_ps( v3, v2);
00464
00465 lo0 = lo0*vLo;
00466 lo1 = lo1*vLo;
00467 float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
00468 float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
00469 float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
00470 z = z*vHi;
00471 x = x+y;
00472 x = x+z;
00473 stack_array[index] = x;
00474 min = _mm_min_ps( x, min );
00475
00476 v0 = vertices[0];
00477 v1 = vertices[1];
00478 v2 = vertices[2];
00479 v3 = vertices[3]; vertices += 4;
00480
00481 lo0 = _mm_movelh_ps( v0, v1);
00482 hi0 = _mm_movehl_ps( v1, v0);
00483 lo1 = _mm_movelh_ps( v2, v3);
00484 hi1 = _mm_movehl_ps( v3, v2);
00485
00486 lo0 = lo0*vLo;
00487 lo1 = lo1*vLo;
00488 z = _mm_shuffle_ps(hi0, hi1, 0x88);
00489 x = _mm_shuffle_ps(lo0, lo1, 0x88);
00490 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
00491 z = z*vHi;
00492 x = x+y;
00493 x = x+z;
00494 stack_array[index+1] = x;
00495 min = _mm_min_ps( x, min );
00496
00497 v0 = vertices[0];
00498 v1 = vertices[1];
00499 v2 = vertices[2];
00500 v3 = vertices[3]; vertices += 4;
00501
00502 lo0 = _mm_movelh_ps( v0, v1);
00503 hi0 = _mm_movehl_ps( v1, v0);
00504 lo1 = _mm_movelh_ps( v2, v3);
00505 hi1 = _mm_movehl_ps( v3, v2);
00506
00507 lo0 = lo0*vLo;
00508 lo1 = lo1*vLo;
00509 z = _mm_shuffle_ps(hi0, hi1, 0x88);
00510 x = _mm_shuffle_ps(lo0, lo1, 0x88);
00511 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
00512 z = z*vHi;
00513 x = x+y;
00514 x = x+z;
00515 stack_array[index+2] = x;
00516 min = _mm_min_ps( x, min );
00517
00518 v0 = vertices[0];
00519 v1 = vertices[1];
00520 v2 = vertices[2];
00521 v3 = vertices[3]; vertices += 4;
00522
00523 lo0 = _mm_movelh_ps( v0, v1);
00524 hi0 = _mm_movehl_ps( v1, v0);
00525 lo1 = _mm_movelh_ps( v2, v3);
00526 hi1 = _mm_movehl_ps( v3, v2);
00527
00528 lo0 = lo0*vLo;
00529 lo1 = lo1*vLo;
00530 z = _mm_shuffle_ps(hi0, hi1, 0x88);
00531 x = _mm_shuffle_ps(lo0, lo1, 0x88);
00532 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
00533 z = z*vHi;
00534 x = x+y;
00535 x = x+z;
00536 stack_array[index+3] = x;
00537 min = _mm_min_ps( x, min );
00538
00539
00540 }
00541
00542
00543 if( 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(min, dotmin)))
00544 {
00545
00546 min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0x4e));
00547 min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0xb1));
00548
00549 dotmin = min;
00550
00551
00552 size_t test;
00553 for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], min))); index++ )
00554 {}
00555
00556 minIndex = 4*index + segment + indexTable[test];
00557 }
00558 }
00559
00560
00561 count -= segment;
00562
00563
00564 min = dotmin;
00565 index = 0;
00566
00567
00568 if(btUnlikely( count > 16) )
00569 {
00570 for( ; index + 4 <= count / 4; index+=4 )
00571 {
00572 float4 v0 = vertices[0];
00573 float4 v1 = vertices[1];
00574 float4 v2 = vertices[2];
00575 float4 v3 = vertices[3]; vertices += 4;
00576
00577 float4 lo0 = _mm_movelh_ps( v0, v1);
00578 float4 hi0 = _mm_movehl_ps( v1, v0);
00579 float4 lo1 = _mm_movelh_ps( v2, v3);
00580 float4 hi1 = _mm_movehl_ps( v3, v2);
00581
00582 lo0 = lo0*vLo;
00583 lo1 = lo1*vLo;
00584 float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
00585 float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
00586 float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
00587 z = z*vHi;
00588 x = x+y;
00589 x = x+z;
00590 stack_array[index] = x;
00591 min = _mm_min_ps( x, min );
00592
00593 v0 = vertices[0];
00594 v1 = vertices[1];
00595 v2 = vertices[2];
00596 v3 = vertices[3]; vertices += 4;
00597
00598 lo0 = _mm_movelh_ps( v0, v1);
00599 hi0 = _mm_movehl_ps( v1, v0);
00600 lo1 = _mm_movelh_ps( v2, v3);
00601 hi1 = _mm_movehl_ps( v3, v2);
00602
00603 lo0 = lo0*vLo;
00604 lo1 = lo1*vLo;
00605 z = _mm_shuffle_ps(hi0, hi1, 0x88);
00606 x = _mm_shuffle_ps(lo0, lo1, 0x88);
00607 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
00608 z = z*vHi;
00609 x = x+y;
00610 x = x+z;
00611 stack_array[index+1] = x;
00612 min = _mm_min_ps( x, min );
00613
00614 v0 = vertices[0];
00615 v1 = vertices[1];
00616 v2 = vertices[2];
00617 v3 = vertices[3]; vertices += 4;
00618
00619 lo0 = _mm_movelh_ps( v0, v1);
00620 hi0 = _mm_movehl_ps( v1, v0);
00621 lo1 = _mm_movelh_ps( v2, v3);
00622 hi1 = _mm_movehl_ps( v3, v2);
00623
00624 lo0 = lo0*vLo;
00625 lo1 = lo1*vLo;
00626 z = _mm_shuffle_ps(hi0, hi1, 0x88);
00627 x = _mm_shuffle_ps(lo0, lo1, 0x88);
00628 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
00629 z = z*vHi;
00630 x = x+y;
00631 x = x+z;
00632 stack_array[index+2] = x;
00633 min = _mm_min_ps( x, min );
00634
00635 v0 = vertices[0];
00636 v1 = vertices[1];
00637 v2 = vertices[2];
00638 v3 = vertices[3]; vertices += 4;
00639
00640 lo0 = _mm_movelh_ps( v0, v1);
00641 hi0 = _mm_movehl_ps( v1, v0);
00642 lo1 = _mm_movelh_ps( v2, v3);
00643 hi1 = _mm_movehl_ps( v3, v2);
00644
00645 lo0 = lo0*vLo;
00646 lo1 = lo1*vLo;
00647 z = _mm_shuffle_ps(hi0, hi1, 0x88);
00648 x = _mm_shuffle_ps(lo0, lo1, 0x88);
00649 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
00650 z = z*vHi;
00651 x = x+y;
00652 x = x+z;
00653 stack_array[index+3] = x;
00654 min = _mm_min_ps( x, min );
00655
00656
00657 }
00658 }
00659
00660 size_t localCount = (count & -4L) - 4*index;
00661 if( localCount )
00662 {
00663
00664
00665 #ifdef __APPLE__
00666 vertices += localCount;
00667 float4 t0, t1, t2, t3, t4;
00668 size_t byteIndex = -(localCount) * sizeof(float);
00669 float4 * sap = &stack_array[index + localCount / 4];
00670
00671 asm volatile
00672 ( ".align 4 \n\
00673 0: movaps %[min], %[t2] // move min out of the way to avoid propagating NaNs in min \n\
00674 movaps (%[vertices], %[byteIndex], 4), %[t0] // vertices[0] \n\
00675 movaps 16(%[vertices], %[byteIndex], 4), %[t1] // vertices[1] \n\
00676 movaps %[t0], %[min] // vertices[0] \n\
00677 movlhps %[t1], %[min] // x0y0x1y1 \n\
00678 movaps 32(%[vertices], %[byteIndex], 4), %[t3] // vertices[2] \n\
00679 movaps 48(%[vertices], %[byteIndex], 4), %[t4] // vertices[3] \n\
00680 mulps %[vLo], %[min] // x0y0x1y1 * vLo \n\
00681 movhlps %[t0], %[t1] // z0w0z1w1 \n\
00682 movaps %[t3], %[t0] // vertices[2] \n\
00683 movlhps %[t4], %[t0] // x2y2x3y3 \n\
00684 movhlps %[t3], %[t4] // z2w2z3w3 \n\
00685 mulps %[vLo], %[t0] // x2y2x3y3 * vLo \n\
00686 shufps $0x88, %[t4], %[t1] // z0z1z2z3 \n\
00687 mulps %[vHi], %[t1] // z0z1z2z3 * vHi \n\
00688 movaps %[min], %[t3] // x0y0x1y1 * vLo \n\
00689 shufps $0x88, %[t0], %[min] // x0x1x2x3 * vLo.x \n\
00690 shufps $0xdd, %[t0], %[t3] // y0y1y2y3 * vLo.y \n\
00691 addps %[t3], %[min] // x + y \n\
00692 addps %[t1], %[min] // x + y + z \n\
00693 movaps %[min], (%[sap], %[byteIndex]) // record result for later scrutiny \n\
00694 minps %[t2], %[min] // record min, restore min \n\
00695 add $16, %[byteIndex] // advance loop counter\n\
00696 jnz 0b \n\
00697 "
00698 : [min] "+x" (min), [t0] "=&x" (t0), [t1] "=&x" (t1), [t2] "=&x" (t2), [t3] "=&x" (t3), [t4] "=&x" (t4), [byteIndex] "+r" (byteIndex)
00699 : [vLo] "x" (vLo), [vHi] "x" (vHi), [vertices] "r" (vertices), [sap] "r" (sap)
00700 : "memory", "cc"
00701 );
00702 index += localCount/4;
00703 #else
00704 {
00705 for( unsigned int i=0; i<localCount/4; i++,index++)
00706 {
00707 float4 v0 = vertices[0];
00708 float4 v1 = vertices[1];
00709 float4 v2 = vertices[2];
00710 float4 v3 = vertices[3];
00711 vertices += 4;
00712
00713 float4 lo0 = _mm_movelh_ps( v0, v1);
00714 float4 hi0 = _mm_movehl_ps( v1, v0);
00715 float4 lo1 = _mm_movelh_ps( v2, v3);
00716 float4 hi1 = _mm_movehl_ps( v3, v2);
00717
00718 lo0 = lo0*vLo;
00719 lo1 = lo1*vLo;
00720 float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
00721 float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
00722 float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
00723 z = z*vHi;
00724 x = x+y;
00725 x = x+z;
00726 stack_array[index] = x;
00727 min = _mm_min_ps( x, min );
00728 }
00729 }
00730
00731 #endif
00732 }
00733
00734
00735 if( count & 3 )
00736 {
00737 float4 v0, v1, v2, x, y, z;
00738 switch( count & 3 )
00739 {
00740 case 3:
00741 {
00742 v0 = vertices[0];
00743 v1 = vertices[1];
00744 v2 = vertices[2];
00745
00746
00747 float4 lo0 = _mm_movelh_ps( v0, v1);
00748 float4 hi0 = _mm_movehl_ps( v1, v0);
00749 lo0 = lo0*vLo;
00750 z = _mm_shuffle_ps(hi0, v2, 0xa8 );
00751 z = z*vHi;
00752 float4 lo1 = _mm_movelh_ps(v2, v2);
00753 lo1 = lo1*vLo;
00754 x = _mm_shuffle_ps(lo0, lo1, 0x88);
00755 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
00756 }
00757 break;
00758 case 2:
00759 {
00760 v0 = vertices[0];
00761 v1 = vertices[1];
00762 float4 xy = _mm_movelh_ps(v0, v1);
00763 z = _mm_movehl_ps(v1, v0);
00764 xy = xy*vLo;
00765 z = _mm_shuffle_ps( z, z, 0xa8);
00766 x = _mm_shuffle_ps( xy, xy, 0xa8);
00767 y = _mm_shuffle_ps( xy, xy, 0xfd);
00768 z = z*vHi;
00769 }
00770 break;
00771 case 1:
00772 {
00773 float4 xy = vertices[0];
00774 z = _mm_shuffle_ps( xy, xy, 0xaa);
00775 xy = xy*vLo;
00776 z = z*vHi;
00777 x = _mm_shuffle_ps(xy, xy, 0);
00778 y = _mm_shuffle_ps(xy, xy, 0x55);
00779 }
00780 break;
00781 }
00782 x = x+y;
00783 x = x+z;
00784 stack_array[index] = x;
00785 min = _mm_min_ps( x, min );
00786 index++;
00787 }
00788
00789
00790 if( 0 == segment || 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(min, dotmin)))
00791 {
00792
00793 min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0x4e));
00794 min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0xb1));
00795
00796
00797
00798
00799
00800
00801 dotmin = min;
00802
00803
00804 size_t test;
00805 for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], min))); index++ )
00806 {}
00807 minIndex = 4*index + segment + indexTable[test];
00808 }
00809
00810 _mm_store_ss( dotResult, dotmin);
00811 return minIndex;
00812 }
00813
00814
00815 #elif defined BT_USE_NEON
00816 #define ARM_NEON_GCC_COMPATIBILITY 1
00817 #include <arm_neon.h>
00818
00819
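// Two NEON code paths are provided: the _v0 variants work on 64-bit halves (float32x2), the _v1
// variants on full 128-bit quads. The exported function pointers initially target the _sel stubs,
// which query _get_cpu_capabilities() once (0x2000 is assumed here to be an Apple-private
// capability bit distinguishing the two classes of core) and rebind the pointer to the better
// variant, so the test is only paid on the first call.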
00820 static long _maxdot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult );
00821 static long _maxdot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult );
00822 static long _maxdot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult );
00823 static long _mindot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult );
00824 static long _mindot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult );
00825 static long _mindot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult );
00826
00827 long (*_maxdot_large)( const float *vv, const float *vec, unsigned long count, float *dotResult ) = _maxdot_large_sel;
00828 long (*_mindot_large)( const float *vv, const float *vec, unsigned long count, float *dotResult ) = _mindot_large_sel;
00829
00830 extern "C" {int _get_cpu_capabilities( void );}
00831
00832 static long _maxdot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult )
00833 {
00834 if( _get_cpu_capabilities() & 0x2000 )
00835 _maxdot_large = _maxdot_large_v1;
00836 else
00837 _maxdot_large = _maxdot_large_v0;
00838
00839 return _maxdot_large(vv, vec, count, dotResult);
00840 }
00841
00842 static long _mindot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult )
00843 {
00844 if( _get_cpu_capabilities() & 0x2000 )
00845 _mindot_large = _mindot_large_v1;
00846 else
00847 _mindot_large = _mindot_large_v0;
00848
00849 return _mindot_large(vv, vec, count, dotResult);
00850 }
00851
00852
00853
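// Load four floats from a 128-bit aligned pointer and post-increment the pointer. Written as inline
// asm so that both the :128 alignment hint and the post-increment addressing mode are emitted.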
00854 #define vld1q_f32_aligned_postincrement( _ptr ) ({ float32x4_t _r; asm( "vld1.f32 {%0}, [%1, :128]!\n" : "=w" (_r), "+r" (_ptr) ); _r; })
00855
00856
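// v0: carry four running maxima split across two 64-bit registers (dotMaxLo/dotMaxHi), with the
// matching candidate indices in iLo/iHi; the two halves are merged and reduced after the loops.
// The main loop is unrolled to eight vertices per iteration.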
00857 long _maxdot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult )
00858 {
00859 unsigned long i = 0;
00860 float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
00861 float32x2_t vLo = vget_low_f32(vvec);
00862 float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
00863 float32x2_t dotMaxLo = (float32x2_t) { -BT_INFINITY, -BT_INFINITY };
00864 float32x2_t dotMaxHi = (float32x2_t) { -BT_INFINITY, -BT_INFINITY };
00865 uint32x2_t indexLo = (uint32x2_t) {0, 1};
00866 uint32x2_t indexHi = (uint32x2_t) {2, 3};
00867 uint32x2_t iLo = (uint32x2_t) {-1, -1};
00868 uint32x2_t iHi = (uint32x2_t) {-1, -1};
00869 const uint32x2_t four = (uint32x2_t) {4,4};
00870
00871 for( ; i+8 <= count; i+= 8 )
00872 {
00873 float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
00874 float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
00875 float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
00876 float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
00877
00878 float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
00879 float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
00880 float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
00881 float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
00882
00883 float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
00884 float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
00885 float32x2_t zLo = vmul_f32( z0.val[0], vHi);
00886 float32x2_t zHi = vmul_f32( z1.val[0], vHi);
00887
00888 float32x2_t rLo = vpadd_f32( xy0, xy1);
00889 float32x2_t rHi = vpadd_f32( xy2, xy3);
00890 rLo = vadd_f32(rLo, zLo);
00891 rHi = vadd_f32(rHi, zHi);
00892
00893 uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
00894 uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
00895 dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
00896 dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
00897 iLo = vbsl_u32(maskLo, indexLo, iLo);
00898 iHi = vbsl_u32(maskHi, indexHi, iHi);
00899 indexLo = vadd_u32(indexLo, four);
00900 indexHi = vadd_u32(indexHi, four);
00901
00902 v0 = vld1q_f32_aligned_postincrement( vv );
00903 v1 = vld1q_f32_aligned_postincrement( vv );
00904 v2 = vld1q_f32_aligned_postincrement( vv );
00905 v3 = vld1q_f32_aligned_postincrement( vv );
00906
00907 xy0 = vmul_f32( vget_low_f32(v0), vLo);
00908 xy1 = vmul_f32( vget_low_f32(v1), vLo);
00909 xy2 = vmul_f32( vget_low_f32(v2), vLo);
00910 xy3 = vmul_f32( vget_low_f32(v3), vLo);
00911
00912 z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
00913 z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
00914 zLo = vmul_f32( z0.val[0], vHi);
00915 zHi = vmul_f32( z1.val[0], vHi);
00916
00917 rLo = vpadd_f32( xy0, xy1);
00918 rHi = vpadd_f32( xy2, xy3);
00919 rLo = vadd_f32(rLo, zLo);
00920 rHi = vadd_f32(rHi, zHi);
00921
00922 maskLo = vcgt_f32( rLo, dotMaxLo );
00923 maskHi = vcgt_f32( rHi, dotMaxHi );
00924 dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
00925 dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
00926 iLo = vbsl_u32(maskLo, indexLo, iLo);
00927 iHi = vbsl_u32(maskHi, indexHi, iHi);
00928 indexLo = vadd_u32(indexLo, four);
00929 indexHi = vadd_u32(indexHi, four);
00930 }
00931
00932 for( ; i+4 <= count; i+= 4 )
00933 {
00934 float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
00935 float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
00936 float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
00937 float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
00938
00939 float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
00940 float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
00941 float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
00942 float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
00943
00944 float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
00945 float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
00946 float32x2_t zLo = vmul_f32( z0.val[0], vHi);
00947 float32x2_t zHi = vmul_f32( z1.val[0], vHi);
00948
00949 float32x2_t rLo = vpadd_f32( xy0, xy1);
00950 float32x2_t rHi = vpadd_f32( xy2, xy3);
00951 rLo = vadd_f32(rLo, zLo);
00952 rHi = vadd_f32(rHi, zHi);
00953
00954 uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
00955 uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
00956 dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
00957 dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
00958 iLo = vbsl_u32(maskLo, indexLo, iLo);
00959 iHi = vbsl_u32(maskHi, indexHi, iHi);
00960 indexLo = vadd_u32(indexLo, four);
00961 indexHi = vadd_u32(indexHi, four);
00962 }
00963
00964 switch( count & 3 )
00965 {
00966 case 3:
00967 {
00968 float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
00969 float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
00970 float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
00971
00972 float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
00973 float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
00974 float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
00975
00976 float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
00977 float32x2_t zLo = vmul_f32( z0.val[0], vHi);
00978 float32x2_t zHi = vmul_f32( vdup_lane_f32(vget_high_f32(v2), 0), vHi);
00979
00980 float32x2_t rLo = vpadd_f32( xy0, xy1);
00981 float32x2_t rHi = vpadd_f32( xy2, xy2);
00982 rLo = vadd_f32(rLo, zLo);
00983 rHi = vadd_f32(rHi, zHi);
00984
00985 uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
00986 uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
00987 dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
00988 dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
00989 iLo = vbsl_u32(maskLo, indexLo, iLo);
00990 iHi = vbsl_u32(maskHi, indexHi, iHi);
00991 }
00992 break;
00993 case 2:
00994 {
00995 float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
00996 float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
00997
00998 float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
00999 float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
01000
01001 float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
01002 float32x2_t zLo = vmul_f32( z0.val[0], vHi);
01003
01004 float32x2_t rLo = vpadd_f32( xy0, xy1);
01005 rLo = vadd_f32(rLo, zLo);
01006
01007 uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
01008 dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
01009 iLo = vbsl_u32(maskLo, indexLo, iLo);
01010 }
01011 break;
01012 case 1:
01013 {
01014 float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
01015 float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
01016 float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
01017 float32x2_t zLo = vmul_f32( z0, vHi);
01018 float32x2_t rLo = vpadd_f32( xy0, xy0);
01019 rLo = vadd_f32(rLo, zLo);
01020 uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
01021 dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
01022 iLo = vbsl_u32(maskLo, indexLo, iLo);
01023 }
01024 break;
01025
01026 default:
01027 break;
01028 }
01029
01030
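    // Merge the two half-width accumulators, then fold the remaining pair of lanes down to the
    // single best value and its index.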
01031 uint32x2_t mask = vcgt_f32( dotMaxHi, dotMaxLo );
01032 dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
01033 iLo = vbsl_u32(mask, iHi, iLo);
01034
01035
01036 dotMaxHi = vdup_lane_f32(dotMaxLo, 1);
01037 iHi = vdup_lane_u32(iLo, 1);
01038 mask = vcgt_f32( dotMaxHi, dotMaxLo );
01039 dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
01040 iLo = vbsl_u32(mask, iHi, iLo);
01041
01042 *dotResult = vget_lane_f32( dotMaxLo, 0);
01043 return vget_lane_u32(iLo, 0);
01044 }
01045
01046
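// v1: the same algorithm on full 128-bit registers. vuzpq separates the x/y and z lanes of four
// vertices at once, giving four dot products per step, with the per-lane best value (maxDot) and
// index (index) carried through the loop and reduced at the end.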
01047 long _maxdot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult )
01048 {
01049 float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
01050 float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
01051 float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
01052 const uint32x4_t four = (uint32x4_t){ 4, 4, 4, 4 };
01053 uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3};
01054 uint32x4_t index = (uint32x4_t) { -1, -1, -1, -1 };
01055 float32x4_t maxDot = (float32x4_t) { -BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY };
01056
01057 unsigned long i = 0;
01058 for( ; i + 8 <= count; i += 8 )
01059 {
01060 float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
01061 float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
01062 float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
01063 float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
01064
01065
01066 float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
01067 float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
01068
01069 float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
01070 float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
01071
01072 xy0 = vmulq_f32(xy0, vLo);
01073 xy1 = vmulq_f32(xy1, vLo);
01074
01075 float32x4x2_t zb = vuzpq_f32( z0, z1);
01076 float32x4_t z = vmulq_f32( zb.val[0], vHi);
01077 float32x4x2_t xy = vuzpq_f32( xy0, xy1);
01078 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
01079 x = vaddq_f32(x, z);
01080
01081 uint32x4_t mask = vcgtq_f32(x, maxDot);
01082 maxDot = vbslq_f32( mask, x, maxDot);
01083 index = vbslq_u32(mask, local_index, index);
01084 local_index = vaddq_u32(local_index, four);
01085
01086 v0 = vld1q_f32_aligned_postincrement( vv );
01087 v1 = vld1q_f32_aligned_postincrement( vv );
01088 v2 = vld1q_f32_aligned_postincrement( vv );
01089 v3 = vld1q_f32_aligned_postincrement( vv );
01090
01091
01092 xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
01093 xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
01094
01095 z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
01096 z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
01097
01098 xy0 = vmulq_f32(xy0, vLo);
01099 xy1 = vmulq_f32(xy1, vLo);
01100
01101 zb = vuzpq_f32( z0, z1);
01102 z = vmulq_f32( zb.val[0], vHi);
01103 xy = vuzpq_f32( xy0, xy1);
01104 x = vaddq_f32(xy.val[0], xy.val[1]);
01105 x = vaddq_f32(x, z);
01106
01107 mask = vcgtq_f32(x, maxDot);
01108 maxDot = vbslq_f32( mask, x, maxDot);
01109 index = vbslq_u32(mask, local_index, index);
01110 local_index = vaddq_u32(local_index, four);
01111 }
01112
01113 for( ; i + 4 <= count; i += 4 )
01114 {
01115 float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
01116 float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
01117 float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
01118 float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
01119
01120
01121 float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
01122 float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
01123
01124 float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
01125 float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
01126
01127 xy0 = vmulq_f32(xy0, vLo);
01128 xy1 = vmulq_f32(xy1, vLo);
01129
01130 float32x4x2_t zb = vuzpq_f32( z0, z1);
01131 float32x4_t z = vmulq_f32( zb.val[0], vHi);
01132 float32x4x2_t xy = vuzpq_f32( xy0, xy1);
01133 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
01134 x = vaddq_f32(x, z);
01135
01136 uint32x4_t mask = vcgtq_f32(x, maxDot);
01137 maxDot = vbslq_f32( mask, x, maxDot);
01138 index = vbslq_u32(mask, local_index, index);
01139 local_index = vaddq_u32(local_index, four);
01140 }
01141
01142 switch (count & 3) {
01143 case 3:
01144 {
01145 float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
01146 float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
01147 float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
01148
01149
01150 float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
01151 float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v2));
01152
01153 float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
01154 float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v2));
01155
01156 xy0 = vmulq_f32(xy0, vLo);
01157 xy1 = vmulq_f32(xy1, vLo);
01158
01159 float32x4x2_t zb = vuzpq_f32( z0, z1);
01160 float32x4_t z = vmulq_f32( zb.val[0], vHi);
01161 float32x4x2_t xy = vuzpq_f32( xy0, xy1);
01162 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
01163 x = vaddq_f32(x, z);
01164
01165 uint32x4_t mask = vcgtq_f32(x, maxDot);
01166 maxDot = vbslq_f32( mask, x, maxDot);
01167 index = vbslq_u32(mask, local_index, index);
01168 local_index = vaddq_u32(local_index, four);
01169 }
01170 break;
01171
01172 case 2:
01173 {
01174 float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
01175 float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
01176
01177
01178 float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
01179
01180 float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
01181
01182 xy0 = vmulq_f32(xy0, vLo);
01183
01184 float32x4x2_t zb = vuzpq_f32( z0, z0);
01185 float32x4_t z = vmulq_f32( zb.val[0], vHi);
01186 float32x4x2_t xy = vuzpq_f32( xy0, xy0);
01187 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
01188 x = vaddq_f32(x, z);
01189
01190 uint32x4_t mask = vcgtq_f32(x, maxDot);
01191 maxDot = vbslq_f32( mask, x, maxDot);
01192 index = vbslq_u32(mask, local_index, index);
01193 local_index = vaddq_u32(local_index, four);
01194 }
01195 break;
01196
01197 case 1:
01198 {
01199 float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
01200
01201
01202 float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v0));
01203
01204 float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);
01205
01206 xy0 = vmulq_f32(xy0, vLo);
01207
01208 z = vmulq_f32( z, vHi);
01209 float32x4x2_t xy = vuzpq_f32( xy0, xy0);
01210 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
01211 x = vaddq_f32(x, z);
01212
01213 uint32x4_t mask = vcgtq_f32(x, maxDot);
01214 maxDot = vbslq_f32( mask, x, maxDot);
01215 index = vbslq_u32(mask, local_index, index);
01216 local_index = vaddq_u32(local_index, four);
01217 }
01218 break;
01219
01220 default:
01221 break;
01222 }
01223
01224
01225
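    // Horizontal reduction: compare the high and low halves of the four-lane accumulator, then the
    // remaining two lanes, keeping value and index together.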
01226 uint32x2_t mask = vcgt_f32( vget_high_f32(maxDot), vget_low_f32(maxDot));
01227 float32x2_t maxDot2 = vbsl_f32(mask, vget_high_f32(maxDot), vget_low_f32(maxDot));
01228 uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
01229
01230
01231 float32x2_t maxDotO = vdup_lane_f32(maxDot2, 1);
01232 uint32x2_t indexHi = vdup_lane_u32(index2, 1);
01233 mask = vcgt_f32( maxDotO, maxDot2 );
01234 maxDot2 = vbsl_f32(mask, maxDotO, maxDot2);
01235 index2 = vbsl_u32(mask, indexHi, index2);
01236
01237 *dotResult = vget_lane_f32( maxDot2, 0);
01238 return vget_lane_u32(index2, 0);
01239
01240 }
01241
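// Mirror of _maxdot_large_v0, with vclt/minimum selection and +BT_INFINITY as the starting value.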
01242 long _mindot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult )
01243 {
01244 unsigned long i = 0;
01245 float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
01246 float32x2_t vLo = vget_low_f32(vvec);
01247 float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
01248 float32x2_t dotMinLo = (float32x2_t) { BT_INFINITY, BT_INFINITY };
01249 float32x2_t dotMinHi = (float32x2_t) { BT_INFINITY, BT_INFINITY };
01250 uint32x2_t indexLo = (uint32x2_t) {0, 1};
01251 uint32x2_t indexHi = (uint32x2_t) {2, 3};
01252 uint32x2_t iLo = (uint32x2_t) {-1, -1};
01253 uint32x2_t iHi = (uint32x2_t) {-1, -1};
01254 const uint32x2_t four = (uint32x2_t) {4,4};
01255
01256 for( ; i+8 <= count; i+= 8 )
01257 {
01258 float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
01259 float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
01260 float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
01261 float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
01262
01263 float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
01264 float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
01265 float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
01266 float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
01267
01268 float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
01269 float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
01270 float32x2_t zLo = vmul_f32( z0.val[0], vHi);
01271 float32x2_t zHi = vmul_f32( z1.val[0], vHi);
01272
01273 float32x2_t rLo = vpadd_f32( xy0, xy1);
01274 float32x2_t rHi = vpadd_f32( xy2, xy3);
01275 rLo = vadd_f32(rLo, zLo);
01276 rHi = vadd_f32(rHi, zHi);
01277
01278 uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
01279 uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
01280 dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
01281 dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
01282 iLo = vbsl_u32(maskLo, indexLo, iLo);
01283 iHi = vbsl_u32(maskHi, indexHi, iHi);
01284 indexLo = vadd_u32(indexLo, four);
01285 indexHi = vadd_u32(indexHi, four);
01286
01287 v0 = vld1q_f32_aligned_postincrement( vv );
01288 v1 = vld1q_f32_aligned_postincrement( vv );
01289 v2 = vld1q_f32_aligned_postincrement( vv );
01290 v3 = vld1q_f32_aligned_postincrement( vv );
01291
01292 xy0 = vmul_f32( vget_low_f32(v0), vLo);
01293 xy1 = vmul_f32( vget_low_f32(v1), vLo);
01294 xy2 = vmul_f32( vget_low_f32(v2), vLo);
01295 xy3 = vmul_f32( vget_low_f32(v3), vLo);
01296
01297 z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
01298 z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
01299 zLo = vmul_f32( z0.val[0], vHi);
01300 zHi = vmul_f32( z1.val[0], vHi);
01301
01302 rLo = vpadd_f32( xy0, xy1);
01303 rHi = vpadd_f32( xy2, xy3);
01304 rLo = vadd_f32(rLo, zLo);
01305 rHi = vadd_f32(rHi, zHi);
01306
01307 maskLo = vclt_f32( rLo, dotMinLo );
01308 maskHi = vclt_f32( rHi, dotMinHi );
01309 dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
01310 dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
01311 iLo = vbsl_u32(maskLo, indexLo, iLo);
01312 iHi = vbsl_u32(maskHi, indexHi, iHi);
01313 indexLo = vadd_u32(indexLo, four);
01314 indexHi = vadd_u32(indexHi, four);
01315 }
01316
01317 for( ; i+4 <= count; i+= 4 )
01318 {
01319 float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
01320 float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
01321 float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
01322 float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
01323
01324 float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
01325 float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
01326 float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
01327 float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
01328
01329 float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
01330 float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
01331 float32x2_t zLo = vmul_f32( z0.val[0], vHi);
01332 float32x2_t zHi = vmul_f32( z1.val[0], vHi);
01333
01334 float32x2_t rLo = vpadd_f32( xy0, xy1);
01335 float32x2_t rHi = vpadd_f32( xy2, xy3);
01336 rLo = vadd_f32(rLo, zLo);
01337 rHi = vadd_f32(rHi, zHi);
01338
01339 uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
01340 uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
01341 dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
01342 dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
01343 iLo = vbsl_u32(maskLo, indexLo, iLo);
01344 iHi = vbsl_u32(maskHi, indexHi, iHi);
01345 indexLo = vadd_u32(indexLo, four);
01346 indexHi = vadd_u32(indexHi, four);
01347 }
01348 switch( count & 3 )
01349 {
01350 case 3:
01351 {
01352 float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
01353 float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
01354 float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
01355
01356 float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
01357 float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
01358 float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
01359
01360 float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
01361 float32x2_t zLo = vmul_f32( z0.val[0], vHi);
01362 float32x2_t zHi = vmul_f32( vdup_lane_f32(vget_high_f32(v2), 0), vHi);
01363
01364 float32x2_t rLo = vpadd_f32( xy0, xy1);
01365 float32x2_t rHi = vpadd_f32( xy2, xy2);
01366 rLo = vadd_f32(rLo, zLo);
01367 rHi = vadd_f32(rHi, zHi);
01368
01369 uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
01370 uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
01371 dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
01372 dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
01373 iLo = vbsl_u32(maskLo, indexLo, iLo);
01374 iHi = vbsl_u32(maskHi, indexHi, iHi);
01375 }
01376 break;
01377 case 2:
01378 {
01379 float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
01380 float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
01381
01382 float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
01383 float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
01384
01385 float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
01386 float32x2_t zLo = vmul_f32( z0.val[0], vHi);
01387
01388 float32x2_t rLo = vpadd_f32( xy0, xy1);
01389 rLo = vadd_f32(rLo, zLo);
01390
01391 uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
01392 dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
01393 iLo = vbsl_u32(maskLo, indexLo, iLo);
01394 }
01395 break;
01396 case 1:
01397 {
01398 float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
01399 float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
01400 float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
01401 float32x2_t zLo = vmul_f32( z0, vHi);
01402 float32x2_t rLo = vpadd_f32( xy0, xy0);
01403 rLo = vadd_f32(rLo, zLo);
01404 uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
01405 dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
01406 iLo = vbsl_u32(maskLo, indexLo, iLo);
01407 }
01408 break;
01409
01410 default:
01411 break;
01412 }
01413
01414
01415 uint32x2_t mask = vclt_f32( dotMinHi, dotMinLo );
01416 dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
01417 iLo = vbsl_u32(mask, iHi, iLo);
01418
01419
01420 dotMinHi = vdup_lane_f32(dotMinLo, 1);
01421 iHi = vdup_lane_u32(iLo, 1);
01422 mask = vclt_f32( dotMinHi, dotMinLo );
01423 dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
01424 iLo = vbsl_u32(mask, iHi, iLo);
01425
01426 *dotResult = vget_lane_f32( dotMinLo, 0);
01427 return vget_lane_u32(iLo, 0);
01428 }
01429
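// Mirror of _maxdot_large_v1, with vcltq/minimum selection and +BT_INFINITY as the starting value.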
01430 long _mindot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult )
01431 {
01432 float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
01433 float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
01434 float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
01435 const uint32x4_t four = (uint32x4_t){ 4, 4, 4, 4 };
01436 uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3};
01437 uint32x4_t index = (uint32x4_t) { -1, -1, -1, -1 };
01438 float32x4_t minDot = (float32x4_t) { BT_INFINITY, BT_INFINITY, BT_INFINITY, BT_INFINITY };
01439
01440 unsigned long i = 0;
01441 for( ; i + 8 <= count; i += 8 )
01442 {
01443 float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
01444 float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
01445 float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
01446 float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
01447
01448
01449 float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
01450 float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
01451
01452 float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
01453 float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
01454
01455 xy0 = vmulq_f32(xy0, vLo);
01456 xy1 = vmulq_f32(xy1, vLo);
01457
01458 float32x4x2_t zb = vuzpq_f32( z0, z1);
01459 float32x4_t z = vmulq_f32( zb.val[0], vHi);
01460 float32x4x2_t xy = vuzpq_f32( xy0, xy1);
01461 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
01462 x = vaddq_f32(x, z);
01463
01464 uint32x4_t mask = vcltq_f32(x, minDot);
01465 minDot = vbslq_f32( mask, x, minDot);
01466 index = vbslq_u32(mask, local_index, index);
01467 local_index = vaddq_u32(local_index, four);
01468
01469 v0 = vld1q_f32_aligned_postincrement( vv );
01470 v1 = vld1q_f32_aligned_postincrement( vv );
01471 v2 = vld1q_f32_aligned_postincrement( vv );
01472 v3 = vld1q_f32_aligned_postincrement( vv );
01473
01474
01475 xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
01476 xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
01477
01478 z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
01479 z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
01480
01481 xy0 = vmulq_f32(xy0, vLo);
01482 xy1 = vmulq_f32(xy1, vLo);
01483
01484 zb = vuzpq_f32( z0, z1);
01485 z = vmulq_f32( zb.val[0], vHi);
01486 xy = vuzpq_f32( xy0, xy1);
01487 x = vaddq_f32(xy.val[0], xy.val[1]);
01488 x = vaddq_f32(x, z);
01489
01490 mask = vcltq_f32(x, minDot);
01491 minDot = vbslq_f32( mask, x, minDot);
01492 index = vbslq_u32(mask, local_index, index);
01493 local_index = vaddq_u32(local_index, four);
01494 }
01495
01496 for( ; i + 4 <= count; i += 4 )
01497 {
01498 float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
01499 float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
01500 float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
01501 float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
01502
01503
01504 float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
01505 float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
01506
01507 float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
01508 float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
01509
01510 xy0 = vmulq_f32(xy0, vLo);
01511 xy1 = vmulq_f32(xy1, vLo);
01512
01513 float32x4x2_t zb = vuzpq_f32( z0, z1);
01514 float32x4_t z = vmulq_f32( zb.val[0], vHi);
01515 float32x4x2_t xy = vuzpq_f32( xy0, xy1);
01516 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
01517 x = vaddq_f32(x, z);
01518
01519 uint32x4_t mask = vcltq_f32(x, minDot);
01520 minDot = vbslq_f32( mask, x, minDot);
01521 index = vbslq_u32(mask, local_index, index);
01522 local_index = vaddq_u32(local_index, four);
01523 }
01524
01525 switch (count & 3) {
01526 case 3:
01527 {
01528 float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
01529 float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
01530 float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
01531
01532
01533 float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
01534 float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v2));
01535
01536 float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
01537 float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v2));
01538
01539 xy0 = vmulq_f32(xy0, vLo);
01540 xy1 = vmulq_f32(xy1, vLo);
01541
01542 float32x4x2_t zb = vuzpq_f32( z0, z1);
01543 float32x4_t z = vmulq_f32( zb.val[0], vHi);
01544 float32x4x2_t xy = vuzpq_f32( xy0, xy1);
01545 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
01546 x = vaddq_f32(x, z);
01547
01548 uint32x4_t mask = vcltq_f32(x, minDot);
01549 minDot = vbslq_f32( mask, x, minDot);
01550 index = vbslq_u32(mask, local_index, index);
01551 local_index = vaddq_u32(local_index, four);
01552 }
01553 break;
01554
01555 case 2:
01556 {
01557 float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
01558 float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
01559
01560
01561 float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
01562
01563 float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
01564
01565 xy0 = vmulq_f32(xy0, vLo);
01566
01567 float32x4x2_t zb = vuzpq_f32( z0, z0);
01568 float32x4_t z = vmulq_f32( zb.val[0], vHi);
01569 float32x4x2_t xy = vuzpq_f32( xy0, xy0);
01570 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
01571 x = vaddq_f32(x, z);
01572
01573 uint32x4_t mask = vcltq_f32(x, minDot);
01574 minDot = vbslq_f32( mask, x, minDot);
01575 index = vbslq_u32(mask, local_index, index);
01576 local_index = vaddq_u32(local_index, four);
01577 }
01578 break;
01579
01580 case 1:
01581 {
01582 float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
01583
01584
01585 float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v0));
01586
01587 float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);
01588
01589 xy0 = vmulq_f32(xy0, vLo);
01590
01591 z = vmulq_f32( z, vHi);
01592 float32x4x2_t xy = vuzpq_f32( xy0, xy0);
01593 float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
01594 x = vaddq_f32(x, z);
01595
01596 uint32x4_t mask = vcltq_f32(x, minDot);
01597 minDot = vbslq_f32( mask, x, minDot);
01598 index = vbslq_u32(mask, local_index, index);
01599 local_index = vaddq_u32(local_index, four);
01600 }
01601 break;
01602
01603 default:
01604 break;
01605 }
01606
01607
01608
01609 uint32x2_t mask = vclt_f32( vget_high_f32(minDot), vget_low_f32(minDot));
01610 float32x2_t minDot2 = vbsl_f32(mask, vget_high_f32(minDot), vget_low_f32(minDot));
01611 uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
01612
01613
01614 float32x2_t minDotO = vdup_lane_f32(minDot2, 1);
01615 uint32x2_t indexHi = vdup_lane_u32(index2, 1);
01616 mask = vclt_f32( minDotO, minDot2 );
01617 minDot2 = vbsl_f32(mask, minDotO, minDot2);
01618 index2 = vbsl_u32(mask, indexHi, index2);
01619
01620 *dotResult = vget_lane_f32( minDot2, 0);
01621 return vget_lane_u32(index2, 0);
01622
01623 }
01624
01625 #else
01626 #error Unhandled __APPLE__ arch
01627 #endif // defined BT_USE_SSE || defined _WIN32 (x86 path) / BT_USE_NEON (ARM path)
01628
01629 #endif // defined (BT_USE_SSE) || defined (BT_USE_NEON)
01630
01631