00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
// Converts a position into integer grid-cell coordinates.
// For each axis the world origin stored in BT_GPU_params is subtracted, the
// result is divided by the cell size for that axis, and floor() is applied
// before the cast to int. Only p.x, p.y and p.z are read.
// The returned coordinates are NOT clamped to the grid dimensions.
BT_GPU___device__ int3 bt3DGrid_calcGridPos(float4 p)
{
	int3 cell;
	cell.x = (int)floor((p.x - BT_GPU_params.m_worldOriginX) / BT_GPU_params.m_cellSizeX);
	cell.y = (int)floor((p.y - BT_GPU_params.m_worldOriginY) / BT_GPU_params.m_cellSizeY);
	cell.z = (int)floor((p.z - BT_GPU_params.m_worldOriginZ) / BT_GPU_params.m_cellSizeZ);
	return cell;
}
00039
00040
00041
00042
// Computes the linear cell index (hash) for a grid-cell coordinate.
// Each component is first clamped to [0, gridSize - 1] on its axis, then the
// index is formed as (z * gridSizeY) * gridSizeX + y * gridSizeX + x using
// BT_GPU___mul24 for the multiplications.
BT_GPU___device__ uint bt3DGrid_calcGridHash(int3 gridPos)
{
	// Highest valid coordinate on each axis.
	const int lastX = (int)BT_GPU_params.m_gridSizeX - 1;
	const int lastY = (int)BT_GPU_params.m_gridSizeY - 1;
	const int lastZ = (int)BT_GPU_params.m_gridSizeZ - 1;

	// Clamp in place (gridPos is a by-value copy).
	gridPos.x = BT_GPU_max(0, BT_GPU_min(gridPos.x, lastX));
	gridPos.y = BT_GPU_max(0, BT_GPU_min(gridPos.y, lastY));
	gridPos.z = BT_GPU_max(0, BT_GPU_min(gridPos.z, lastZ));

	return BT_GPU___mul24(BT_GPU___mul24(gridPos.z, BT_GPU_params.m_gridSizeY), BT_GPU_params.m_gridSizeX)
		+ BT_GPU___mul24(gridPos.y, BT_GPU_params.m_gridSizeX)
		+ gridPos.x;
}
00050
00051
00052
00053
// Kernel: one thread per body. Computes the grid hash of the centre of each
// body's AABB and stores (hash, body index) into pHash.
//   pAABB     - AABB array; entry 2*i is read as the minimum corner and
//               entry 2*i + 1 as the maximum corner of body i
//   pHash     - output, one uint2 per body: .x = cell hash, .y = body index
//   numBodies - number of bodies; threads with an index at or beyond this
//               value do nothing
BT_GPU___global__ void calcHashAABBD(bt3DGrid3F1U* pAABB, uint2* pHash, uint numBodies)
{
	const int bodyIdx = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
	if(bodyIdx < (int)numBodies)
	{
		const bt3DGrid3F1U lo = pAABB[bodyIdx*2];
		const bt3DGrid3F1U hi = pAABB[bodyIdx*2 + 1];

		// Centre of the box; the w component is left unset because
		// bt3DGrid_calcGridPos only reads x, y and z.
		float4 centre;
		centre.x = (lo.fx + hi.fx) * 0.5f;
		centre.y = (lo.fy + hi.fy) * 0.5f;
		centre.z = (lo.fz + hi.fz) * 0.5f;

		const int3 cell = bt3DGrid_calcGridPos(centre);
		const uint cellHash = bt3DGrid_calcGridHash(cell);

		pHash[bodyIdx] = BT_GPU_make_uint2(cellHash, bodyIdx);
	}
}
00073
00074
00075
// Kernel: one thread per entry of pHash. Records, for every cell hash that
// occurs, the index of the first pHash entry carrying that hash.
//   pHash     - array of (hash, body index) pairs; the comparison with the
//               preceding entry only yields "first entry of a cell" if the
//               array is ordered by hash (presumably sorted by the caller)
//   cellStart - output, indexed by cell hash; only cells that have at least
//               one entry are written here
//   numBodies - number of valid entries in pHash
// The shared array holds blockDim.x + 1 hashes, so the kernel must be launched
// with at most 256 threads per block.
//
// Threads past the end of the data must NOT return early: every thread of the
// block has to reach the barrier below, otherwise the barrier is executed in
// divergent control flow in the last, partially filled block.
BT_GPU___global__ void findCellStartD(uint2* pHash, uint* cellStart, uint numBodies)
{
	int index = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
	const bool active = (index < (int)numBodies);

	// Slot t + 1 holds the hash of thread t; slot 0 holds the hash of the
	// entry that precedes the first entry handled by this block.
	BT_GPU___shared__ uint sharedHash[257];

	uint2 sortedData = BT_GPU_make_uint2(0, 0);
	if(active)
	{
		sortedData = pHash[index];
		sharedHash[BT_GPU_threadIdx.x+1] = sortedData.x;
		if((index > 0) && (BT_GPU_threadIdx.x == 0))
		{
			// First thread of the block also fetches the previous entry's hash.
			volatile uint2 prevData = pHash[index-1];
			sharedHash[0] = prevData.x;
		}
	}

	// Reached by every thread of the block, active or not.
	BT_GPU___syncthreads();

	// sharedHash[threadIdx.x] is the hash of the preceding entry; a change of
	// hash (or being the very first entry) marks the start of a cell.
	if(active && ((index == 0) || (sortedData.x != sharedHash[BT_GPU_threadIdx.x])))
	{
		cellStart[sortedData.x] = index;
	}
}
00101
00102
00103
// Tests two axis-aligned boxes for overlap.
// Returns non-zero when the [min, max] intervals of box 0 and box 1 intersect
// on all three axes (touching boxes count as overlapping because the
// comparisons are <=), and 0 otherwise. Only the fx/fy/fz members are read.
BT_GPU___device__ uint cudaTestAABBOverlap(bt3DGrid3F1U min0, bt3DGrid3F1U max0, bt3DGrid3F1U min1, bt3DGrid3F1U max1)
{
	const bool overlapX = (min0.fx <= max1.fx) && (min1.fx <= max0.fx);
	const bool overlapY = (min0.fy <= max1.fy) && (min1.fy <= max0.fy);
	const bool overlapZ = (min0.fz <= max1.fz) && (min1.fz <= max0.fz);
	return overlapX && overlapY && overlapZ;
}
00110
00111
00112
// Tests the body referenced by pHash[index] against all bodies stored in one
// grid cell and updates that body's pair list.
//   gridPos            - cell to examine; cells outside the grid are skipped
//   index              - position of the current body in pHash
//   pHash              - (cell hash, body index) pairs
//   pCellStart         - first pHash position per cell hash, 0xffffffff if
//                        the cell has no entry
//   pAABB              - AABBs, min at 2*i and max at 2*i + 1; the uw member
//                        of the min entry is used as the body's handle index
//   pPairBuff          - pair storage shared by all bodies
//   pPairBuffStartCurr - per handle: .x = start offset in pPairBuff,
//                        .y = number of pairs currently stored
//   numBodies          - number of entries in pHash
// Only partners whose body index is smaller than the current body's index are
// considered. A partner already present in the list gets
// BT_3DGRID_PAIR_FOUND_FLG set; a new partner is appended with
// BT_3DGRID_PAIR_NEW_FLG unless the list has reached its capacity.
BT_GPU___device__ void findPairsInCell(	int3 gridPos,
										uint index,
										uint2* pHash,
										uint* pCellStart,
										bt3DGrid3F1U* pAABB,
										uint* pPairBuff,
										uint2* pPairBuffStartCurr,
										uint numBodies)
{
	// Reject cells that lie outside the grid.
	const int lastX = (int)BT_GPU_params.m_gridSizeX - 1;
	const int lastY = (int)BT_GPU_params.m_gridSizeY - 1;
	const int lastZ = (int)BT_GPU_params.m_gridSizeZ - 1;
	const bool insideGrid =    (gridPos.x >= 0) && (gridPos.x <= lastX)
							&& (gridPos.y >= 0) && (gridPos.y <= lastY)
							&& (gridPos.z >= 0) && (gridPos.z <= lastZ);
	if(!insideGrid)
	{
		return;
	}

	const uint gridHash = bt3DGrid_calcGridHash(gridPos);

	// 0xffffffff marks a cell without any entry.
	const uint bucketStart = pCellStart[gridHash];
	if(bucketStart == 0xffffffff)
	{
		return;
	}

	// Current body: AABB, handle and its slice of the pair buffer.
	const uint2 self = pHash[index];
	const uint unsorted_indx = self.y;
	bt3DGrid3F1U min0 = BT_GPU_FETCH(pAABB, unsorted_indx*2);
	bt3DGrid3F1U max0 = BT_GPU_FETCH(pAABB, unsorted_indx*2 + 1);
	const uint handleIndex = min0.uw;

	const uint2 ownRange = pPairBuffStartCurr[handleIndex];
	const uint2 nextRange = pPairBuffStartCurr[handleIndex+1];
	const uint start = ownRange.x;
	uint curr = ownRange.y;
	// Capacity: distance to the next handle's start, minus one slot.
	const uint curr_max = nextRange.x - start - 1;

	// Scan at most m_maxBodiesPerCell entries, never past the end of pHash.
	uint bucketEnd = bucketStart + BT_GPU_params.m_maxBodiesPerCell;
	if(bucketEnd > numBodies)
	{
		bucketEnd = numBodies;
	}

	for(uint slot = bucketStart; slot < bucketEnd; slot++)
	{
		const uint2 other = pHash[slot];
		if(other.x != gridHash)
		{
			break; // left the run of entries belonging to this cell
		}
		const uint unsorted_indx2 = other.y;
		if(unsorted_indx2 >= unsorted_indx)
		{
			continue; // only partners with a smaller body index are tested
		}
		bt3DGrid3F1U min1 = BT_GPU_FETCH(pAABB, unsorted_indx2*2);
		bt3DGrid3F1U max1 = BT_GPU_FETCH(pAABB, unsorted_indx2*2 + 1);
		if(!cudaTestAABBOverlap(min0, max0, min1, max1))
		{
			continue;
		}
		const uint handleIndex2 = min1.uw;

		// Look for the partner among the pairs already stored (flags masked off).
		uint k = 0;
		while((k < curr) && ((pPairBuff[start+k] & (~BT_3DGRID_PAIR_ANY_FLG)) != handleIndex2))
		{
			k++;
		}
		if(k < curr)
		{
			// Known pair: mark it as found again.
			pPairBuff[start+k] |= BT_3DGRID_PAIR_FOUND_FLG;
			continue;
		}

		// Unknown pair: append it, or stop scanning when the list is full.
		if(curr >= curr_max)
		{
			break;
		}
		pPairBuff[start+curr] = handleIndex2 | BT_3DGRID_PAIR_NEW_FLG;
		curr++;
	}

	// Publish the (possibly grown) pair count.
	pPairBuffStartCurr[handleIndex] = BT_GPU_make_uint2(start, curr);
	return;
}
00188
00189
00190
// Kernel: one thread per pHash entry. Locates the grid cell containing the
// centre of the body's AABB and runs findPairsInCell on that cell and its
// 26 neighbours (3 x 3 x 3 block, x varying fastest, then y, then z).
//   pAABB              - AABBs, min at 2*i and max at 2*i + 1
//   pHash              - (cell hash, body index) pairs
//   pCellStart         - first pHash position per cell hash
//   pPairBuff          - pair storage
//   pPairBuffStartCurr - per-handle (start, count) into pPairBuff
//   numBodies          - number of entries in pHash; extra threads exit
BT_GPU___global__ void findOverlappingPairsD(	bt3DGrid3F1U* pAABB, uint2* pHash, uint* pCellStart,
												uint* pPairBuff, uint2* pPairBuffStartCurr, uint numBodies)
{
	int index = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
	if(index >= (int)numBodies)
	{
		return;
	}

	// Body referenced by this pHash entry.
	const uint2 entry = pHash[index];
	const uint bodyIdx = entry.y;
	bt3DGrid3F1U lo = BT_GPU_FETCH(pAABB, bodyIdx*2);
	bt3DGrid3F1U hi = BT_GPU_FETCH(pAABB, bodyIdx*2 + 1);

	// Centre of the AABB (w is not used by bt3DGrid_calcGridPos).
	float4 centre;
	centre.x = (lo.fx + hi.fx) * 0.5f;
	centre.y = (lo.fy + hi.fy) * 0.5f;
	centre.z = (lo.fz + hi.fz) * 0.5f;

	const int3 homeCell = bt3DGrid_calcGridPos(centre);

	// Visit the home cell and all adjacent cells.
	for(int dz = -1; dz <= 1; dz++)
	{
		for(int dy = -1; dy <= 1; dy++)
		{
			for(int dx = -1; dx <= 1; dx++)
			{
				findPairsInCell(homeCell + BT_GPU_make_int3(dx, dy, dz), index, pHash, pCellStart, pAABB, pPairBuff, pPairBuffStartCurr, numBodies);
			}
		}
	}
}
00218
00219
00220
// Kernel: one thread per pHash entry. Tests the body referenced by
// pHash[index] against every "large" body and updates the body's pair list.
//   pAABB              - AABBs, min at 2*i and max at 2*i + 1; the large
//                        bodies occupy body indices numBodies ..
//                        numBodies + numLarge - 1; the uw member of a min
//                        entry is used as the handle index
//   pHash              - (cell hash, body index) pairs
//   pCellStart         - not used by this kernel
//   pPairBuff          - pair storage
//   pPairBuffStartCurr - per handle: .x = start offset in pPairBuff,
//                        .y = number of pairs currently stored
//   numBodies          - number of entries in pHash; extra threads exit
//   numLarge           - number of large bodies to test against
// A partner already in the list gets BT_3DGRID_PAIR_FOUND_FLG set; a new
// partner is appended with BT_3DGRID_PAIR_NEW_FLG. The capacity check is done
// BEFORE the store (as in findPairsInCell), so nothing is written once the
// list is full.
BT_GPU___global__ void findPairsLargeD(	bt3DGrid3F1U* pAABB, uint2* pHash, uint* pCellStart, uint* pPairBuff,
										uint2* pPairBuffStartCurr, uint numBodies, uint numLarge)
{
	int index = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
	if(index >= (int)numBodies)
	{
		return;
	}
	uint2 sortedData = pHash[index];
	uint unsorted_indx = sortedData.y;
	bt3DGrid3F1U min0 = BT_GPU_FETCH(pAABB, unsorted_indx*2);
	bt3DGrid3F1U max0 = BT_GPU_FETCH(pAABB, unsorted_indx*2 + 1);
	uint handleIndex = min0.uw;
	uint2 start_curr = pPairBuffStartCurr[handleIndex];
	uint start = start_curr.x;
	uint curr = start_curr.y;
	uint2 start_curr_next = pPairBuffStartCurr[handleIndex+1];
	// Capacity: distance to the next handle's start, minus one slot.
	uint curr_max = start_curr_next.x - start - 1;
	for(uint i = 0; i < numLarge; i++)
	{
		// Large bodies are stored after the numBodies regular ones.
		uint indx2 = numBodies + i;
		bt3DGrid3F1U min1 = BT_GPU_FETCH(pAABB, indx2*2);
		bt3DGrid3F1U max1 = BT_GPU_FETCH(pAABB, indx2*2 + 1);
		if(cudaTestAABBOverlap(min0, max0, min1, max1))
		{
			uint k;
			uint handleIndex2 = min1.uw;
			// Search the stored pairs (flags masked off) for this partner.
			for(k = 0; k < curr; k++)
			{
				uint old_pair = pPairBuff[start+k] & (~BT_3DGRID_PAIR_ANY_FLG);
				if(old_pair == handleIndex2)
				{
					pPairBuff[start+k] |= BT_3DGRID_PAIR_FOUND_FLG;
					break;
				}
			}
			if(k == curr)
			{
				// Not stored yet: append only if there is room left.
				if(curr >= curr_max)
				{
					break;
				}
				pPairBuff[start+curr] = handleIndex2 | BT_3DGRID_PAIR_NEW_FLG;
				curr++;
			}
		}
	}
	// Publish the (possibly grown) pair count.
	pPairBuffStartCurr[handleIndex] = BT_GPU_make_uint2(start, curr);
	return;
}
00271
00272
00273
// Kernel: one thread per body. Counts how many entries of the body's pair
// list do NOT have BT_3DGRID_PAIR_FOUND_FLG set and stores that count in
// pPairScan[index + 1].
//   pPairBuff          - pair storage
//   pPairBuffStartCurr - per handle: .x = start offset, .y = pair count
//   pPairScan          - output counts, shifted by one position
//   pAABB              - AABBs; the uw member of entry 2*index is used as the
//                        handle index
//   numBodies          - number of bodies; extra threads exit
BT_GPU___global__ void computePairCacheChangesD(uint* pPairBuff, uint2* pPairBuffStartCurr,
												uint* pPairScan, bt3DGrid3F1U* pAABB, uint numBodies)
{
	int index = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
	if(index >= (int)numBodies)
	{
		return;
	}

	const bt3DGrid3F1U lo = pAABB[index * 2];
	const uint handle = lo.uw;
	const uint2 range = pPairBuffStartCurr[handle];
	const uint first = range.x;
	const uint count = range.y;

	uint changed = 0;
	for(uint k = 0; k < count; k++)
	{
		if((pPairBuff[first + k] & BT_3DGRID_PAIR_FOUND_FLG) == 0)
		{
			changed++;
		}
	}
	pPairScan[index+1] = changed;
}
00298
00299
00300
// Kernel: one thread per body. Walks the body's pair list once and
//   - copies every entry WITHOUT BT_3DGRID_PAIR_FOUND_FLG to pPairOut,
//     starting at offset pPairScan[index];
//   - compacts, in place, every entry that has any of the
//     BT_3DGRID_PAIR_ANY_FLG bits set, clearing those bits;
//   - stores the number of entries kept as the body's new pair count.
//   pPairBuff          - pair storage (compacted in place)
//   pPairBuffStartCurr - per handle: .x = start offset, .y = pair count
//   pPairScan          - output offset per body into pPairOut
//   pPairOut           - receives the entries lacking the FOUND flag
//   pAABB              - AABBs; the uw member of entry 2*index is used as the
//                        handle index
//   numBodies          - number of bodies; extra threads exit
BT_GPU___global__ void squeezeOverlappingPairBuffD(uint* pPairBuff, uint2* pPairBuffStartCurr, uint* pPairScan,
												   uint* pPairOut, bt3DGrid3F1U* pAABB, uint numBodies)
{
	int index = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
	if(index >= (int)numBodies)
	{
		return;
	}

	const bt3DGrid3F1U lo = pAABB[index * 2];
	const uint handle = lo.uw;
	const uint2 range = pPairBuffStartCurr[handle];
	const uint first = range.x;
	const uint count = range.y;

	uint* pChanged = pPairOut + pPairScan[index];	// next free slot in the output
	uint kept = 0;									// entries retained in place so far

	for(uint k = 0; k < count; k++)
	{
		if((pPairBuff[first + k] & BT_3DGRID_PAIR_FOUND_FLG) == 0)
		{
			*pChanged = pPairBuff[first + k];
			pChanged++;
		}
		if((pPairBuff[first + k] & BT_3DGRID_PAIR_ANY_FLG) != 0)
		{
			// kept <= k, so this never overwrites an entry not yet visited.
			pPairBuff[first + kept] = pPairBuff[first + k] & (~BT_3DGRID_PAIR_ANY_FLG);
			kept++;
		}
	}
	pPairBuffStartCurr[handle] = BT_GPU_make_uint2(first, kept);
}
00334
00335
00336
00337
00338
00339
00340
00341
00342
00343
00344
00345
00346 extern "C"
00347 {
00348
00349
00350
// Host-side launcher for the calcHashAABBD kernel.
// Obtains the launch configuration from computeGridSize (256 is passed as its
// second argument, presumably the desired block size), runs the kernel with
// `hash` reinterpreted as a uint2 array, then checks for a kernel error.
void BT_GPU_PREF(calcHashAABB)(bt3DGrid3F1U* pAABB, unsigned int* hash, unsigned int numBodies)
{
	int blockCount;
	int threadsPerBlock;
	BT_GPU_PREF(computeGridSize)(numBodies, 256, blockCount, threadsPerBlock);

	BT_GPU_EXECKERNEL(blockCount, threadsPerBlock, calcHashAABBD, (pAABB, (uint2*)hash, numBodies));

	BT_GPU_CHECK_ERROR("calcHashAABBD kernel execution failed");
}
00360
00361
00362
// Host-side launcher for the findCellStartD kernel.
// Fills all numCells entries of cellStart with 0xff bytes first, so cells
// that the kernel never writes read back as 0xffffffff, then launches the
// kernel over numBodies entries of `hash` (reinterpreted as uint2) and checks
// for a kernel error. 256 is passed to computeGridSize, matching the kernel's
// 257-entry shared array.
void BT_GPU_PREF(findCellStart(unsigned int* hash, unsigned int* cellStart, unsigned int numBodies, unsigned int numCells))
{
	int blockCount;
	int threadsPerBlock;
	BT_GPU_PREF(computeGridSize)(numBodies, 256, blockCount, threadsPerBlock);

	// Mark every cell as empty before the kernel records the occupied ones.
	BT_GPU_SAFE_CALL(BT_GPU_Memset(cellStart, 0xffffffff, numCells*sizeof(uint)));

	BT_GPU_EXECKERNEL(blockCount, threadsPerBlock, findCellStartD, ((uint2*)hash, (uint*)cellStart, numBodies));
	BT_GPU_CHECK_ERROR("Kernel execution failed: findCellStartD");
}
00371
00372
00373
// Host-side launcher for the findOverlappingPairsD kernel.
// When B_CUDA_USE_TEX is enabled, the AABB array (2 entries per body) is bound
// to the pAABBTex texture for the duration of the launch and unbound again
// afterwards. 64 is passed to computeGridSize (presumably the block size).
void BT_GPU_PREF(findOverlappingPairs(bt3DGrid3F1U* pAABB, unsigned int* pHash, unsigned int* pCellStart, unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int numBodies))
{
#if B_CUDA_USE_TEX
	BT_GPU_SAFE_CALL(cudaBindTexture(0, pAABBTex, pAABB, numBodies * 2 * sizeof(bt3DGrid3F1U)));
#endif

	int blockCount;
	int threadsPerBlock;
	BT_GPU_PREF(computeGridSize)(numBodies, 64, blockCount, threadsPerBlock);

	BT_GPU_EXECKERNEL(blockCount, threadsPerBlock, findOverlappingPairsD, (pAABB,(uint2*)pHash,(uint*)pCellStart,(uint*)pPairBuff,(uint2*)pPairBuffStartCurr,numBodies));
	BT_GPU_CHECK_ERROR("Kernel execution failed: bt_CudaFindOverlappingPairsD");

#if B_CUDA_USE_TEX
	BT_GPU_SAFE_CALL(cudaUnbindTexture(pAABBTex));
#endif
}
00387
00388
00389
// Host-side launcher for the findPairsLargeD kernel.
// When B_CUDA_USE_TEX is enabled, the AABB array covering both the regular
// and the large bodies (2 entries each) is bound to the pAABBTex texture for
// the duration of the launch and unbound again afterwards. The launch is
// sized for numBodies threads; 64 is passed to computeGridSize (presumably
// the block size).
void BT_GPU_PREF(findPairsLarge(bt3DGrid3F1U* pAABB, unsigned int* pHash, unsigned int* pCellStart, unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int numBodies, unsigned int numLarge))
{
#if B_CUDA_USE_TEX
	BT_GPU_SAFE_CALL(cudaBindTexture(0, pAABBTex, pAABB, (numBodies+numLarge) * 2 * sizeof(bt3DGrid3F1U)));
#endif

	int blockCount;
	int threadsPerBlock;
	BT_GPU_PREF(computeGridSize)(numBodies, 64, blockCount, threadsPerBlock);

	BT_GPU_EXECKERNEL(blockCount, threadsPerBlock, findPairsLargeD, (pAABB,(uint2*)pHash,(uint*)pCellStart,(uint*)pPairBuff,(uint2*)pPairBuffStartCurr,numBodies,numLarge));
	BT_GPU_CHECK_ERROR("Kernel execution failed: btCuda_findPairsLargeD");

#if B_CUDA_USE_TEX
	BT_GPU_SAFE_CALL(cudaUnbindTexture(pAABBTex));
#endif
}
00403
00404
00405
// Host-side launcher for the computePairCacheChangesD kernel.
// Obtains the launch configuration from computeGridSize (256 is passed as its
// second argument, presumably the block size), runs the kernel with
// pPairBuffStartCurr reinterpreted as a uint2 array, then checks for a
// kernel error.
void BT_GPU_PREF(computePairCacheChanges(unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int* pPairScan, bt3DGrid3F1U* pAABB, unsigned int numBodies))
{
	int blockCount;
	int threadsPerBlock;
	BT_GPU_PREF(computeGridSize)(numBodies, 256, blockCount, threadsPerBlock);

	BT_GPU_EXECKERNEL(blockCount, threadsPerBlock, computePairCacheChangesD, ((uint*)pPairBuff,(uint2*)pPairBuffStartCurr,(uint*)pPairScan,pAABB,numBodies));
	BT_GPU_CHECK_ERROR("Kernel execution failed: btCudaComputePairCacheChangesD");
}
00413
00414
00415
// Host-side launcher for the squeezeOverlappingPairBuffD kernel.
// Obtains the launch configuration from computeGridSize (256 is passed as its
// second argument, presumably the block size), runs the kernel with
// pPairBuffStartCurr reinterpreted as a uint2 array, then checks for a
// kernel error.
void BT_GPU_PREF(squeezeOverlappingPairBuff(unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int* pPairScan, unsigned int* pPairOut, bt3DGrid3F1U* pAABB, unsigned int numBodies))
{
	int blockCount;
	int threadsPerBlock;
	BT_GPU_PREF(computeGridSize)(numBodies, 256, blockCount, threadsPerBlock);

	BT_GPU_EXECKERNEL(blockCount, threadsPerBlock, squeezeOverlappingPairBuffD, ((uint*)pPairBuff,(uint2*)pPairBuffStartCurr,(uint*)pPairScan,(uint*)pPairOut,pAABB,numBodies));
	BT_GPU_CHECK_ERROR("Kernel execution failed: btCudaSqueezeOverlappingPairBuffD");
}
00423
00424
00425
00426 }
00427
00428
00429
00430