// Implementation of b3GpuSapBroadphase: a sweep-and-prune style broadphase that
// keeps AABBs both on the host (m_allAabbsCPU) and on an OpenCL device
// (m_allAabbsGPU) and produces a list of overlapping pairs (m_overlappingPairs).
//
// NOTE(review): this text appears to have been flattened onto a few very long
// lines and to have lost every "<...>" span (template arguments, the target of
// the first #include, and parts of several "for (...; i < ...)" loops). The code
// is left exactly as found; the summary below only describes what is still
// visible and marks the damaged regions.
//
// Definitions, in order of appearance:
//
//  * b3GpuSapBroadphase(ctx, device, q, kernelType)
//      Stores the context/device/queue, constructs the device arrays with
//      (ctx, q) and sets m_currentBuffer to -1. Compiles the sapCL program and
//      creates m_prefixScanFloat4 (left 0 when __APPLE__ is defined).
//      m_sapKernel is chosen from kernelType: it stays 0 for
//      B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU, and an unknown value falls back to
//      "computePairsKernelLocalSharedMemory" after reporting b3Error. The
//      two-array, sum-variance, flip-float, copy-aabbs and scatter kernels are
//      always created, and m_sorter is allocated with new.
//
//  * ~b3GpuSapBroadphase()
//      Deletes m_sorter and m_prefixScanFloat4 and calls clReleaseKernel on all
//      six kernel handles.
//      NOTE(review): m_sapKernel can be 0 here (CPU brute-force mode); confirm
//      that releasing a null kernel is acceptable on the targeted runtimes.
//
//  * TestAabbAgainstAabb2(min1, max1, min2, max2)
//      Returns false as soon as the boxes are separated on X, Z or Y. The
//      comparisons are strict, so boxes that only touch count as overlapping.
//
//  * FloatFlip(fl)
//      Reads the bits of fl as an unsigned int and XORs them with
//      -(int)(f >> 31) | 0x80000000, i.e. all bits when the sign bit is set and
//      only the sign bit otherwise (see the radix link next to the function).
//      NOTE(review): the bits are read through a pointer cast; verify this is
//      acceptable under the project's aliasing settings.
//
//  * init3dSap()
//      Does nothing unless m_currentBuffer < 0. Otherwise copies the AABBs to
//      the host, sets m_currentBuffer to 0 and, for each of the 3 axes, fills
//      the endpoint array of the current buffer with two entries per AABB
//      (key FloatFlip(min) - 1 with value 2*i, key FloatFlip(max) + 1 with value
//      2*i + 1). It then sorts each axis with m_sorter->executeHost and records
//      in m_objectMinMaxIndexCPU the sorted position of every object's min
//      endpoint (.x) and max endpoint (.y).
//
//  * b3PairCmp, operator==, operator<, operator> on b3Int4
//      Compare pairs by (x, y) only; the other components are not read.
//
//  * addedHostPairs, removedHostPairs, preAabbs
//      File-scope arrays used as scratch storage by
//      calculateOverlappingPairsHostIncremental3Sap.
//
//  * calculateOverlappingPairsHostIncremental3Sap()
//      Copies the current host AABBs into preAabbs, downloads the AABBs and the
//      pair list, flips m_currentBuffer (1 - m_currentBuffer), rebuilds and
//      sorts the per-axis endpoint keys, and collects pairs into
//      addedHostPairs / removedHostPairs by comparing the endpoint indices of
//      the current buffer with those of the other buffer. Removed pairs are
//      overwritten with INT_MAX, the list is sorted with b3PairCmp and shrunk;
//      added pairs not already present are appended; the result is uploaded.
//      NOTE(review): b3Assert(m_currentBuffer >= 0) precedes the
//      "if (m_currentBuffer < 0) return;" guard, and preAabbs is filled before
//      that guard is reached.
//      NOTE(review): the text between "#if 0" and the first overlap test, and
//      the text between the end of the axis loops and "removedPositions", is
//      missing from this copy; the description of the middle part is therefore
//      incomplete.
//
//  * calculateOverlappingPairsHost(maxPairs)
//      Downloads the AABBs, computes the axis with the largest variance of the
//      small-AABB centers, then tests every small/small and small/large
//      combination with TestAabbAgainstAabb2. Each hit stores the two
//      m_minIndices[3] values with the smaller one in .x. The list is truncated
//      to maxPairs and uploaded (or m_overlappingPairs is resized to 0 when
//      empty). The computed axis is not read by the loops visible here.
//      NOTE(review): the variance divides by (float)numRigidBodies, which is 0
//      when there are no small AABBs.
//
//  * reset()
//      Resizes the AABB arrays, both index mappings (host and device) and
//      m_pairCount to 0.
//
//  * calculateOverlappingPairs(maxPairs)
//      Delegates to calculateOverlappingPairsHost when m_sapKernel is 0.
//      Otherwise it selects an axis using m_prepareSumVarianceKernel and two
//      prefix scans (only when m_prefixScanFloat4 is set and there are small
//      AABBs), launches m_flipFloatKernel, sorts m_gpuSmallSortData with
//      m_sorter, launches m_scatterKernel, runs m_sap2Kernel for large-vs-small
//      AABBs and m_sapKernel for the sorted small AABBs, clamps numPairs to
//      maxPairs and finally resizes m_overlappingPairs to numPairs.
//      NOTE(review): m_dst/m_sum/m_sum2 are resized to numSmallAabbs + 128 when
//      their size differs from numSmallAabbs + 1, so that condition looks like
//      it stays true on later calls with the same count; confirm intent.
//      NOTE(review): part of the "#if 0" serialization section and the kernel
//      launch that follows it are missing from this copy.
//
//  * writeAabbsToGpu()
//      Uploads both index mappings and the AABB array from host to device.
//
//  * createLargeProxy(...) / createProxy(...)
//      Build a b3SapAabb from 4 components of aabbMin/aabbMax, store userPtr in
//      m_minIndices[3] and the current m_allAabbsCPU.size() in
//      m_signedMaxIndices[3], then append that index to the large (resp. small)
//      host mapping and the AABB to m_allAabbsCPU. collisionFilterGroup and
//      collisionFilterMask are not used in these bodies.
//
//  * getAabbBufferWS, getNumOverlap, getOverlappingPairBuffer,
//    getOverlappingPairsGPU, getSmallAabbIndicesGPU, getLargeAabbIndicesGPU
//      Plain accessors for the corresponding members.
bool searchIncremental3dSapOnGpu = true; #include #include "b3GpuSapBroadphase.h" #include "Bullet3Common/b3Vector3.h" #include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h" #include "Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.h" #include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" #include "kernels/sapKernels.h" #include "Bullet3Common/b3MinMax.h" #define B3_BROADPHASE_SAP_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl" /* b3OpenCLArray m_pairCount; b3OpenCLArray m_allAabbsGPU; b3AlignedObjectArray m_allAabbsCPU; virtual b3OpenCLArray& getAllAabbsGPU() { return m_allAabbsGPU; } virtual b3AlignedObjectArray& getAllAabbsCPU() { return m_allAabbsCPU; } b3OpenCLArray m_sum; b3OpenCLArray m_sum2; b3OpenCLArray m_dst; b3OpenCLArray m_smallAabbsMappingGPU; b3AlignedObjectArray m_smallAabbsMappingCPU; b3OpenCLArray m_largeAabbsMappingGPU; b3AlignedObjectArray m_largeAabbsMappingCPU; b3OpenCLArray m_overlappingPairs; //temporary gpu work memory b3OpenCLArray m_gpuSmallSortData; b3OpenCLArray m_gpuSmallSortedAabbs; class b3PrefixScanFloat4CL* m_prefixScanFloat4; */ b3GpuSapBroadphase::b3GpuSapBroadphase(cl_context ctx, cl_device_id device, cl_command_queue q, b3GpuSapKernelType kernelType) : m_context(ctx), m_device(device), m_queue(q), m_objectMinMaxIndexGPUaxis0(ctx, q), m_objectMinMaxIndexGPUaxis1(ctx, q), m_objectMinMaxIndexGPUaxis2(ctx, q), m_objectMinMaxIndexGPUaxis0prev(ctx, q), m_objectMinMaxIndexGPUaxis1prev(ctx, q), m_objectMinMaxIndexGPUaxis2prev(ctx, q), m_sortedAxisGPU0(ctx, q), m_sortedAxisGPU1(ctx, q), m_sortedAxisGPU2(ctx, q), m_sortedAxisGPU0prev(ctx, q), m_sortedAxisGPU1prev(ctx, q), m_sortedAxisGPU2prev(ctx, q), m_addedHostPairsGPU(ctx, q), m_removedHostPairsGPU(ctx, q), m_addedCountGPU(ctx, q), m_removedCountGPU(ctx, q), m_currentBuffer(-1), m_pairCount(ctx, q), m_allAabbsGPU(ctx, q), m_sum(ctx, q), m_sum2(ctx, q), m_dst(ctx, q), m_smallAabbsMappingGPU(ctx, q), m_largeAabbsMappingGPU(ctx, q), m_overlappingPairs(ctx, q), 
m_gpuSmallSortData(ctx, q), m_gpuSmallSortedAabbs(ctx, q) { const char* sapSrc = sapCL; cl_int errNum = 0; b3Assert(m_context); b3Assert(m_device); cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context, m_device, sapSrc, &errNum, "", B3_BROADPHASE_SAP_PATH); b3Assert(errNum == CL_SUCCESS); b3Assert(errNum == CL_SUCCESS); #ifndef __APPLE__ m_prefixScanFloat4 = new b3PrefixScanFloat4CL(m_context, m_device, m_queue); #else m_prefixScanFloat4 = 0; #endif m_sapKernel = 0; switch (kernelType) { case B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU: { m_sapKernel = 0; break; } case B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU: { m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelBruteForce", &errNum, sapProg); break; } case B3_GPU_SAP_KERNEL_ORIGINAL: { m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelOriginal", &errNum, sapProg); break; } case B3_GPU_SAP_KERNEL_BARRIER: { m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelBarrier", &errNum, sapProg); break; } case B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY: { m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelLocalSharedMemory", &errNum, sapProg); break; } default: { m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelLocalSharedMemory", &errNum, sapProg); b3Error("Unknown 3D GPU SAP provided, fallback to computePairsKernelLocalSharedMemory"); } }; m_sap2Kernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelTwoArrays", &errNum, sapProg); b3Assert(errNum == CL_SUCCESS); m_prepareSumVarianceKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "prepareSumVarianceKernel", &errNum, sapProg); b3Assert(errNum == CL_SUCCESS); m_flipFloatKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, 
"flipFloatKernel", &errNum, sapProg); m_copyAabbsKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "copyAabbsKernel", &errNum, sapProg); m_scatterKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "scatterKernel", &errNum, sapProg); m_sorter = new b3RadixSort32CL(m_context, m_device, m_queue); } b3GpuSapBroadphase::~b3GpuSapBroadphase() { delete m_sorter; delete m_prefixScanFloat4; clReleaseKernel(m_scatterKernel); clReleaseKernel(m_flipFloatKernel); clReleaseKernel(m_copyAabbsKernel); clReleaseKernel(m_sapKernel); clReleaseKernel(m_sap2Kernel); clReleaseKernel(m_prepareSumVarianceKernel); } /// conservative test for overlap between two aabbs static bool TestAabbAgainstAabb2(const b3Vector3& aabbMin1, const b3Vector3& aabbMax1, const b3Vector3& aabbMin2, const b3Vector3& aabbMax2) { bool overlap = true; overlap = (aabbMin1.getX() > aabbMax2.getX() || aabbMax1.getX() < aabbMin2.getX()) ? false : overlap; overlap = (aabbMin1.getZ() > aabbMax2.getZ() || aabbMax1.getZ() < aabbMin2.getZ()) ? false : overlap; overlap = (aabbMin1.getY() > aabbMax2.getY() || aabbMax1.getY() < aabbMin2.getY()) ? 
false : overlap; return overlap; } //http://stereopsis.com/radix.html static unsigned int FloatFlip(float fl) { unsigned int f = *(unsigned int*)&fl; unsigned int mask = -(int)(f >> 31) | 0x80000000; return f ^ mask; }; void b3GpuSapBroadphase::init3dSap() { if (m_currentBuffer < 0) { m_allAabbsGPU.copyToHost(m_allAabbsCPU); m_currentBuffer = 0; for (int axis = 0; axis < 3; axis++) { for (int buf = 0; buf < 2; buf++) { int totalNumAabbs = m_allAabbsCPU.size(); int numEndPoints = 2 * totalNumAabbs; m_sortedAxisCPU[axis][buf].resize(numEndPoints); if (buf == m_currentBuffer) { for (int i = 0; i < totalNumAabbs; i++) { m_sortedAxisCPU[axis][buf][i * 2].m_key = FloatFlip(m_allAabbsCPU[i].m_min[axis]) - 1; m_sortedAxisCPU[axis][buf][i * 2].m_value = i * 2; m_sortedAxisCPU[axis][buf][i * 2 + 1].m_key = FloatFlip(m_allAabbsCPU[i].m_max[axis]) + 1; m_sortedAxisCPU[axis][buf][i * 2 + 1].m_value = i * 2 + 1; } } } } for (int axis = 0; axis < 3; axis++) { m_sorter->executeHost(m_sortedAxisCPU[axis][m_currentBuffer]); } for (int axis = 0; axis < 3; axis++) { //int totalNumAabbs = m_allAabbsCPU.size(); int numEndPoints = m_sortedAxisCPU[axis][m_currentBuffer].size(); m_objectMinMaxIndexCPU[axis][m_currentBuffer].resize(numEndPoints); for (int i = 0; i < numEndPoints; i++) { int destIndex = m_sortedAxisCPU[axis][m_currentBuffer][i].m_value; int newDest = destIndex / 2; if (destIndex & 1) { m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].y = i; } else { m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].x = i; } } } } } static bool b3PairCmp(const b3Int4& p, const b3Int4& q) { return ((p.x < q.x) || ((p.x == q.x) && (p.y < q.y))); } static bool operator==(const b3Int4& a, const b3Int4& b) { return a.x == b.x && a.y == b.y; }; static bool operator<(const b3Int4& a, const b3Int4& b) { return a.x < b.x || (a.x == b.x && a.y < b.y); }; static bool operator>(const b3Int4& a, const b3Int4& b) { return a.x > b.x || (a.x == b.x && a.y > b.y); }; b3AlignedObjectArray 
addedHostPairs; b3AlignedObjectArray removedHostPairs; b3AlignedObjectArray preAabbs; void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap() { //static int framepje = 0; //printf("framepje=%d\n",framepje++); B3_PROFILE("calculateOverlappingPairsHostIncremental3Sap"); addedHostPairs.resize(0); removedHostPairs.resize(0); b3Assert(m_currentBuffer >= 0); { preAabbs.resize(m_allAabbsCPU.size()); for (int i = 0; i < preAabbs.size(); i++) { preAabbs[i] = m_allAabbsCPU[i]; } } if (m_currentBuffer < 0) return; { B3_PROFILE("m_allAabbsGPU.copyToHost"); m_allAabbsGPU.copyToHost(m_allAabbsCPU); } b3AlignedObjectArray allPairs; { B3_PROFILE("m_overlappingPairs.copyToHost"); m_overlappingPairs.copyToHost(allPairs); } if (0) { { printf("ab[40].min=%f,%f,%f,ab[40].max=%f,%f,%f\n", m_allAabbsCPU[40].m_min[0], m_allAabbsCPU[40].m_min[1], m_allAabbsCPU[40].m_min[2], m_allAabbsCPU[40].m_max[0], m_allAabbsCPU[40].m_max[1], m_allAabbsCPU[40].m_max[2]); } { printf("ab[53].min=%f,%f,%f,ab[53].max=%f,%f,%f\n", m_allAabbsCPU[53].m_min[0], m_allAabbsCPU[53].m_min[1], m_allAabbsCPU[53].m_min[2], m_allAabbsCPU[53].m_max[0], m_allAabbsCPU[53].m_max[1], m_allAabbsCPU[53].m_max[2]); } { b3Int4 newPair; newPair.x = 40; newPair.y = 53; int index = allPairs.findBinarySearch(newPair); printf("hasPair(40,53)=%d out of %d\n", index, allPairs.size()); { int overlap = TestAabbAgainstAabb2((const b3Vector3&)m_allAabbsCPU[40].m_min, (const b3Vector3&)m_allAabbsCPU[40].m_max, (const b3Vector3&)m_allAabbsCPU[53].m_min, (const b3Vector3&)m_allAabbsCPU[53].m_max); printf("overlap=%d\n", overlap); } if (preAabbs.size()) { int prevOverlap = TestAabbAgainstAabb2((const b3Vector3&)preAabbs[40].m_min, (const b3Vector3&)preAabbs[40].m_max, (const b3Vector3&)preAabbs[53].m_min, (const b3Vector3&)preAabbs[53].m_max); printf("prevoverlap=%d\n", prevOverlap); } else { printf("unknown prevoverlap\n"); } } } if (0) { for (int i = 0; i < m_allAabbsCPU.size(); i++) { //printf("aabb[%d] min=%f,%f,%f 
max=%f,%f,%f\n",i,m_allAabbsCPU[i].m_min[0],m_allAabbsCPU[i].m_min[1],m_allAabbsCPU[i].m_min[2], m_allAabbsCPU[i].m_max[0],m_allAabbsCPU[i].m_max[1],m_allAabbsCPU[i].m_max[2]); } for (int axis = 0; axis < 3; axis++) { for (int buf = 0; buf < 2; buf++) { b3Assert(m_sortedAxisCPU[axis][buf].size() == m_allAabbsCPU.size() * 2); } } } m_currentBuffer = 1 - m_currentBuffer; int totalNumAabbs = m_allAabbsCPU.size(); { B3_PROFILE("assign m_sortedAxisCPU(FloatFlip)"); for (int i = 0; i < totalNumAabbs; i++) { unsigned int keyMin[3]; unsigned int keyMax[3]; for (int axis = 0; axis < 3; axis++) { float vmin = m_allAabbsCPU[i].m_min[axis]; float vmax = m_allAabbsCPU[i].m_max[axis]; keyMin[axis] = FloatFlip(vmin); keyMax[axis] = FloatFlip(vmax); m_sortedAxisCPU[axis][m_currentBuffer][i * 2].m_key = keyMin[axis] - 1; m_sortedAxisCPU[axis][m_currentBuffer][i * 2].m_value = i * 2; m_sortedAxisCPU[axis][m_currentBuffer][i * 2 + 1].m_key = keyMax[axis] + 1; m_sortedAxisCPU[axis][m_currentBuffer][i * 2 + 1].m_value = i * 2 + 1; } //printf("aabb[%d] min=%u,%u,%u max %u,%u,%u\n", i,keyMin[0],keyMin[1],keyMin[2],keyMax[0],keyMax[1],keyMax[2]); } } { B3_PROFILE("sort m_sortedAxisCPU"); for (int axis = 0; axis < 3; axis++) m_sorter->executeHost(m_sortedAxisCPU[axis][m_currentBuffer]); } #if 0 if (0) { for (int axis=0;axis<3;axis++) { //printf("axis %d\n",axis); for (int i=0;i m_objectMinMaxIndexCPU[ax][m_currentBuffer][otherIndex].y) || (m_objectMinMaxIndexCPU[ax][m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][m_currentBuffer][otherIndex].x)) overlap = false; } // b3Assert(overlap2==overlap); bool prevOverlap = true; for (int ax = 0; ax < 3; ax++) { if ((m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][otherIndex].y) || (m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][otherIndex].x)) prevOverlap = false; } //b3Assert(overlap==overlap2); if (dmin < 0) { if (overlap && 
!prevOverlap) { //add a pair b3Int4 newPair; if (i <= otherIndex) { newPair.x = i; newPair.y = otherIndex; } else { newPair.x = otherIndex; newPair.y = i; } addedHostPairs.push_back(newPair); } } else { if (!overlap && prevOverlap) { //remove a pair b3Int4 removedPair; if (i <= otherIndex) { removedPair.x = i; removedPair.y = otherIndex; } else { removedPair.x = otherIndex; removedPair.y = i; } removedHostPairs.push_back(removedPair); } } //otherisMax } //if (dmin<0) } //if (otherIndex!=i) } //for (int j= } if (dmax != 0) { int stepMax = dmax < 0 ? -1 : 1; for (int j = prevMaxIndex; j != curMaxIndex; j += stepMax) { int otherIndex2 = m_sortedAxisCPU[axis][otherbuffer][j].y; int otherIndex = otherIndex2 / 2; if (otherIndex != i) { //bool otherIsMin = ((otherIndex2&1)==0); //if (otherIsMin) { //bool overlap = TestAabbAgainstAabb2((const b3Vector3&)m_allAabbsCPU[i].m_min, (const b3Vector3&)m_allAabbsCPU[i].m_max,(const b3Vector3&)m_allAabbsCPU[otherIndex].m_min,(const b3Vector3&)m_allAabbsCPU[otherIndex].m_max); //bool prevOverlap = TestAabbAgainstAabb2((const b3Vector3&)preAabbs[i].m_min, (const b3Vector3&)preAabbs[i].m_max,(const b3Vector3&)preAabbs[otherIndex].m_min,(const b3Vector3&)preAabbs[otherIndex].m_max); bool overlap = true; for (int ax = 0; ax < 3; ax++) { if ((m_objectMinMaxIndexCPU[ax][m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][m_currentBuffer][otherIndex].y) || (m_objectMinMaxIndexCPU[ax][m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][m_currentBuffer][otherIndex].x)) overlap = false; } //b3Assert(overlap2==overlap); bool prevOverlap = true; for (int ax = 0; ax < 3; ax++) { if ((m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][otherIndex].y) || (m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][otherIndex].x)) prevOverlap = false; } if (dmax > 0) { if (overlap && !prevOverlap) { //add a pair b3Int4 newPair; if (i <= otherIndex) { 
newPair.x = i; newPair.y = otherIndex; } else { newPair.x = otherIndex; newPair.y = i; } addedHostPairs.push_back(newPair); } } else { if (!overlap && prevOverlap) { //if (otherIndex2&1==0) -> min? //remove a pair b3Int4 removedPair; if (i <= otherIndex) { removedPair.x = i; removedPair.y = otherIndex; } else { removedPair.x = otherIndex; removedPair.y = i; } removedHostPairs.push_back(removedPair); } } } //if (dmin<0) } //if (otherIndex!=i) } //for (int j= } } //for (int otherbuffer } //for (int axis=0; } //for (int i=0;i removedPositions; { B3_PROFILE("actual removing"); for (int i = 0; i < removedHostPairs.size(); i++) { b3Int4 removedPair = removedHostPairs[i]; if ((removedPair.x != prevPair.x) || (removedPair.y != prevPair.y)) { int index1 = allPairs.findBinarySearch(removedPair); //#ifdef _DEBUG int index2 = allPairs.findLinearSearch(removedPair); b3Assert(index1 == index2); //b3Assert(index1!=allPairs.size()); if (index1 < allPairs.size()) //#endif//_DEBUG { uniqueRemovedPairs++; removedPositions.push_back(index1); { //printf("framepje(%d) remove pair(%d):%d,%d\n",framepje,i,removedPair.x,removedPair.y); } } } prevPair = removedPair; } if (uniqueRemovedPairs) { for (int i = 0; i < removedPositions.size(); i++) { allPairs[removedPositions[i]].x = INT_MAX; allPairs[removedPositions[i]].y = INT_MAX; } allPairs.quickSort(b3PairCmp); allPairs.resize(allPairs.size() - uniqueRemovedPairs); } } //if (uniqueRemovedPairs) // printf("uniqueRemovedPairs=%d\n",uniqueRemovedPairs); //printf("removedHostPairs.size = %d\n",removedHostPairs.size()); prevPair.x = -1; prevPair.y = -1; int uniqueAddedPairs = 0; b3AlignedObjectArray actualAddedPairs; { B3_PROFILE("actual adding"); for (int i = 0; i < addedHostPairs.size(); i++) { b3Int4 newPair = addedHostPairs[i]; if ((newPair.x != prevPair.x) || (newPair.y != prevPair.y)) { //#ifdef _DEBUG int index1 = allPairs.findBinarySearch(newPair); int index2 = allPairs.findLinearSearch(newPair); b3Assert(index1 == index2); 
b3Assert(index1 == allPairs.size()); if (index1 != allPairs.size()) { printf("??\n"); } if (index1 == allPairs.size()) //#endif //_DEBUG { uniqueAddedPairs++; actualAddedPairs.push_back(newPair); } } prevPair = newPair; } for (int i = 0; i < actualAddedPairs.size(); i++) { //printf("framepje (%d), new pair(%d):%d,%d\n",framepje,i,actualAddedPairs[i].x,actualAddedPairs[i].y); allPairs.push_back(actualAddedPairs[i]); } } //if (uniqueAddedPairs) // printf("uniqueAddedPairs=%d\n", uniqueAddedPairs); { B3_PROFILE("m_overlappingPairs.copyFromHost"); m_overlappingPairs.copyFromHost(allPairs); } } void b3GpuSapBroadphase::calculateOverlappingPairsHost(int maxPairs) { //test // if (m_currentBuffer>=0) // return calculateOverlappingPairsHostIncremental3Sap(); b3Assert(m_allAabbsCPU.size() == m_allAabbsGPU.size()); m_allAabbsGPU.copyToHost(m_allAabbsCPU); int axis = 0; { B3_PROFILE("CPU compute best variance axis"); b3Vector3 s = b3MakeVector3(0, 0, 0), s2 = b3MakeVector3(0, 0, 0); int numRigidBodies = m_smallAabbsMappingCPU.size(); for (int i = 0; i < numRigidBodies; i++) { b3SapAabb aabb = this->m_allAabbsCPU[m_smallAabbsMappingCPU[i]]; b3Vector3 maxAabb = b3MakeVector3(aabb.m_max[0], aabb.m_max[1], aabb.m_max[2]); b3Vector3 minAabb = b3MakeVector3(aabb.m_min[0], aabb.m_min[1], aabb.m_min[2]); b3Vector3 centerAabb = (maxAabb + minAabb) * 0.5f; s += centerAabb; s2 += centerAabb * centerAabb; } b3Vector3 v = s2 - (s * s) / (float)numRigidBodies; if (v[1] > v[0]) axis = 1; if (v[2] > v[axis]) axis = 2; } b3AlignedObjectArray hostPairs; { int numSmallAabbs = m_smallAabbsMappingCPU.size(); for (int i = 0; i < numSmallAabbs; i++) { b3SapAabb smallAabbi = m_allAabbsCPU[m_smallAabbsMappingCPU[i]]; //float reference = smallAabbi.m_max[axis]; for (int j = i + 1; j < numSmallAabbs; j++) { b3SapAabb smallAabbj = m_allAabbsCPU[m_smallAabbsMappingCPU[j]]; if (TestAabbAgainstAabb2((b3Vector3&)smallAabbi.m_min, (b3Vector3&)smallAabbi.m_max, (b3Vector3&)smallAabbj.m_min, 
(b3Vector3&)smallAabbj.m_max)) { b3Int4 pair; int a = smallAabbi.m_minIndices[3]; int b = smallAabbj.m_minIndices[3]; if (a <= b) { pair.x = a; //store the original index in the unsorted aabb array pair.y = b; } else { pair.x = b; //store the original index in the unsorted aabb array pair.y = a; } hostPairs.push_back(pair); } } } } { int numSmallAabbs = m_smallAabbsMappingCPU.size(); for (int i = 0; i < numSmallAabbs; i++) { b3SapAabb smallAabbi = m_allAabbsCPU[m_smallAabbsMappingCPU[i]]; //float reference = smallAabbi.m_max[axis]; int numLargeAabbs = m_largeAabbsMappingCPU.size(); for (int j = 0; j < numLargeAabbs; j++) { b3SapAabb largeAabbj = m_allAabbsCPU[m_largeAabbsMappingCPU[j]]; if (TestAabbAgainstAabb2((b3Vector3&)smallAabbi.m_min, (b3Vector3&)smallAabbi.m_max, (b3Vector3&)largeAabbj.m_min, (b3Vector3&)largeAabbj.m_max)) { b3Int4 pair; int a = largeAabbj.m_minIndices[3]; int b = smallAabbi.m_minIndices[3]; if (a <= b) { pair.x = a; pair.y = b; //store the original index in the unsorted aabb array } else { pair.x = b; pair.y = a; //store the original index in the unsorted aabb array } hostPairs.push_back(pair); } } } } if (hostPairs.size() > maxPairs) { hostPairs.resize(maxPairs); } if (hostPairs.size()) { m_overlappingPairs.copyFromHost(hostPairs); } else { m_overlappingPairs.resize(0); } //init3dSap(); } void b3GpuSapBroadphase::reset() { m_allAabbsGPU.resize(0); m_allAabbsCPU.resize(0); m_smallAabbsMappingGPU.resize(0); m_smallAabbsMappingCPU.resize(0); m_pairCount.resize(0); m_largeAabbsMappingGPU.resize(0); m_largeAabbsMappingCPU.resize(0); } void b3GpuSapBroadphase::calculateOverlappingPairs(int maxPairs) { if (m_sapKernel == 0) { calculateOverlappingPairsHost(maxPairs); return; } //if (m_currentBuffer>=0) // return calculateOverlappingPairsHostIncremental3Sap(); //calculateOverlappingPairsHost(maxPairs); B3_PROFILE("GPU 1-axis SAP calculateOverlappingPairs"); int axis = 0; { //bool syncOnHost = false; int numSmallAabbs = 
m_smallAabbsMappingCPU.size(); if (m_prefixScanFloat4 && numSmallAabbs) { B3_PROFILE("GPU compute best variance axis"); if (m_dst.size() != (numSmallAabbs + 1)) { m_dst.resize(numSmallAabbs + 128); m_sum.resize(numSmallAabbs + 128); m_sum2.resize(numSmallAabbs + 128); m_sum.at(numSmallAabbs) = b3MakeVector3(0, 0, 0); //slow? m_sum2.at(numSmallAabbs) = b3MakeVector3(0, 0, 0); //slow? } b3LauncherCL launcher(m_queue, m_prepareSumVarianceKernel, "m_prepareSumVarianceKernel"); launcher.setBuffer(m_allAabbsGPU.getBufferCL()); launcher.setBuffer(m_smallAabbsMappingGPU.getBufferCL()); launcher.setBuffer(m_sum.getBufferCL()); launcher.setBuffer(m_sum2.getBufferCL()); launcher.setConst(numSmallAabbs); int num = numSmallAabbs; launcher.launch1D(num); b3Vector3 s; b3Vector3 s2; m_prefixScanFloat4->execute(m_sum, m_dst, numSmallAabbs + 1, &s); m_prefixScanFloat4->execute(m_sum2, m_dst, numSmallAabbs + 1, &s2); b3Vector3 v = s2 - (s * s) / (float)numSmallAabbs; if (v[1] > v[0]) axis = 1; if (v[2] > v[axis]) axis = 2; } m_gpuSmallSortData.resize(numSmallAabbs); #if 1 if (m_smallAabbsMappingGPU.size()) { B3_PROFILE("flipFloatKernel"); b3BufferInfoCL bInfo[] = { b3BufferInfoCL(m_allAabbsGPU.getBufferCL(), true), b3BufferInfoCL(m_smallAabbsMappingGPU.getBufferCL(), true), b3BufferInfoCL(m_gpuSmallSortData.getBufferCL())}; b3LauncherCL launcher(m_queue, m_flipFloatKernel, "m_flipFloatKernel"); launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); launcher.setConst(numSmallAabbs); launcher.setConst(axis); int num = numSmallAabbs; launcher.launch1D(num); clFinish(m_queue); } if (m_gpuSmallSortData.size()) { B3_PROFILE("gpu radix sort"); m_sorter->execute(m_gpuSmallSortData); clFinish(m_queue); } m_gpuSmallSortedAabbs.resize(numSmallAabbs); if (numSmallAabbs) { B3_PROFILE("scatterKernel"); b3BufferInfoCL bInfo[] = { b3BufferInfoCL(m_allAabbsGPU.getBufferCL(), true), b3BufferInfoCL(m_smallAabbsMappingGPU.getBufferCL(), true), 
b3BufferInfoCL(m_gpuSmallSortData.getBufferCL(), true), b3BufferInfoCL(m_gpuSmallSortedAabbs.getBufferCL())}; b3LauncherCL launcher(m_queue, m_scatterKernel, "m_scatterKernel "); launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); launcher.setConst(numSmallAabbs); int num = numSmallAabbs; launcher.launch1D(num); clFinish(m_queue); } m_overlappingPairs.resize(maxPairs); m_pairCount.resize(0); m_pairCount.push_back(0); int numPairs = 0; { int numLargeAabbs = m_largeAabbsMappingGPU.size(); if (numLargeAabbs && numSmallAabbs) { //@todo B3_PROFILE("sap2Kernel"); b3BufferInfoCL bInfo[] = { b3BufferInfoCL(m_allAabbsGPU.getBufferCL()), b3BufferInfoCL(m_largeAabbsMappingGPU.getBufferCL()), b3BufferInfoCL(m_smallAabbsMappingGPU.getBufferCL()), b3BufferInfoCL(m_overlappingPairs.getBufferCL()), b3BufferInfoCL(m_pairCount.getBufferCL())}; b3LauncherCL launcher(m_queue, m_sap2Kernel, "m_sap2Kernel"); launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); launcher.setConst(numLargeAabbs); launcher.setConst(numSmallAabbs); launcher.setConst(axis); launcher.setConst(maxPairs); //@todo: use actual maximum work item sizes of the device instead of hardcoded values launcher.launch2D(numLargeAabbs, numSmallAabbs, 4, 64); numPairs = m_pairCount.at(0); if (numPairs > maxPairs) { b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs); numPairs = maxPairs; } } } if (m_gpuSmallSortedAabbs.size()) { B3_PROFILE("sapKernel"); b3BufferInfoCL bInfo[] = {b3BufferInfoCL(m_gpuSmallSortedAabbs.getBufferCL()), b3BufferInfoCL(m_overlappingPairs.getBufferCL()), b3BufferInfoCL(m_pairCount.getBufferCL())}; b3LauncherCL launcher(m_queue, m_sapKernel, "m_sapKernel"); launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL)); launcher.setConst(numSmallAabbs); launcher.setConst(axis); launcher.setConst(maxPairs); int num = numSmallAabbs; #if 0 int buffSize = launcher.getSerializationBufferSize(); unsigned char* buf = new unsigned 
char[buffSize+sizeof(int)]; for (int i=0;i maxPairs) { b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs); numPairs = maxPairs; m_pairCount.resize(0); m_pairCount.push_back(maxPairs); } } #else int numPairs = 0; b3LauncherCL launcher(m_queue, m_sapKernel); const char* fileName = "m_sapKernelArgs.bin"; FILE* f = fopen(fileName, "rb"); if (f) { int sizeInBytes = 0; if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET)) { printf("error, cannot get file size\n"); exit(0); } unsigned char* buf = (unsigned char*)malloc(sizeInBytes); fread(buf, sizeInBytes, 1, f); int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes, m_context); int num = *(int*)&buf[serializedBytes]; launcher.launch1D(num); b3OpenCLArray pairCount(m_context, m_queue); int numElements = launcher.m_arrays[2]->size() / sizeof(int); pairCount.setFromOpenCLBuffer(launcher.m_arrays[2]->getBufferCL(), numElements); numPairs = pairCount.at(0); //printf("overlapping pairs = %d\n",numPairs); b3AlignedObjectArray hostOoverlappingPairs; b3OpenCLArray tmpGpuPairs(m_context, m_queue); tmpGpuPairs.setFromOpenCLBuffer(launcher.m_arrays[1]->getBufferCL(), numPairs); tmpGpuPairs.copyToHost(hostOoverlappingPairs); m_overlappingPairs.copyFromHost(hostOoverlappingPairs); //printf("hello %d\n", m_overlappingPairs.size()); free(buf); fclose(f); } else { printf("error: cannot find file %s\n", fileName); } clFinish(m_queue); #endif m_overlappingPairs.resize(numPairs); } //B3_PROFILE("GPU_RADIX SORT"); //init3dSap(); } void b3GpuSapBroadphase::writeAabbsToGpu() { m_smallAabbsMappingGPU.copyFromHost(m_smallAabbsMappingCPU); m_largeAabbsMappingGPU.copyFromHost(m_largeAabbsMappingCPU); m_allAabbsGPU.copyFromHost(m_allAabbsCPU); //might not be necessary, the 'setupGpuAabbsFull' already takes care of this } void b3GpuSapBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int 
collisionFilterMask) { int index = userPtr; b3SapAabb aabb; for (int i = 0; i < 4; i++) { aabb.m_min[i] = aabbMin[i]; aabb.m_max[i] = aabbMax[i]; } aabb.m_minIndices[3] = index; aabb.m_signedMaxIndices[3] = m_allAabbsCPU.size(); m_largeAabbsMappingCPU.push_back(m_allAabbsCPU.size()); m_allAabbsCPU.push_back(aabb); } void b3GpuSapBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask) { int index = userPtr; b3SapAabb aabb; for (int i = 0; i < 4; i++) { aabb.m_min[i] = aabbMin[i]; aabb.m_max[i] = aabbMax[i]; } aabb.m_minIndices[3] = index; aabb.m_signedMaxIndices[3] = m_allAabbsCPU.size(); m_smallAabbsMappingCPU.push_back(m_allAabbsCPU.size()); m_allAabbsCPU.push_back(aabb); } cl_mem b3GpuSapBroadphase::getAabbBufferWS() { return m_allAabbsGPU.getBufferCL(); } int b3GpuSapBroadphase::getNumOverlap() { return m_overlappingPairs.size(); } cl_mem b3GpuSapBroadphase::getOverlappingPairBuffer() { return m_overlappingPairs.getBufferCL(); } b3OpenCLArray& b3GpuSapBroadphase::getOverlappingPairsGPU() { return m_overlappingPairs; } b3OpenCLArray& b3GpuSapBroadphase::getSmallAabbIndicesGPU() { return m_smallAabbsMappingGPU; } b3OpenCLArray& b3GpuSapBroadphase::getLargeAabbIndicesGPU() { return m_largeAabbsMappingGPU; }