virtualx-engine/thirdparty/bullet/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp
2019-01-07 12:30:35 +01:00

646 lines
18 KiB
C++

#include "b3RadixSort32CL.h"
#include "b3LauncherCL.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "b3PrefixScanCL.h"
#include "b3FillCL.h"
#define RADIXSORT32_PATH "src/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32Kernels.cl"
#include "kernels/RadixSort32KernelsCL.h"
b3RadixSort32CL::b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity)
: m_commandQueue(queue)
{
b3OpenCLDeviceInfo info;
b3OpenCLUtils::getDeviceInfo(device, &info);
m_deviceCPU = (info.m_deviceType & CL_DEVICE_TYPE_CPU) != 0;
m_workBuffer1 = new b3OpenCLArray<unsigned int>(ctx, queue);
m_workBuffer2 = new b3OpenCLArray<unsigned int>(ctx, queue);
m_workBuffer3 = new b3OpenCLArray<b3SortData>(ctx, queue);
m_workBuffer3a = new b3OpenCLArray<unsigned int>(ctx, queue);
m_workBuffer4 = new b3OpenCLArray<b3SortData>(ctx, queue);
m_workBuffer4a = new b3OpenCLArray<unsigned int>(ctx, queue);
if (initialCapacity > 0)
{
m_workBuffer1->resize(initialCapacity);
m_workBuffer3->resize(initialCapacity);
m_workBuffer3a->resize(initialCapacity);
m_workBuffer4->resize(initialCapacity);
m_workBuffer4a->resize(initialCapacity);
}
m_scan = new b3PrefixScanCL(ctx, device, queue);
m_fill = new b3FillCL(ctx, device, queue);
const char* additionalMacros = "";
cl_int pErrNum;
const char* kernelSource = radixSort32KernelsCL;
cl_program sortProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, kernelSource, &pErrNum, additionalMacros, RADIXSORT32_PATH);
b3Assert(sortProg);
m_streamCountSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "StreamCountSortDataKernel", &pErrNum, sortProg, additionalMacros);
b3Assert(m_streamCountSortDataKernel);
m_streamCountKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "StreamCountKernel", &pErrNum, sortProg, additionalMacros);
b3Assert(m_streamCountKernel);
if (m_deviceCPU)
{
m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterSortDataKernelSerial", &pErrNum, sortProg, additionalMacros);
b3Assert(m_sortAndScatterSortDataKernel);
m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterKernelSerial", &pErrNum, sortProg, additionalMacros);
b3Assert(m_sortAndScatterKernel);
}
else
{
m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterSortDataKernel", &pErrNum, sortProg, additionalMacros);
b3Assert(m_sortAndScatterSortDataKernel);
m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterKernel", &pErrNum, sortProg, additionalMacros);
b3Assert(m_sortAndScatterKernel);
}
m_prefixScanKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "PrefixScanKernel", &pErrNum, sortProg, additionalMacros);
b3Assert(m_prefixScanKernel);
}
b3RadixSort32CL::~b3RadixSort32CL()
{
delete m_scan;
delete m_fill;
delete m_workBuffer1;
delete m_workBuffer2;
delete m_workBuffer3;
delete m_workBuffer3a;
delete m_workBuffer4;
delete m_workBuffer4a;
clReleaseKernel(m_streamCountSortDataKernel);
clReleaseKernel(m_streamCountKernel);
clReleaseKernel(m_sortAndScatterSortDataKernel);
clReleaseKernel(m_sortAndScatterKernel);
clReleaseKernel(m_prefixScanKernel);
}
void b3RadixSort32CL::executeHost(b3AlignedObjectArray<b3SortData>& inout, int sortBits /* = 32 */)
{
int n = inout.size();
const int BITS_PER_PASS = 8;
const int NUM_TABLES = (1 << BITS_PER_PASS);
int tables[NUM_TABLES];
int counter[NUM_TABLES];
b3SortData* src = &inout[0];
b3AlignedObjectArray<b3SortData> workbuffer;
workbuffer.resize(inout.size());
b3SortData* dst = &workbuffer[0];
int count = 0;
for (int startBit = 0; startBit < sortBits; startBit += BITS_PER_PASS)
{
for (int i = 0; i < NUM_TABLES; i++)
{
tables[i] = 0;
}
for (int i = 0; i < n; i++)
{
int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES - 1);
tables[tableIdx]++;
}
//#define TEST
#ifdef TEST
printf("histogram size=%d\n", NUM_TABLES);
for (int i = 0; i < NUM_TABLES; i++)
{
if (tables[i] != 0)
{
printf("tables[%d]=%d]\n", i, tables[i]);
}
}
#endif //TEST \
// prefix scan
int sum = 0;
for (int i = 0; i < NUM_TABLES; i++)
{
int iData = tables[i];
tables[i] = sum;
sum += iData;
counter[i] = 0;
}
// distribute
for (int i = 0; i < n; i++)
{
int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES - 1);
dst[tables[tableIdx] + counter[tableIdx]] = src[i];
counter[tableIdx]++;
}
b3Swap(src, dst);
count++;
}
if (count & 1)
{
b3Assert(0); //need to copy
}
}
void b3RadixSort32CL::executeHost(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits /* = 32 */)
{
b3AlignedObjectArray<b3SortData> inout;
keyValuesInOut.copyToHost(inout);
executeHost(inout, sortBits);
keyValuesInOut.copyFromHost(inout);
}
void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysIn, b3OpenCLArray<unsigned int>& keysOut, b3OpenCLArray<unsigned int>& valuesIn,
b3OpenCLArray<unsigned int>& valuesOut, int n, int sortBits)
{
}
//#define DEBUG_RADIXSORT
//#define DEBUG_RADIXSORT2
void b3RadixSort32CL::execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits /* = 32 */)
{
int originalSize = keyValuesInOut.size();
int workingSize = originalSize;
int dataAlignment = DATA_ALIGNMENT;
#ifdef DEBUG_RADIXSORT2
b3AlignedObjectArray<b3SortData> test2;
keyValuesInOut.copyToHost(test2);
printf("numElem = %d\n", test2.size());
for (int i = 0; i < test2.size(); i++)
{
printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
}
#endif //DEBUG_RADIXSORT2
b3OpenCLArray<b3SortData>* src = 0;
if (workingSize % dataAlignment)
{
workingSize += dataAlignment - (workingSize % dataAlignment);
m_workBuffer4->copyFromOpenCLArray(keyValuesInOut);
m_workBuffer4->resize(workingSize);
b3SortData fillValue;
fillValue.m_key = 0xffffffff;
fillValue.m_value = 0xffffffff;
#define USE_BTFILL
#ifdef USE_BTFILL
m_fill->execute((b3OpenCLArray<b3Int2>&)*m_workBuffer4, (b3Int2&)fillValue, workingSize - originalSize, originalSize);
#else
//fill the remaining bits (very slow way, todo: fill on GPU/OpenCL side)
for (int i = originalSize; i < workingSize; i++)
{
m_workBuffer4->copyFromHostPointer(&fillValue, 1, i);
}
#endif //USE_BTFILL
src = m_workBuffer4;
}
else
{
src = &keyValuesInOut;
m_workBuffer4->resize(0);
}
b3Assert(workingSize % DATA_ALIGNMENT == 0);
int minCap = NUM_BUCKET * NUM_WGS;
int n = workingSize;
m_workBuffer1->resize(minCap);
m_workBuffer3->resize(workingSize);
// ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
b3Assert(BITS_PER_PASS == 4);
b3Assert(WG_SIZE == 64);
b3Assert((sortBits & 0x3) == 0);
b3OpenCLArray<b3SortData>* dst = m_workBuffer3;
b3OpenCLArray<unsigned int>* srcHisto = m_workBuffer1;
b3OpenCLArray<unsigned int>* destHisto = m_workBuffer2;
int nWGs = NUM_WGS;
b3ConstData cdata;
{
int blockSize = ELEMENTS_PER_WORK_ITEM * WG_SIZE; //set at 256
int nBlocks = (n + blockSize - 1) / (blockSize);
cdata.m_n = n;
cdata.m_nWGs = NUM_WGS;
cdata.m_startBit = 0;
cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1) / cdata.m_nWGs;
if (nBlocks < NUM_WGS)
{
cdata.m_nBlocksPerWG = 1;
nWGs = nBlocks;
}
}
int count = 0;
for (int ib = 0; ib < sortBits; ib += 4)
{
#ifdef DEBUG_RADIXSORT2
keyValuesInOut.copyToHost(test2);
printf("numElem = %d\n", test2.size());
for (int i = 0; i < test2.size(); i++)
{
if (test2[i].m_key != test2[i].m_value)
{
printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
}
}
#endif //DEBUG_RADIXSORT2
cdata.m_startBit = ib;
if (src->size())
{
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(srcHisto->getBufferCL())};
b3LauncherCL launcher(m_commandQueue, m_streamCountSortDataKernel, "m_streamCountSortDataKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(cdata);
int num = NUM_WGS * WG_SIZE;
launcher.launch1D(num, WG_SIZE);
}
#ifdef DEBUG_RADIXSORT
b3AlignedObjectArray<unsigned int> testHist;
srcHisto->copyToHost(testHist);
printf("ib = %d, testHist size = %d, non zero elements:\n", ib, testHist.size());
for (int i = 0; i < testHist.size(); i++)
{
if (testHist[i] != 0)
printf("testHist[%d]=%d\n", i, testHist[i]);
}
#endif //DEBUG_RADIXSORT
//fast prefix scan is not working properly on Mac OSX yet
#ifdef __APPLE__
bool fastScan = false;
#else
bool fastScan = !m_deviceCPU; //only use fast scan on GPU
#endif
if (fastScan)
{ // prefix scan group histogram
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(srcHisto->getBufferCL())};
b3LauncherCL launcher(m_commandQueue, m_prefixScanKernel, "m_prefixScanKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(cdata);
launcher.launch1D(128, 128);
destHisto = srcHisto;
}
else
{
//unsigned int sum; //for debugging
m_scan->execute(*srcHisto, *destHisto, 1920, 0); //,&sum);
}
#ifdef DEBUG_RADIXSORT
destHisto->copyToHost(testHist);
printf("ib = %d, testHist size = %d, non zero elements:\n", ib, testHist.size());
for (int i = 0; i < testHist.size(); i++)
{
if (testHist[i] != 0)
printf("testHist[%d]=%d\n", i, testHist[i]);
}
for (int i = 0; i < testHist.size(); i += NUM_WGS)
{
printf("testHist[%d]=%d\n", i / NUM_WGS, testHist[i]);
}
#endif //DEBUG_RADIXSORT
#define USE_GPU
#ifdef USE_GPU
if (src->size())
{ // local sort and distribute
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(destHisto->getBufferCL(), true), b3BufferInfoCL(dst->getBufferCL())};
b3LauncherCL launcher(m_commandQueue, m_sortAndScatterSortDataKernel, "m_sortAndScatterSortDataKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(cdata);
launcher.launch1D(nWGs * WG_SIZE, WG_SIZE);
}
#else
{
#define NUM_TABLES 16
//#define SEQUENTIAL
#ifdef SEQUENTIAL
int counter2[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int tables[NUM_TABLES];
int startBit = ib;
destHisto->copyToHost(testHist);
b3AlignedObjectArray<b3SortData> srcHost;
b3AlignedObjectArray<b3SortData> dstHost;
dstHost.resize(src->size());
src->copyToHost(srcHost);
for (int i = 0; i < NUM_TABLES; i++)
{
tables[i] = testHist[i * NUM_WGS];
}
// distribute
for (int i = 0; i < n; i++)
{
int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES - 1);
dstHost[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
counter2[tableIdx]++;
}
#else
int counter2[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int tables[NUM_TABLES];
b3AlignedObjectArray<b3SortData> dstHostOK;
dstHostOK.resize(src->size());
destHisto->copyToHost(testHist);
b3AlignedObjectArray<b3SortData> srcHost;
src->copyToHost(srcHost);
int blockSize = 256;
int nBlocksPerWG = cdata.m_nBlocksPerWG;
int startBit = ib;
{
for (int i = 0; i < NUM_TABLES; i++)
{
tables[i] = testHist[i * NUM_WGS];
}
// distribute
for (int i = 0; i < n; i++)
{
int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES - 1);
dstHostOK[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
counter2[tableIdx]++;
}
}
b3AlignedObjectArray<b3SortData> dstHost;
dstHost.resize(src->size());
int counter[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
for (int wgIdx = 0; wgIdx < NUM_WGS; wgIdx++)
{
int counter[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int nBlocks = (n) / blockSize - nBlocksPerWG * wgIdx;
for (int iblock = 0; iblock < b3Min(cdata.m_nBlocksPerWG, nBlocks); iblock++)
{
for (int lIdx = 0; lIdx < 64; lIdx++)
{
int addr = iblock * blockSize + blockSize * cdata.m_nBlocksPerWG * wgIdx + ELEMENTS_PER_WORK_ITEM * lIdx;
// MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD
// Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops
// AMD: AtomInc performs better while NV prefers ++
for (int j = 0; j < ELEMENTS_PER_WORK_ITEM; j++)
{
if (addr + j < n)
{
// printf ("addr+j=%d\n", addr+j);
int i = addr + j;
int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES - 1);
int destIndex = testHist[tableIdx * NUM_WGS + wgIdx] + counter[tableIdx];
b3SortData ok = dstHostOK[destIndex];
if (ok.m_key != srcHost[i].m_key)
{
printf("ok.m_key = %d, srcHost[i].m_key = %d\n", ok.m_key, srcHost[i].m_key);
printf("(ok.m_value = %d, srcHost[i].m_value = %d)\n", ok.m_value, srcHost[i].m_value);
}
if (ok.m_value != srcHost[i].m_value)
{
printf("ok.m_value = %d, srcHost[i].m_value = %d\n", ok.m_value, srcHost[i].m_value);
printf("(ok.m_key = %d, srcHost[i].m_key = %d)\n", ok.m_key, srcHost[i].m_key);
}
dstHost[destIndex] = srcHost[i];
counter[tableIdx]++;
}
}
}
}
}
#endif //SEQUENTIAL
dst->copyFromHost(dstHost);
}
#endif //USE_GPU
#ifdef DEBUG_RADIXSORT
destHisto->copyToHost(testHist);
printf("ib = %d, testHist size = %d, non zero elements:\n", ib, testHist.size());
for (int i = 0; i < testHist.size(); i++)
{
if (testHist[i] != 0)
printf("testHist[%d]=%d\n", i, testHist[i]);
}
#endif //DEBUG_RADIXSORT
b3Swap(src, dst);
b3Swap(srcHisto, destHisto);
#ifdef DEBUG_RADIXSORT2
keyValuesInOut.copyToHost(test2);
printf("numElem = %d\n", test2.size());
for (int i = 0; i < test2.size(); i++)
{
if (test2[i].m_key != test2[i].m_value)
{
printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
}
}
#endif //DEBUG_RADIXSORT2
count++;
}
if (count & 1)
{
b3Assert(0); //need to copy from workbuffer to keyValuesInOut
}
if (m_workBuffer4->size())
{
m_workBuffer4->resize(originalSize);
keyValuesInOut.copyFromOpenCLArray(*m_workBuffer4);
}
#ifdef DEBUG_RADIXSORT
keyValuesInOut.copyToHost(test2);
printf("numElem = %d\n", test2.size());
for (int i = 0; i < test2.size(); i++)
{
printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
}
#endif
}
void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysInOut, int sortBits /* = 32 */)
{
int originalSize = keysInOut.size();
int workingSize = originalSize;
int dataAlignment = DATA_ALIGNMENT;
b3OpenCLArray<unsigned int>* src = 0;
if (workingSize % dataAlignment)
{
workingSize += dataAlignment - (workingSize % dataAlignment);
m_workBuffer4a->copyFromOpenCLArray(keysInOut);
m_workBuffer4a->resize(workingSize);
unsigned int fillValue = 0xffffffff;
m_fill->execute(*m_workBuffer4a, fillValue, workingSize - originalSize, originalSize);
src = m_workBuffer4a;
}
else
{
src = &keysInOut;
m_workBuffer4a->resize(0);
}
b3Assert(workingSize % DATA_ALIGNMENT == 0);
int minCap = NUM_BUCKET * NUM_WGS;
int n = workingSize;
m_workBuffer1->resize(minCap);
m_workBuffer3->resize(workingSize);
m_workBuffer3a->resize(workingSize);
// ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
b3Assert(BITS_PER_PASS == 4);
b3Assert(WG_SIZE == 64);
b3Assert((sortBits & 0x3) == 0);
b3OpenCLArray<unsigned int>* dst = m_workBuffer3a;
b3OpenCLArray<unsigned int>* srcHisto = m_workBuffer1;
b3OpenCLArray<unsigned int>* destHisto = m_workBuffer2;
int nWGs = NUM_WGS;
b3ConstData cdata;
{
int blockSize = ELEMENTS_PER_WORK_ITEM * WG_SIZE; //set at 256
int nBlocks = (n + blockSize - 1) / (blockSize);
cdata.m_n = n;
cdata.m_nWGs = NUM_WGS;
cdata.m_startBit = 0;
cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1) / cdata.m_nWGs;
if (nBlocks < NUM_WGS)
{
cdata.m_nBlocksPerWG = 1;
nWGs = nBlocks;
}
}
int count = 0;
for (int ib = 0; ib < sortBits; ib += 4)
{
cdata.m_startBit = ib;
if (src->size())
{
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(srcHisto->getBufferCL())};
b3LauncherCL launcher(m_commandQueue, m_streamCountKernel, "m_streamCountKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(cdata);
int num = NUM_WGS * WG_SIZE;
launcher.launch1D(num, WG_SIZE);
}
//fast prefix scan is not working properly on Mac OSX yet
#ifdef __APPLE__
bool fastScan = false;
#else
bool fastScan = !m_deviceCPU;
#endif
if (fastScan)
{ // prefix scan group histogram
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(srcHisto->getBufferCL())};
b3LauncherCL launcher(m_commandQueue, m_prefixScanKernel, "m_prefixScanKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(cdata);
launcher.launch1D(128, 128);
destHisto = srcHisto;
}
else
{
//unsigned int sum; //for debugging
m_scan->execute(*srcHisto, *destHisto, 1920, 0); //,&sum);
}
if (src->size())
{ // local sort and distribute
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(destHisto->getBufferCL(), true), b3BufferInfoCL(dst->getBufferCL())};
b3LauncherCL launcher(m_commandQueue, m_sortAndScatterKernel, "m_sortAndScatterKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(cdata);
launcher.launch1D(nWGs * WG_SIZE, WG_SIZE);
}
b3Swap(src, dst);
b3Swap(srcHisto, destHisto);
count++;
}
if (count & 1)
{
b3Assert(0); //need to copy from workbuffer to keyValuesInOut
}
if (m_workBuffer4a->size())
{
m_workBuffer4a->resize(originalSize);
keysInOut.copyFromOpenCLArray(*m_workBuffer4a);
}
}