// C++ header, 96 lines, 2 KiB
|
|
||
|
#ifndef B3_RADIXSORT32_H
|
||
|
#define B3_RADIXSORT32_H
|
||
|
|
||
|
#include "b3OpenCLArray.h"
|
||
|
|
||
|
/// Record made of two unsigned int fields.
/// Each field can be accessed under two names through an anonymous union:
/// m_key and x share one storage slot, m_value and y share the other.
struct b3SortData
{
	union {
		unsigned int m_key;  // aliases x
		unsigned int x;      // aliases m_key
	};

	union {
		unsigned int m_value;  // aliases y
		unsigned int y;        // aliases m_value
	};
};
|
||
|
#include "b3BufferInfoCL.h"
|
||
|
|
||
|
class b3RadixSort32CL
|
||
|
{
|
||
|
|
||
|
b3OpenCLArray<unsigned int>* m_workBuffer1;
|
||
|
b3OpenCLArray<unsigned int>* m_workBuffer2;
|
||
|
|
||
|
b3OpenCLArray<b3SortData>* m_workBuffer3;
|
||
|
b3OpenCLArray<b3SortData>* m_workBuffer4;
|
||
|
|
||
|
b3OpenCLArray<unsigned int>* m_workBuffer3a;
|
||
|
b3OpenCLArray<unsigned int>* m_workBuffer4a;
|
||
|
|
||
|
cl_command_queue m_commandQueue;
|
||
|
|
||
|
cl_kernel m_streamCountSortDataKernel;
|
||
|
cl_kernel m_streamCountKernel;
|
||
|
|
||
|
cl_kernel m_prefixScanKernel;
|
||
|
cl_kernel m_sortAndScatterSortDataKernel;
|
||
|
cl_kernel m_sortAndScatterKernel;
|
||
|
|
||
|
|
||
|
bool m_deviceCPU;
|
||
|
|
||
|
class b3PrefixScanCL* m_scan;
|
||
|
class b3FillCL* m_fill;
|
||
|
|
||
|
public:
|
||
|
struct b3ConstData
|
||
|
{
|
||
|
int m_n;
|
||
|
int m_nWGs;
|
||
|
int m_startBit;
|
||
|
int m_nBlocksPerWG;
|
||
|
};
|
||
|
enum
|
||
|
{
|
||
|
DATA_ALIGNMENT = 256,
|
||
|
WG_SIZE = 64,
|
||
|
BLOCK_SIZE = 256,
|
||
|
ELEMENTS_PER_WORK_ITEM = (BLOCK_SIZE/WG_SIZE),
|
||
|
BITS_PER_PASS = 4,
|
||
|
NUM_BUCKET=(1<<BITS_PER_PASS),
|
||
|
// if you change this, change nPerWI in kernel as well
|
||
|
NUM_WGS = 20*6, // cypress
|
||
|
// NUM_WGS = 24*6, // cayman
|
||
|
// NUM_WGS = 32*4, // nv
|
||
|
};
|
||
|
|
||
|
|
||
|
private:
|
||
|
|
||
|
|
||
|
public:
|
||
|
|
||
|
b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity =0);
|
||
|
|
||
|
virtual ~b3RadixSort32CL();
|
||
|
|
||
|
void execute(b3OpenCLArray<unsigned int>& keysIn, b3OpenCLArray<unsigned int>& keysOut, b3OpenCLArray<unsigned int>& valuesIn,
|
||
|
b3OpenCLArray<unsigned int>& valuesOut, int n, int sortBits = 32);
|
||
|
|
||
|
///keys only
|
||
|
void execute(b3OpenCLArray<unsigned int>& keysInOut, int sortBits = 32 );
|
||
|
|
||
|
void execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits = 32 );
|
||
|
void executeHost(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits = 32);
|
||
|
void executeHost(b3AlignedObjectArray<b3SortData>& keyValuesInOut, int sortBits = 32);
|
||
|
|
||
|
};
|
||
|
#endif //B3_RADIXSORT32_H
|
||
|
|