virtualx-engine/thirdparty/bullet/Bullet3OpenCL/RigidBody/b3Solver.cpp

/*
Copyright (c) 2012 Advanced Micro Devices, Inc.  

This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose, 
including commercial applications, and to alter it and redistribute it freely, 
subject to the following restrictions:

1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Takahiro Harada


#include "b3Solver.h"

///useNewBatchingKernel  is a rewritten kernel using just a single thread of the warp, for experiments
bool useNewBatchingKernel = true;
bool gConvertConstraintOnCpu = false;

#define B3_SOLVER_SETUP_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solverSetup.cl"
#define B3_SOLVER_SETUP2_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.cl"
#define B3_SOLVER_CONTACT_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solveContact.cl"
#define B3_SOLVER_FRICTION_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solveFriction.cl"
#define B3_BATCHING_PATH "src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.cl"
#define B3_BATCHING_NEW_PATH "src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.cl"

#include "Bullet3Dynamics/shared/b3ConvertConstraint4.h"

#include "kernels/solverSetup.h"
#include "kernels/solverSetup2.h"

#include "kernels/solveContact.h"
#include "kernels/solveFriction.h"

#include "kernels/batchingKernels.h"
#include "kernels/batchingKernelsNew.h"


#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
#include "Bullet3Common/b3Vector3.h"

struct SolverDebugInfo
{
	int m_valInt0;
	int m_valInt1;
	int m_valInt2;
	int m_valInt3;
	
	int m_valInt4;
	int m_valInt5;
	int m_valInt6;
	int m_valInt7;

	int m_valInt8;
	int m_valInt9;
	int m_valInt10;
	int m_valInt11;

	int	m_valInt12;
	int	m_valInt13;
	int	m_valInt14;
	int	m_valInt15;


	float m_val0;
	float m_val1;
	float m_val2;
	float m_val3;
};


class SolverDeviceInl
{
public:
	struct ParallelSolveData
	{
		b3OpenCLArray<unsigned int>* m_numConstraints;
		b3OpenCLArray<unsigned int>* m_offsets;
	};
};


b3Solver::b3Solver(cl_context ctx, cl_device_id device, cl_command_queue queue, int pairCapacity)
			:
			m_context(ctx),
			m_device(device),
			m_queue(queue),
			m_batchSizes(ctx,queue),
			m_nIterations(4)
{
	m_sort32 = new b3RadixSort32CL(ctx,device,queue);
	m_scan = new b3PrefixScanCL(ctx,device,queue,B3_SOLVER_N_CELLS);
	m_search = new b3BoundSearchCL(ctx,device,queue,B3_SOLVER_N_CELLS);

	const int sortSize = B3NEXTMULTIPLEOF( pairCapacity, 512 );

	m_sortDataBuffer = new b3OpenCLArray<b3SortData>(ctx,queue,sortSize);
	m_contactBuffer2 = new b3OpenCLArray<b3Contact4>(ctx,queue);

	m_numConstraints = new b3OpenCLArray<unsigned int>(ctx,queue,B3_SOLVER_N_CELLS );
	m_numConstraints->resize(B3_SOLVER_N_CELLS);

	m_offsets = new b3OpenCLArray<unsigned int>( ctx,queue,B3_SOLVER_N_CELLS);
	m_offsets->resize(B3_SOLVER_N_CELLS);
	const char* additionalMacros = "";
//	const char* srcFileNameForCaching="";


	cl_int pErrNum;
	const char* batchKernelSource = batchingKernelsCL;
	const char* batchKernelNewSource = batchingKernelsNewCL;
	
	const char* solverSetupSource = solverSetupCL;
	const char* solverSetup2Source = solverSetup2CL;
	const char* solveContactSource = solveContactCL;
	const char* solveFrictionSource = solveFrictionCL;
	
	
	{
		
		cl_program solveContactProg= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solveContactSource, &pErrNum,additionalMacros, B3_SOLVER_CONTACT_KERNEL_PATH);
		b3Assert(solveContactProg);
		
		cl_program solveFrictionProg= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solveFrictionSource, &pErrNum,additionalMacros, B3_SOLVER_FRICTION_KERNEL_PATH);
		b3Assert(solveFrictionProg);

		cl_program solverSetup2Prog= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solverSetup2Source, &pErrNum,additionalMacros, B3_SOLVER_SETUP2_KERNEL_PATH);
		b3Assert(solverSetup2Prog);

		
		cl_program solverSetupProg= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solverSetupSource, &pErrNum,additionalMacros, B3_SOLVER_SETUP_KERNEL_PATH);
		b3Assert(solverSetupProg);
		
		
		m_solveFrictionKernel= b3OpenCLUtils::compileCLKernelFromString( ctx, device, solveFrictionSource, "BatchSolveKernelFriction", &pErrNum, solveFrictionProg,additionalMacros );
		b3Assert(m_solveFrictionKernel);

		m_solveContactKernel= b3OpenCLUtils::compileCLKernelFromString( ctx, device, solveContactSource, "BatchSolveKernelContact", &pErrNum, solveContactProg,additionalMacros );
		b3Assert(m_solveContactKernel);
		
		m_contactToConstraintKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetupSource, "ContactToConstraintKernel", &pErrNum, solverSetupProg,additionalMacros );
		b3Assert(m_contactToConstraintKernel);
			
		m_setSortDataKernel =  b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "SetSortDataKernel", &pErrNum, solverSetup2Prog,additionalMacros );
		b3Assert(m_setSortDataKernel);
				
		m_reorderContactKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "ReorderContactKernel", &pErrNum, solverSetup2Prog,additionalMacros );
		b3Assert(m_reorderContactKernel);
		

		m_copyConstraintKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "CopyConstraintKernel", &pErrNum, solverSetup2Prog,additionalMacros );
		b3Assert(m_copyConstraintKernel);
		
	}

	{
		cl_program batchingProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, batchKernelSource, &pErrNum,additionalMacros, B3_BATCHING_PATH);
		//cl_program batchingProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, 0, &pErrNum,additionalMacros, B3_BATCHING_PATH,true);
		b3Assert(batchingProg);
		
		m_batchingKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, batchKernelSource, "CreateBatches", &pErrNum, batchingProg,additionalMacros );
		b3Assert(m_batchingKernel);
	}
	{
		cl_program batchingNewProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, batchKernelNewSource, &pErrNum,additionalMacros, B3_BATCHING_NEW_PATH);
		b3Assert(batchingNewProg);

		m_batchingKernelNew = b3OpenCLUtils::compileCLKernelFromString( ctx, device, batchKernelNewSource, "CreateBatchesNew", &pErrNum, batchingNewProg,additionalMacros );
		//m_batchingKernelNew = b3OpenCLUtils::compileCLKernelFromString( ctx, device, batchKernelNewSource, "CreateBatchesBruteForce", &pErrNum, batchingNewProg,additionalMacros );
		b3Assert(m_batchingKernelNew);
	}
}
		
b3Solver::~b3Solver()
{
	delete m_offsets;
	delete m_numConstraints;
	delete m_sortDataBuffer;
	delete m_contactBuffer2;

	delete m_sort32;
	delete m_scan;
	delete m_search;


	clReleaseKernel(m_batchingKernel);
	clReleaseKernel(m_batchingKernelNew);
	
	clReleaseKernel( m_solveContactKernel);
	clReleaseKernel( m_solveFrictionKernel);

	clReleaseKernel( m_contactToConstraintKernel);
	clReleaseKernel( m_setSortDataKernel);
	clReleaseKernel( m_reorderContactKernel);
	clReleaseKernel( m_copyConstraintKernel);
			
}


template<bool JACOBI>
static
__inline
void solveContact(b3GpuConstraint4& cs, 
	const b3Vector3& posA, b3Vector3& linVelA, b3Vector3& angVelA, float invMassA, const b3Matrix3x3& invInertiaA,
	const b3Vector3& posB, b3Vector3& linVelB, b3Vector3& angVelB, float invMassB, const b3Matrix3x3& invInertiaB, 
	float maxRambdaDt[4], float minRambdaDt[4])
{

	b3Vector3 dLinVelA; dLinVelA.setZero();
	b3Vector3 dAngVelA; dAngVelA.setZero();
	b3Vector3 dLinVelB; dLinVelB.setZero();
	b3Vector3 dAngVelB; dAngVelB.setZero();

	for(int ic=0; ic<4; ic++)
	{
		//	dont necessary because this makes change to 0
		if( cs.m_jacCoeffInv[ic] == 0.f ) continue;

		{
			b3Vector3 angular0, angular1, linear;
			b3Vector3 r0 = cs.m_worldPos[ic] - (b3Vector3&)posA;
			b3Vector3 r1 = cs.m_worldPos[ic] - (b3Vector3&)posB;
			setLinearAndAngular( (const b3Vector3 &)cs.m_linear, (const b3Vector3 &)r0, (const b3Vector3 &)r1, &linear, &angular0, &angular1 );

			float rambdaDt = calcRelVel((const b3Vector3 &)cs.m_linear,(const b3Vector3 &) -cs.m_linear, angular0, angular1,
				linVelA, angVelA, linVelB, angVelB ) + cs.m_b[ic];
			rambdaDt *= cs.m_jacCoeffInv[ic];

			{
				float prevSum = cs.m_appliedRambdaDt[ic];
				float updated = prevSum;
				updated += rambdaDt;
				updated = b3Max( updated, minRambdaDt[ic] );
				updated = b3Min( updated, maxRambdaDt[ic] );
				rambdaDt = updated - prevSum;
				cs.m_appliedRambdaDt[ic] = updated;
			}

			b3Vector3 linImp0 = invMassA*linear*rambdaDt;
			b3Vector3 linImp1 = invMassB*(-linear)*rambdaDt;
			b3Vector3 angImp0 = (invInertiaA* angular0)*rambdaDt;
			b3Vector3 angImp1 = (invInertiaB* angular1)*rambdaDt;
#ifdef _WIN32
            b3Assert(_finite(linImp0.getX()));
			b3Assert(_finite(linImp1.getX()));
#endif
			if( JACOBI )
			{
				dLinVelA += linImp0;
				dAngVelA += angImp0;
				dLinVelB += linImp1;
				dAngVelB += angImp1;
			}
			else
			{
				linVelA += linImp0;
				angVelA += angImp0;
				linVelB += linImp1;
				angVelB += angImp1;
			}
		}
	}

	if( JACOBI )
	{
		linVelA += dLinVelA;
		angVelA += dAngVelA;
		linVelB += dLinVelB;
		angVelB += dAngVelB;
	}

}


	static
	__inline
	void solveFriction(b3GpuConstraint4& cs, 
		const b3Vector3& posA, b3Vector3& linVelA, b3Vector3& angVelA, float invMassA, const b3Matrix3x3& invInertiaA,
		const b3Vector3& posB, b3Vector3& linVelB, b3Vector3& angVelB, float invMassB, const b3Matrix3x3& invInertiaB, 
		float maxRambdaDt[4], float minRambdaDt[4])
	{

		if( cs.m_fJacCoeffInv[0] == 0 && cs.m_fJacCoeffInv[0] == 0 ) return;
		const b3Vector3& center = (const b3Vector3&)cs.m_center;

		b3Vector3 n = -(const b3Vector3&)cs.m_linear;

		b3Vector3 tangent[2];
#if 1		
		b3PlaneSpace1 (n, tangent[0],tangent[1]);
#else
		b3Vector3 r = cs.m_worldPos[0]-center;
		tangent[0] = cross3( n, r );
		tangent[1] = cross3( tangent[0], n );
		tangent[0] = normalize3( tangent[0] );
		tangent[1] = normalize3( tangent[1] );
#endif

		b3Vector3 angular0, angular1, linear;
		b3Vector3 r0 = center - posA;
		b3Vector3 r1 = center - posB;
		for(int i=0; i<2; i++)
		{
			setLinearAndAngular( tangent[i], r0, r1, &linear, &angular0, &angular1 );
			float rambdaDt = calcRelVel(linear, -linear, angular0, angular1,
				linVelA, angVelA, linVelB, angVelB );
			rambdaDt *= cs.m_fJacCoeffInv[i];

				{
					float prevSum = cs.m_fAppliedRambdaDt[i];
					float updated = prevSum;
					updated += rambdaDt;
					updated = b3Max( updated, minRambdaDt[i] );
					updated = b3Min( updated, maxRambdaDt[i] );
					rambdaDt = updated - prevSum;
					cs.m_fAppliedRambdaDt[i] = updated;
				}

			b3Vector3 linImp0 = invMassA*linear*rambdaDt;
			b3Vector3 linImp1 = invMassB*(-linear)*rambdaDt;
			b3Vector3 angImp0 = (invInertiaA* angular0)*rambdaDt;
			b3Vector3 angImp1 = (invInertiaB* angular1)*rambdaDt;
#ifdef _WIN32
			b3Assert(_finite(linImp0.getX()));
			b3Assert(_finite(linImp1.getX()));
#endif
			linVelA += linImp0;
			angVelA += angImp0;
			linVelB += linImp1;
			angVelB += angImp1;
		}

		{	//	angular damping for point constraint
			b3Vector3 ab = ( posB - posA ).normalized();
			b3Vector3 ac = ( center - posA ).normalized();
			if( b3Dot( ab, ac ) > 0.95f || (invMassA == 0.f || invMassB == 0.f))
			{
				float angNA = b3Dot( n, angVelA );
				float angNB = b3Dot( n, angVelB );

				angVelA -= (angNA*0.1f)*n;
				angVelB -= (angNB*0.1f)*n;
			}
		}

	}
/*
 b3AlignedObjectArray<b3RigidBodyData>& m_bodies;
	b3AlignedObjectArray<b3InertiaData>& m_shapes;
	b3AlignedObjectArray<b3GpuConstraint4>& m_constraints;
	b3AlignedObjectArray<int>* m_batchSizes;
	int m_cellIndex;
	int m_curWgidx;
	int m_start;
	int m_nConstraints;
	bool m_solveFriction;
	int m_maxNumBatches;
 */

struct SolveTask// : public ThreadPool::Task
{
	SolveTask(b3AlignedObjectArray<b3RigidBodyData>& bodies,  b3AlignedObjectArray<b3InertiaData>& shapes, b3AlignedObjectArray<b3GpuConstraint4>& constraints,
		int start, int nConstraints,int maxNumBatches,b3AlignedObjectArray<int>* wgUsedBodies, int curWgidx, b3AlignedObjectArray<int>* batchSizes, int cellIndex)
		: m_bodies( bodies ), m_shapes( shapes ), 
		m_constraints( constraints ), 
		m_batchSizes(batchSizes),
		m_cellIndex(cellIndex),
		m_curWgidx(curWgidx),
		m_start( start ), 
		m_nConstraints( nConstraints ),
		m_solveFriction( true ),
		m_maxNumBatches(maxNumBatches)
	{}

	unsigned short int getType(){ return 0; }

	void run(int tIdx)
	{
		int offset = 0;
		for (int ii=0;ii<B3_MAX_NUM_BATCHES;ii++)
		{
			int numInBatch = m_batchSizes->at(m_cellIndex*B3_MAX_NUM_BATCHES+ii);
			if (!numInBatch)
				break;

			for (int jj=0;jj<numInBatch;jj++)
			{
				int i = m_start + offset+jj;
				int batchId = m_constraints[i].m_batchIdx;
				b3Assert(batchId==ii);
				float frictionCoeff = m_constraints[i].getFrictionCoeff();
				int aIdx = (int)m_constraints[i].m_bodyA;
				int bIdx = (int)m_constraints[i].m_bodyB;
//				int localBatch = m_constraints[i].m_batchIdx;
				b3RigidBodyData& bodyA = m_bodies[aIdx];
				b3RigidBodyData& bodyB = m_bodies[bIdx];

				if( !m_solveFriction )
				{
					float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
					float minRambdaDt[4] = {0.f,0.f,0.f,0.f};

					solveContact<false>( m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass, (const b3Matrix3x3 &)m_shapes[aIdx].m_invInertiaWorld, 
							(b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, (const b3Matrix3x3 &)m_shapes[bIdx].m_invInertiaWorld,
						maxRambdaDt, minRambdaDt );
				}
				else
				{
					float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
					float minRambdaDt[4] = {0.f,0.f,0.f,0.f};
					float sum = 0;
					for(int j=0; j<4; j++)
					{
						sum +=m_constraints[i].m_appliedRambdaDt[j];
					}
					frictionCoeff = 0.7f;
					for(int j=0; j<4; j++)
					{
						maxRambdaDt[j] = frictionCoeff*sum;
						minRambdaDt[j] = -maxRambdaDt[j];
					}
					solveFriction( m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass,(const b3Matrix3x3 &) m_shapes[aIdx].m_invInertiaWorld, 
						(b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass,(const b3Matrix3x3 &) m_shapes[bIdx].m_invInertiaWorld,
						maxRambdaDt, minRambdaDt );
			
				}
			}
			offset+=numInBatch;


		}
/*		for (int bb=0;bb<m_maxNumBatches;bb++)
		{
			//for(int ic=m_nConstraints-1; ic>=0; ic--)
			for(int ic=0; ic<m_nConstraints; ic++)
			{
				
				int i = m_start + ic;
				if (m_constraints[i].m_batchIdx != bb)
					continue;

				float frictionCoeff = m_constraints[i].getFrictionCoeff();
				int aIdx = (int)m_constraints[i].m_bodyA;
				int bIdx = (int)m_constraints[i].m_bodyB;
				int localBatch = m_constraints[i].m_batchIdx;
				b3RigidBodyData& bodyA = m_bodies[aIdx];
				b3RigidBodyData& bodyB = m_bodies[bIdx];

				if( !m_solveFriction )
				{
					float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
					float minRambdaDt[4] = {0.f,0.f,0.f,0.f};

					solveContact<false>( m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass, (const b3Matrix3x3 &)m_shapes[aIdx].m_invInertiaWorld, 
							(b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, (const b3Matrix3x3 &)m_shapes[bIdx].m_invInertiaWorld,
						maxRambdaDt, minRambdaDt );
				}
				else
				{
					float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
					float minRambdaDt[4] = {0.f,0.f,0.f,0.f};
					float sum = 0;
					for(int j=0; j<4; j++)
					{
						sum +=m_constraints[i].m_appliedRambdaDt[j];
					}
					frictionCoeff = 0.7f;
					for(int j=0; j<4; j++)
					{
						maxRambdaDt[j] = frictionCoeff*sum;
						minRambdaDt[j] = -maxRambdaDt[j];
					}
					solveFriction( m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass,(const b3Matrix3x3 &) m_shapes[aIdx].m_invInertiaWorld, 
						(b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass,(const b3Matrix3x3 &) m_shapes[bIdx].m_invInertiaWorld,
						maxRambdaDt, minRambdaDt );
			
				}
			}
		}
		*/


	}

	b3AlignedObjectArray<b3RigidBodyData>& m_bodies;
	b3AlignedObjectArray<b3InertiaData>& m_shapes;
	b3AlignedObjectArray<b3GpuConstraint4>& m_constraints;
	b3AlignedObjectArray<int>* m_batchSizes;
	int m_cellIndex;
	int m_curWgidx;
	int m_start;
	int m_nConstraints;
	bool m_solveFriction;
	int m_maxNumBatches;
};


void b3Solver::solveContactConstraintHost(  b3OpenCLArray<b3RigidBodyData>* bodyBuf, b3OpenCLArray<b3InertiaData>* shapeBuf, 
			b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n ,int maxNumBatches,b3AlignedObjectArray<int>* batchSizes)
{

#if 0
	{	
		int nSplitX = B3_SOLVER_N_SPLIT_X;
		int nSplitY = B3_SOLVER_N_SPLIT_Y;
		int numWorkgroups = B3_SOLVER_N_CELLS/B3_SOLVER_N_BATCHES;
		for (int z=0;z<4;z++)
		{
			for (int y=0;y<4;y++)
			{
				for (int x=0;x<4;x++)
				{
					int newIndex = (x+y*nSplitX+z*nSplitX*nSplitY);
				//	printf("newIndex=%d\n",newIndex);

					int zIdx = newIndex/(nSplitX*nSplitY);
					int remain = newIndex%(nSplitX*nSplitY);
					int yIdx = remain/nSplitX;
					int xIdx = remain%nSplitX;
				//	printf("newIndex=%d\n",newIndex);
				}
			}
		}

		//for (int wgIdx=numWorkgroups-1;wgIdx>=0;wgIdx--)
		for (int cellBatch=0;cellBatch<B3_SOLVER_N_BATCHES;cellBatch++)
		{
			for (int wgIdx=0;wgIdx<numWorkgroups;wgIdx++)
			{
				int zIdx = (wgIdx/((nSplitX*nSplitY)/4))*2+((cellBatch&4)>>2);
				int remain= (wgIdx%((nSplitX*nSplitY)/4));
				int yIdx = (remain/(nSplitX/2))*2 + ((cellBatch&2)>>1);
				int xIdx = (remain%(nSplitX/2))*2 + (cellBatch&1);
				
				/*int zIdx = newIndex/(nSplitX*nSplitY);
				int remain = newIndex%(nSplitX*nSplitY);
				int yIdx = remain/nSplitX;
				int xIdx = remain%nSplitX;
				*/
				int cellIdx = xIdx+yIdx*nSplitX+zIdx*(nSplitX*nSplitY);
			//	printf("wgIdx %d: xIdx=%d, yIdx=%d, zIdx=%d, cellIdx=%d, cell Batch %d\n",wgIdx,xIdx,yIdx,zIdx,cellIdx,cellBatch);
			}
		}
	}
#endif

	b3AlignedObjectArray<b3RigidBodyData> bodyNative;
	bodyBuf->copyToHost(bodyNative);
	b3AlignedObjectArray<b3InertiaData> shapeNative;
	shapeBuf->copyToHost(shapeNative);
	b3AlignedObjectArray<b3GpuConstraint4> constraintNative;
	constraint->copyToHost(constraintNative);

	b3AlignedObjectArray<unsigned int> numConstraintsHost;
	m_numConstraints->copyToHost(numConstraintsHost);

	//printf("------------------------\n");
	b3AlignedObjectArray<unsigned int> offsetsHost;
	m_offsets->copyToHost(offsetsHost);
	static int frame=0;
	bool useBatches=true;
	if (useBatches)
	{
		for(int iter=0; iter<m_nIterations; iter++)
		{
			for (int cellBatch=0;cellBatch<B3_SOLVER_N_BATCHES;cellBatch++)
			{
				
				int nSplitX = B3_SOLVER_N_SPLIT_X;
				int nSplitY = B3_SOLVER_N_SPLIT_Y;
				int numWorkgroups = B3_SOLVER_N_CELLS/B3_SOLVER_N_BATCHES;
				//printf("cell Batch %d\n",cellBatch);
				b3AlignedObjectArray<int> usedBodies[B3_SOLVER_N_CELLS];
				for (int i=0;i<B3_SOLVER_N_CELLS;i++)
				{
					usedBodies[i].resize(0);
				}

				
				//for (int wgIdx=numWorkgroups-1;wgIdx>=0;wgIdx--)
				for (int wgIdx=0;wgIdx<numWorkgroups;wgIdx++)
				{
					int zIdx = (wgIdx/((nSplitX*nSplitY)/4))*2+((cellBatch&4)>>2);
					int remain= (wgIdx%((nSplitX*nSplitY)/4));
					int yIdx = (remain/(nSplitX/2))*2 + ((cellBatch&2)>>1);
					int xIdx = (remain%(nSplitX/2))*2 + (cellBatch&1);
					int cellIdx = xIdx+yIdx*nSplitX+zIdx*(nSplitX*nSplitY);
					
	
					if( numConstraintsHost[cellIdx] == 0 ) 
						continue;

					//printf("wgIdx %d: xIdx=%d, yIdx=%d, zIdx=%d, cellIdx=%d, cell Batch %d\n",wgIdx,xIdx,yIdx,zIdx,cellIdx,cellBatch);
					//printf("cell %d has %d constraints\n", cellIdx,numConstraintsHost[cellIdx]);
					if (zIdx)
					{
					//printf("?\n");
					}

					if (iter==0)
					{
						//printf("frame=%d, Cell xIdx=%x, yIdx=%d ",frame, xIdx,yIdx);
						//printf("cellBatch=%d, wgIdx=%d, #constraints in cell=%d\n",cellBatch,wgIdx,numConstraintsHost[cellIdx]);
					}
					const int start = offsetsHost[cellIdx];
					int numConstraintsInCell = numConstraintsHost[cellIdx];
	//				const int end = start + numConstraintsInCell;

					SolveTask task( bodyNative, shapeNative, constraintNative, start, numConstraintsInCell ,maxNumBatches,usedBodies,wgIdx,batchSizes,cellIdx);
					task.m_solveFriction = false;
					task.run(0);
				
				}
			}
		}

		for(int iter=0; iter<m_nIterations; iter++)
		{
			for (int cellBatch=0;cellBatch<B3_SOLVER_N_BATCHES;cellBatch++)
			{
				int nSplitX = B3_SOLVER_N_SPLIT_X;
				int nSplitY = B3_SOLVER_N_SPLIT_Y;
				

				int numWorkgroups = B3_SOLVER_N_CELLS/B3_SOLVER_N_BATCHES;

				for (int wgIdx=0;wgIdx<numWorkgroups;wgIdx++)
				{
					int zIdx = (wgIdx/((nSplitX*nSplitY)/4))*2+((cellBatch&4)>>2);
					int remain= (wgIdx%((nSplitX*nSplitY)/4));
					int yIdx = (remain/(nSplitX/2))*2 + ((cellBatch&2)>>1);
					int xIdx = (remain%(nSplitX/2))*2 + (cellBatch&1);
					
					int cellIdx = xIdx+yIdx*nSplitX+zIdx*(nSplitX*nSplitY);
	
					if( numConstraintsHost[cellIdx] == 0 ) 
						continue;
	
					//printf("yIdx=%d\n",yIdx);
					
					const int start = offsetsHost[cellIdx];
					int numConstraintsInCell = numConstraintsHost[cellIdx];
	//				const int end = start + numConstraintsInCell;

					SolveTask task( bodyNative, shapeNative, constraintNative, start, numConstraintsInCell,maxNumBatches, 0,0,batchSizes,cellIdx);
					task.m_solveFriction = true;
					task.run(0);
					
				}
			}
		}


	} else
	{
		for(int iter=0; iter<m_nIterations; iter++)
		{
			SolveTask task( bodyNative, shapeNative, constraintNative, 0, n ,maxNumBatches,0,0,0,0);
			task.m_solveFriction = false;
			task.run(0);
		}

		for(int iter=0; iter<m_nIterations; iter++)
		{
			SolveTask task( bodyNative, shapeNative, constraintNative, 0, n ,maxNumBatches,0,0,0,0);
			task.m_solveFriction = true;
			task.run(0);
		}
	}

	bodyBuf->copyFromHost(bodyNative);
	shapeBuf->copyFromHost(shapeNative);
	constraint->copyFromHost(constraintNative);
	frame++;
	
}

void checkConstraintBatch(const b3OpenCLArray<b3RigidBodyData>* bodyBuf,
					const b3OpenCLArray<b3InertiaData>* shapeBuf,
					b3OpenCLArray<b3GpuConstraint4>* constraint, 
					b3OpenCLArray<unsigned int>* m_numConstraints,
					b3OpenCLArray<unsigned int>* m_offsets,
					int batchId
					)
{
//						b3BufferInfoCL( m_numConstraints->getBufferCL() ), 
//						b3BufferInfoCL( m_offsets->getBufferCL() ) 
	
	int cellBatch = batchId;
	const int nn = B3_SOLVER_N_CELLS;
//	int numWorkItems = 64*nn/B3_SOLVER_N_BATCHES;

	b3AlignedObjectArray<unsigned int> gN;
	m_numConstraints->copyToHost(gN);
	b3AlignedObjectArray<unsigned int> gOffsets;
	m_offsets->copyToHost(gOffsets);
	int nSplitX = B3_SOLVER_N_SPLIT_X;
	int nSplitY = B3_SOLVER_N_SPLIT_Y;
	
//	int bIdx = batchId;

	b3AlignedObjectArray<b3GpuConstraint4> cpuConstraints;
	constraint->copyToHost(cpuConstraints);

	printf("batch = %d\n", batchId);

	int numWorkgroups = nn/B3_SOLVER_N_BATCHES;
	b3AlignedObjectArray<int> usedBodies;


	for (int wgIdx=0;wgIdx<numWorkgroups;wgIdx++)
	{
		printf("wgIdx = %d           ", wgIdx);

		int zIdx = (wgIdx/((nSplitX*nSplitY))/2)*2+((cellBatch&4)>>2);					
		int remain = wgIdx%((nSplitX*nSplitY));
		int yIdx = (remain%(nSplitX/2))*2 + ((cellBatch&2)>>1);
		int xIdx = (remain/(nSplitX/2))*2 + (cellBatch&1);

		
		int cellIdx = xIdx+yIdx*nSplitX+zIdx*(nSplitX*nSplitY);
		printf("cellIdx=%d\n",cellIdx);
		if( gN[cellIdx] == 0 ) 
			continue;

		const int start = gOffsets[cellIdx];
		const int end = start + gN[cellIdx];

		for (int c=start;c<end;c++)
		{
			b3GpuConstraint4& constraint = cpuConstraints[c];
			//printf("constraint (%d,%d)\n", constraint.m_bodyA,constraint.m_bodyB);
			if (usedBodies.findLinearSearch(constraint.m_bodyA)< usedBodies.size())
			{
				printf("error?\n");
			}
			if (usedBodies.findLinearSearch(constraint.m_bodyB)< usedBodies.size())
			{
				printf("error?\n");
			}
		}

		for (int c=start;c<end;c++)
		{
			b3GpuConstraint4& constraint = cpuConstraints[c];
			usedBodies.push_back(constraint.m_bodyA);
			usedBodies.push_back(constraint.m_bodyB);
		}

	}
}

static bool verify=false;

void b3Solver::solveContactConstraint(  const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf, 
			b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n ,int maxNumBatches)
{
	
	
	b3Int4 cdata = b3MakeInt4( n, 0, 0, 0 );
	{
		
		const int nn = B3_SOLVER_N_CELLS;

		cdata.x = 0;
		cdata.y = maxNumBatches;//250;


		int numWorkItems = 64*nn/B3_SOLVER_N_BATCHES;
#ifdef DEBUG_ME
		SolverDebugInfo* debugInfo = new  SolverDebugInfo[numWorkItems];
		adl::b3OpenCLArray<SolverDebugInfo> gpuDebugInfo(data->m_device,numWorkItems);
#endif


		{

			B3_PROFILE("m_batchSolveKernel iterations");
			for(int iter=0; iter<m_nIterations; iter++)
			{
				for(int ib=0; ib<B3_SOLVER_N_BATCHES; ib++)
				{
					
					if (verify)
					{
						checkConstraintBatch(bodyBuf,shapeBuf,constraint,m_numConstraints,m_offsets,ib);
					}

#ifdef DEBUG_ME
					memset(debugInfo,0,sizeof(SolverDebugInfo)*numWorkItems);
					gpuDebugInfo.write(debugInfo,numWorkItems);
#endif


					cdata.z = ib;
					

				b3LauncherCL launcher( m_queue, m_solveContactKernel ,"m_solveContactKernel");
#if 1
                    
					b3BufferInfoCL bInfo[] = { 

						b3BufferInfoCL( bodyBuf->getBufferCL() ), 
						b3BufferInfoCL( shapeBuf->getBufferCL() ), 
						b3BufferInfoCL( constraint->getBufferCL() ),
						b3BufferInfoCL( m_numConstraints->getBufferCL() ), 
						b3BufferInfoCL( m_offsets->getBufferCL() ) 
#ifdef DEBUG_ME
						,	b3BufferInfoCL(&gpuDebugInfo)
#endif
						};

					
                    launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
					//launcher.setConst(  cdata.x );
                    launcher.setConst(  cdata.y );
                    launcher.setConst(  cdata.z );
                    b3Int4 nSplit;
					nSplit.x = B3_SOLVER_N_SPLIT_X;
					nSplit.y = B3_SOLVER_N_SPLIT_Y;
					nSplit.z = B3_SOLVER_N_SPLIT_Z;

                    launcher.setConst(  nSplit );
                    launcher.launch1D( numWorkItems, 64 );

                    
#else
                    const char* fileName = "m_batchSolveKernel.bin";
                    FILE* f = fopen(fileName,"rb");
                    if (f)
                    {
                        int sizeInBytes=0;
                        if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET))
                        {
                            printf("error, cannot get file size\n");
                            exit(0);
                        }
                        
                        unsigned char* buf = (unsigned char*) malloc(sizeInBytes);
                        fread(buf,sizeInBytes,1,f);
                        int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes,m_context);
                        int num = *(int*)&buf[serializedBytes];
                        
                        launcher.launch1D( num);

                        //this clFinish is for testing on errors
                        clFinish(m_queue);
                    }

#endif
					

#ifdef DEBUG_ME
					clFinish(m_queue);
					gpuDebugInfo.read(debugInfo,numWorkItems);
					clFinish(m_queue);
					for (int i=0;i<numWorkItems;i++)
					{
						if (debugInfo[i].m_valInt2>0)
						{
							printf("debugInfo[i].m_valInt2 = %d\n",i,debugInfo[i].m_valInt2);
						}

						if (debugInfo[i].m_valInt3>0)
						{
							printf("debugInfo[i].m_valInt3 = %d\n",i,debugInfo[i].m_valInt3);
						}
					}
#endif //DEBUG_ME


				}
			}
		
			clFinish(m_queue);


		}

		cdata.x = 1;
		bool applyFriction=true;
		if (applyFriction)
    	{
			B3_PROFILE("m_batchSolveKernel iterations2");
			for(int iter=0; iter<m_nIterations; iter++)
			{
				for(int ib=0; ib<B3_SOLVER_N_BATCHES; ib++)
				{
					cdata.z = ib;
					

					b3BufferInfoCL bInfo[] = { 
						b3BufferInfoCL( bodyBuf->getBufferCL() ), 
						b3BufferInfoCL( shapeBuf->getBufferCL() ), 
						b3BufferInfoCL( constraint->getBufferCL() ),
						b3BufferInfoCL( m_numConstraints->getBufferCL() ), 
						b3BufferInfoCL( m_offsets->getBufferCL() )
#ifdef DEBUG_ME
						,b3BufferInfoCL(&gpuDebugInfo)
#endif //DEBUG_ME
					};
					b3LauncherCL launcher( m_queue, m_solveFrictionKernel,"m_solveFrictionKernel" );
					launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
					//launcher.setConst(  cdata.x );
                    launcher.setConst(  cdata.y );
                    launcher.setConst(  cdata.z );
                    b3Int4 nSplit;
					nSplit.x = B3_SOLVER_N_SPLIT_X;
					nSplit.y = B3_SOLVER_N_SPLIT_Y;
					nSplit.z = B3_SOLVER_N_SPLIT_Z;

                    launcher.setConst(  nSplit );
                    
					launcher.launch1D( 64*nn/B3_SOLVER_N_BATCHES, 64 );
				}
			}
			clFinish(m_queue);
			
		}
#ifdef DEBUG_ME
		delete[] debugInfo;
#endif //DEBUG_ME
	}

	
}

void b3Solver::convertToConstraints( const b3OpenCLArray<b3RigidBodyData>* bodyBuf, 
	const b3OpenCLArray<b3InertiaData>* shapeBuf, 
	b3OpenCLArray<b3Contact4>* contactsIn, b3OpenCLArray<b3GpuConstraint4>* contactCOut, void* additionalData, 
	int nContacts, const ConstraintCfg& cfg )
{
//	b3OpenCLArray<b3GpuConstraint4>* constraintNative =0;
	contactCOut->resize(nContacts);
	struct CB
	{
		int m_nContacts;
		float m_dt;
		float m_positionDrift;
		float m_positionConstraintCoeff;
	};

	{

		CB cdata;
		cdata.m_nContacts = nContacts;
		cdata.m_dt = cfg.m_dt;
		cdata.m_positionDrift = cfg.m_positionDrift;
		cdata.m_positionConstraintCoeff = cfg.m_positionConstraintCoeff;

		
		if (gConvertConstraintOnCpu)
		{
			b3AlignedObjectArray<b3RigidBodyData> gBodies;
		bodyBuf->copyToHost(gBodies);

		b3AlignedObjectArray<b3Contact4> gContact;
		contactsIn->copyToHost(gContact);

		b3AlignedObjectArray<b3InertiaData> gShapes;
		shapeBuf->copyToHost(gShapes);
		
		b3AlignedObjectArray<b3GpuConstraint4> gConstraintOut;
		gConstraintOut.resize(nContacts);
		
			B3_PROFILE("cpu contactToConstraintKernel");
			for (int gIdx=0;gIdx<nContacts;gIdx++)
			{
				int aIdx = abs(gContact[gIdx].m_bodyAPtrAndSignBit);
				int bIdx = abs(gContact[gIdx].m_bodyBPtrAndSignBit);

				b3Float4 posA = gBodies[aIdx].m_pos;
				b3Float4 linVelA = gBodies[aIdx].m_linVel;
				b3Float4 angVelA = gBodies[aIdx].m_angVel;
				float invMassA = gBodies[aIdx].m_invMass;
				b3Mat3x3 invInertiaA = gShapes[aIdx].m_initInvInertia;

				b3Float4 posB = gBodies[bIdx].m_pos;
				b3Float4 linVelB = gBodies[bIdx].m_linVel;
				b3Float4 angVelB = gBodies[bIdx].m_angVel;
				float invMassB = gBodies[bIdx].m_invMass;
				b3Mat3x3 invInertiaB = gShapes[bIdx].m_initInvInertia;

				b3ContactConstraint4_t cs;

    			setConstraint4( posA, linVelA, angVelA, invMassA, invInertiaA, posB, linVelB, angVelB, invMassB, invInertiaB,
					&gContact[gIdx], cdata.m_dt, cdata.m_positionDrift, cdata.m_positionConstraintCoeff,
					&cs );
		
				cs.m_batchIdx = gContact[gIdx].m_batchIdx;

				gConstraintOut[gIdx] = (b3GpuConstraint4&)cs;
			}

			contactCOut->copyFromHost(gConstraintOut);

		} else
		{
			B3_PROFILE("gpu m_contactToConstraintKernel");

		
			b3BufferInfoCL bInfo[] = { b3BufferInfoCL( contactsIn->getBufferCL() ), b3BufferInfoCL( bodyBuf->getBufferCL() ), b3BufferInfoCL( shapeBuf->getBufferCL()),
				b3BufferInfoCL( contactCOut->getBufferCL() )};
			b3LauncherCL launcher( m_queue, m_contactToConstraintKernel,"m_contactToConstraintKernel" );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
			//launcher.setConst(  cdata );
        
			launcher.setConst(cdata.m_nContacts);
			launcher.setConst(cdata.m_dt);
			launcher.setConst(cdata.m_positionDrift);
			launcher.setConst(cdata.m_positionConstraintCoeff);
        
			launcher.launch1D( nContacts, 64 );	
			clFinish(m_queue);

		}
	}

	
}

/*
void b3Solver::sortContacts(  const b3OpenCLArray<b3RigidBodyData>* bodyBuf, 
			b3OpenCLArray<b3Contact4>* contactsIn, void* additionalData, 
			int nContacts, const b3Solver::ConstraintCfg& cfg )
{
	
	
	const int sortAlignment = 512; // todo. get this out of sort
	if( cfg.m_enableParallelSolve )
	{
		

		int sortSize = NEXTMULTIPLEOF( nContacts, sortAlignment );

		b3OpenCLArray<unsigned int>* countsNative = m_numConstraints;//BufferUtils::map<TYPE_CL, false>( data->m_device, &countsHost );
		b3OpenCLArray<unsigned int>* offsetsNative = m_offsets;//BufferUtils::map<TYPE_CL, false>( data->m_device, &offsetsHost );

		{	//	2. set cell idx
			struct CB
			{
				int m_nContacts;
				int m_staticIdx;
				float m_scale;
				int m_nSplit;
			};

			b3Assert( sortSize%64 == 0 );
			CB cdata;
			cdata.m_nContacts = nContacts;
			cdata.m_staticIdx = cfg.m_staticIdx;
			cdata.m_scale = 1.f/(N_OBJ_PER_SPLIT*cfg.m_averageExtent);
			cdata.m_nSplit = B3_SOLVER_N_SPLIT;

			
			b3BufferInfoCL bInfo[] = { b3BufferInfoCL( contactsIn->getBufferCL() ), b3BufferInfoCL( bodyBuf->getBufferCL() ), b3BufferInfoCL( m_sortDataBuffer->getBufferCL() ) };
			b3LauncherCL launcher( m_queue, m_setSortDataKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
			launcher.setConst(  cdata );
			launcher.launch1D( sortSize, 64 );
		}

		{	//	3. sort by cell idx
			int n = B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT;
			int sortBit = 32;
			//if( n <= 0xffff ) sortBit = 16;
			//if( n <= 0xff ) sortBit = 8;
			m_sort32->execute(*m_sortDataBuffer,sortSize);
		}
		{	//	4. find entries
			m_search->execute( *m_sortDataBuffer, nContacts, *countsNative, B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT, b3BoundSearchCL::COUNT);

			m_scan->execute( *countsNative, *offsetsNative, B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT );
		}

		{	//	5. sort constraints by cellIdx
			//	todo. preallocate this
//			b3Assert( contactsIn->getType() == TYPE_HOST );
//			b3OpenCLArray<b3Contact4>* out = BufferUtils::map<TYPE_CL, false>( data->m_device, contactsIn );	//	copying contacts to this buffer

			{
				

				b3Int4 cdata; cdata.x = nContacts;
				b3BufferInfoCL bInfo[] = { b3BufferInfoCL( contactsIn->getBufferCL() ), b3BufferInfoCL( m_contactBuffer->getBufferCL() ), b3BufferInfoCL( m_sortDataBuffer->getBufferCL() ) };
				b3LauncherCL launcher( m_queue, m_reorderContactKernel );
				launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
				launcher.setConst(  cdata );
				launcher.launch1D( nContacts, 64 );
			}
//			BufferUtils::unmap<true>( out, contactsIn, nContacts );
		}
	}

	
}

*/
void	b3Solver::batchContacts(  b3OpenCLArray<b3Contact4>* contacts, int nContacts, b3OpenCLArray<unsigned int>* nNative, b3OpenCLArray<unsigned int>* offsetsNative, int staticIdx )
{
	
	int numWorkItems = 64*B3_SOLVER_N_CELLS;
	{
		B3_PROFILE("batch generation");
		
		b3Int4 cdata;
		cdata.x = nContacts;
		cdata.y = 0;
		cdata.z = staticIdx;

		
#ifdef BATCH_DEBUG
		SolverDebugInfo* debugInfo = new  SolverDebugInfo[numWorkItems];
		adl::b3OpenCLArray<SolverDebugInfo> gpuDebugInfo(data->m_device,numWorkItems);
		memset(debugInfo,0,sizeof(SolverDebugInfo)*numWorkItems);
		gpuDebugInfo.write(debugInfo,numWorkItems);
#endif

		
#if 0
		b3BufferInfoCL bInfo[] = { 
			b3BufferInfoCL( contacts->getBufferCL() ), 
			b3BufferInfoCL(  m_contactBuffer2->getBufferCL()),
			b3BufferInfoCL( nNative->getBufferCL() ), 
			b3BufferInfoCL( offsetsNative->getBufferCL() ),
#ifdef BATCH_DEBUG
			,	b3BufferInfoCL(&gpuDebugInfo)
#endif
		};
#endif
		
		
		{
			m_batchSizes.resize(nNative->size());
			B3_PROFILE("batchingKernel");
			//b3LauncherCL launcher( m_queue, m_batchingKernel);
			cl_kernel k = useNewBatchingKernel ? m_batchingKernelNew : m_batchingKernel;

			b3LauncherCL launcher( m_queue, k,"*batchingKernel");
			if (!useNewBatchingKernel )
			{
				launcher.setBuffer( contacts->getBufferCL() );
			}
			launcher.setBuffer( m_contactBuffer2->getBufferCL() );
			launcher.setBuffer( nNative->getBufferCL());
			launcher.setBuffer( offsetsNative->getBufferCL());
			
			launcher.setBuffer(m_batchSizes.getBufferCL());
			

			//launcher.setConst(  cdata );
            launcher.setConst(staticIdx);
            
			launcher.launch1D( numWorkItems, 64 );
			//clFinish(m_queue);
			//b3AlignedObjectArray<int> batchSizesCPU;
			//m_batchSizes.copyToHost(batchSizesCPU);
			//printf(".\n");
		}

#ifdef BATCH_DEBUG
	aaaa
		b3Contact4* hostContacts = new b3Contact4[nContacts];
		m_contactBuffer->read(hostContacts,nContacts);
		clFinish(m_queue);

		gpuDebugInfo.read(debugInfo,numWorkItems);
		clFinish(m_queue);

		for (int i=0;i<numWorkItems;i++)
		{
			if (debugInfo[i].m_valInt1>0)
			{
				printf("catch\n");
			}
			if (debugInfo[i].m_valInt2>0)
			{
				printf("catch22\n");
			}

			if (debugInfo[i].m_valInt3>0)
			{
				printf("catch666\n");
			}

			if (debugInfo[i].m_valInt4>0)
			{
				printf("catch777\n");
			}
		}
		delete[] debugInfo;
#endif //BATCH_DEBUG

	}

//	copy buffer to buffer
	//b3Assert(m_contactBuffer->size()==nContacts);
	//contacts->copyFromOpenCLArray( *m_contactBuffer);
	//clFinish(m_queue);//needed?
	
	
}