#include "StdAfx.h"
#include "../Layer0/CCryDXPS.hpp"

#ifdef CRY_DXPS_RASTERTHREAD

#include "CCryDXPSRDRasterizer.hpp"
#include "CCryDXPSRDRastMath.hpp"

#if defined(PS3) && !defined(__SPU__) && !defined(__CRYCG__)
#include <PPU/ProdConsQueue.h>
DECLARE_SPU_CLASS_JOB("DXPSRasterize", TRasterizeJob, CDXPSRDRasterizer)
typedef TRasterizeJob::packet TRasterizeJobPacket;
#define USE_SPU
namespace
{
	ILINE PROD_CONS_QUEUE_TYPE(TRasterizeJob, DXPS_RASTERIZER_JOBCOUNT) & GetProdConsQueue()
	{
		static PROD_CONS_QUEUE_TYPE(TRasterizeJob, DXPS_RASTERIZER_JOBCOUNT) g_ProdConsQueue;
		return g_ProdConsQueue;
	}
}
#endif

#define memtransfer_sync_id(id) memtransfer_sync(id)
/*#define STRINGIFY(x) #x
#define TOSTRING(x) STRINGIFY(x)
#define memtransfer_sync_id(id)({\
	SPU_FRAME_PROFILER("memtransfer_sync" TOSTRING(__LINE__))\
	memtransfer_sync(id);})
*/
extern uint32 g_ForceStopSPUs;

//#define TEST_STRIDE
#ifdef TEST_STRIDE
	#define IF_STRIDE(a) IF((a),1)
	#define WARN_UNSUPP_STRIDE printf("Unusupported stride: %d\n",VertexStridePos);snPause();
#else
	#define IF_STRIDE(a) if(1)
	#define WARN_UNSUPP_STRIDE
#endif

//index buffer cache 512byte
#define DXPSRAS_IBC_Offset					(0)
#define DXPSRAS_IBC_BlockSize				(128)
#define DXPSRAS_IBC_BlockSizeByte		(DXPSRAS_IBC_BlockSize*sizeof(uint16))
#define DXPSRAS_IBC_BlockCount			(2)	//2 for double buffering
#define DXPSRAS_IBC_Mask						(DXPSRAS_IBC_BlockSize*DXPSRAS_IBC_BlockCount-1)
#define DXPSRAS_IBC_TransID					(0)
#define DXPSRAS_IBC_End							(DXPSRAS_IBC_Offset+DXPSRAS_IBC_BlockSizeByte*DXPSRAS_IBC_BlockCount)

//vertex buffer cache: 64 blocks * 128 qwords * 16byte = 128kb
#define DXPSRAS_VBC_Offset					(DXPSRAS_IBC_End)
#define DXPSRAS_VBC_BlockBits				(7)
#define DXPSRAS_VBC_BlockSize				(1<<DXPSRAS_VBC_BlockBits)
#define DXPSRAS_VBC_BlockSizeByte		(DXPSRAS_VBC_BlockSize*sizeof(qword))
#define DXPSRAS_VBC_BlockCount			(64)
#define DXPSRAS_VBC_BlockCountMask	(DXPSRAS_VBC_BlockCount-1)
#define DXPSRAS_VBC_BlockSizeMask		(DXPSRAS_VBC_BlockSize-1)
#define DXPSRAS_VBC_BlockMask				(DXPSRAS_VBC_BlockSize*DXPSRAS_VBC_BlockCount-1)
#define DXPSRAS_VBC_End							(DXPSRAS_VBC_Offset+DXPSRAS_VBC_BlockSizeByte*DXPSRAS_VBC_BlockCount)

//Triangle Zbuffer cache and rasterization constants
#define DXPSRAS_ZB_Offset						(DXPSRAS_VBT_End)
#define DXPSRAS_ZB_TransID					(6)
#define DXPSRAS_ZB_BlockCountBitsX	(4)
#define DXPSRAS_ZB_BlockCountBitsY	(4)
#define DXPSRAS_ZB_BlockCountX			(1<<DXPSRAS_ZB_BlockCountBitsX)
#define DXPSRAS_ZB_BlockCountY			(1<<DXPSRAS_ZB_BlockCountBitsY)
#define DXPSRAS_ZB_BlockCountMaskX	(DXPSRAS_ZB_BlockCountX-1)
#define DXPSRAS_ZB_BlockCountMaskY	(DXPSRAS_ZB_BlockCountY-1)
#define DXPSRAS_ZB_BlockCount				(DXPSRAS_ZB_BlockCountX*DXPSRAS_ZB_BlockCountY)
#define DXPSRAS_ZB_BlockSizeBitsX		(3)
#define DXPSRAS_ZB_BlockSizeBitsY		(3)
#define DXPSRAS_ZB_BlockSizeX				(1<<DXPSRAS_ZB_BlockSizeBitsX)//8
#define DXPSRAS_ZB_BlockSizeY				(1<<DXPSRAS_ZB_BlockSizeBitsY)
#define DXPSRAS_ZB_BlockSizeMaskX		(DXPSRAS_ZB_BlockSizeX-1)
#define DXPSRAS_ZB_BlockSizeMaskY		(DXPSRAS_ZB_BlockSizeY-1)
#define DXPSRAS_ZB_BlockSize				(DXPSRAS_ZB_BlockSizeX*DXPSRAS_ZB_BlockSizeY)
#define DXPSRAS_ZB_BlockSizeByte		(DXPSRAS_ZB_BlockSize*sizeof(tdDXPSRDepth))
#define DXPSRAS_ZB_BufferOffset			(DXPSRAS_ZB_Offset+DXPSRAS_ZB_BlockSizeByte*DXPSRAS_ZB_BlockCount)
#define DXPSRAS_ZB_End							(DXPSRAS_ZB_BufferOffset+DXPSRAS_ZB_BlockSizeByte*2)//*2 for doublebuffering to get prefetching

//used for Clear and framebuffer copy
#define DXPSRAS_BlockSize						(512*8/4)														//div by 4 cause of 4float per quad
#define DXPSRAS_BlockSizeByte				(DXPSRAS_BlockSize*sizeof(qword))		//mul by 8 cause of 8x8 FB-tile-size
#define DXPSRAS_BlockCount					(4)

//vertex buffer temporary cache, 4*2kb
#define DXPSRAS_VBT_Offset					(DXPSRAS_VBC_End)
#define DXPSRAS_VBT_TransID					(2)
#define DXPSRAS_VBT_BlockSizeByte		(2048)//2kb transfer size
#define DXPSRAS_VBT_BlockCount			(4)
#define DXPSRAS_VBT_End							(DXPSRAS_VBT_Offset+DXPSRAS_VBT_BlockSizeByte*DXPSRAS_VBT_BlockCount)

#define DXPSRAS_DCBufferSize				(DXPS_RASTERIZER_DRAWCALLCOUNT+128/sizeof(void*)/*safety area for dma alignment*/)

#define BYPASS_CACHE

#if defined(DXPSR_PROFILE_TIMINGS) || !defined(BYPASS_CACHE)
	#define DXPSRAS_BufferSize					(201*1024)
#else
	#define DXPSRAS_BufferSize					((201+8)*1024)
#endif

#if !defined(__SPU__)
// Blocks the caller until the rasterize job queue reports that it has
// finished. Does nothing in builds without USE_SPU.
void CDXPSRDRasterizer::Finish()
{
#ifdef USE_SPU
	GetProdConsQueue().WaitFinished();
#endif
}

// Wraps m_Job into a job packet bound to this instance, pushes it onto the
// rasterize queue and then waits for the queue to finish.
// Does nothing in builds without USE_SPU.
void CDXPSRDRasterizer::Notify()
{
#ifdef USE_SPU
	// The cache mode is a compile-time choice: no cache only when the cache is
	// bypassed and timing profiling is off, otherwise eCM_4.
#if !defined(DXPSR_PROFILE_TIMINGS) && defined(BYPASS_CACHE)
	const NPPU::ECacheMode cacheMode = NPPU::eCM_None;
#else
	const NPPU::ECacheMode cacheMode = NPPU::eCM_4;
#endif

	CDXPSRJob* pRasterJob	=	&m_Job;
	TRasterizeJobPacket packet( pRasterJob );
	packet.SetClassInstance( *this );

	GetProdConsQueue().AddPacket( packet, CACHE_MIN_STACK_SIZE, cacheMode);

	// The submission is waited on immediately, so this call only returns once
	// the queue has finished.
	GetProdConsQueue().WaitFinished();
#endif
}

// Zero-initializes the whole rasterizer object and prints the SPU local-store
// budget derived from the DXPSRAS_* layout macros.
// NOTE(review): zero-filling the object assumes the class has no virtual
// functions and no members that rely on a non-trivial constructor -- confirm
// against the class declaration.
CDXPSRDRasterizer::CDXPSRDRasterizer()
//:m_CMDBPut(0)
{
	// sizeof(*this) is the size of the object; the previous sizeof(this) was
	// only the size of a pointer and left nearly all members uninitialized.
	memset(this,0,sizeof(*this));
	printf("Rasterizer: used/avail SPU MEM:%d/%d kb\n",DXPSRAS_ZB_End>>10,DXPSRAS_BufferSize>>10);
	printf("VBC %d kb   ZB %d kb   VBT %d kb\n",(DXPSRAS_VBC_End-DXPSRAS_VBC_Offset)>>10,(DXPSRAS_ZB_End-DXPSRAS_ZB_Offset)>>10,(DXPSRAS_VBT_End-DXPSRAS_VBT_Offset)>>10);
}

#else//__SPU__

// Pointer table sized for DXPS_RASTERIZER_DRAWCALLCOUNT entries plus padding for DMA alignment.
// NOTE(review): presumably holds the draw call list fetched from main memory - its use is outside this part of the file.
void* DXPSRAS_DCBuffer[DXPSRAS_DCBufferSize] _ALIGN(128)	SPU_LOCAL;
// Scratch memory shared by all caches; it is partitioned by the DXPSRAS_*_Offset macros
// into index cache (IBC), vertex cache (VBC), vertex transfer buffers (VBT) and z-buffer cache (ZB).
uint8 DXPSRAS_Buffer[DXPSRAS_BufferSize] _ALIGN(128)	SPU_LOCAL;
// Base address of the z-buffer in main memory; tiles are transferred relative to it.
SPU_LOCAL tdDXPSRDepth*				g_pZBuffer _ALIGN(16);

// Per vertex-cache block: the vertex block number (Idx>>DXPSRAS_VBC_BlockBits) currently stored there, ~0u when empty.
SPU_LOCAL	uint32 DXPSRAS_VBC_CacheEntry[DXPSRAS_VBC_BlockCount] _ALIGN(16);

// Per z-buffer cache slot: the tile index (see ZBCacheBank) currently stored there, all-ones when empty.
SPU_LOCAL			uint16				DXPSRAS_ZB_CacheEntry[DXPSRAS_ZB_BlockCount]	_ALIGN(16);
// Tile index currently held by each of the two prefetch staging buffers, all-ones when none.
SPU_LOCAL			uint16				DXPSRAS_ZB_CachePrefetched[2]									_ALIGN(16);

#if defined(DXPSR_PROFILE_DETAILED)
SPU_LOCAL uint32 DXPSRAS_GLOBAL_PERFCOUNTER_F0;
SPU_LOCAL uint32 DXPSRAS_GLOBAL_PERFCOUNTER_F1;
SPU_LOCAL uint32 DXPSRAS_GLOBAL_PERFCOUNTER_C0;
SPU_LOCAL uint32 DXPSRAS_GLOBAL_PERFCOUNTER_C1;
SPU_LOCAL uint32 DXPSRAS_GLOBAL_PERFCOUNTER_S0;
SPU_LOCAL uint32 DXPSRAS_GLOBAL_PERFCOUNTER_S1;
SPU_LOCAL uint32 DXPSRAS_GLOBAL_PERFCOUNTER_W0;
SPU_LOCAL uint32 DXPSRAS_GLOBAL_PERFCOUNTER_W1;
#endif


#ifdef DXPSR_PROFILE_TIMINGS
	SPU_LOCAL SDXPSRasStats g_Stats;
#endif

// Compile-time recursion that invalidates the first Count tags of the vertex
// cache: entries 0..Count-1 are set to ~0u, lowest index first.
template<uint32 Count>
struct CDXPSClearVBCache
{
	RILNE static void Clear()
	{
		// handle all entries below the last one first ...
		CDXPSClearVBCache<Count-1>::Clear();
		// ... then invalidate the last one
		DXPSRAS_VBC_CacheEntry[Count-1]=~0u;
	}
};

// Recursion terminator: nothing left to invalidate.
template<>
struct CDXPSClearVBCache<0>
{
	RILNE static void Clear(){}
};

// Invalidates every block tag of the vertex cache.
RILNE void VBCacheClear()
{
	typedef CDXPSClearVBCache<DXPSRAS_VBC_BlockCount> TClearAllBlocks;
	TClearAllBlocks::Clear();
}

// Maps a vertex index to the vertex-cache block that can hold it:
// the vertex block number wrapped to the number of cache blocks.
RILNE uint32 VBCacheID(uint32 Idx)
{
	const uint32 VertexBlock	=	Idx>>DXPSRAS_VBC_BlockBits;
	return VertexBlock&DXPSRAS_VBC_BlockCountMask;
}
// Returns true when the cache block responsible for Idx currently holds the
// vertex block that Idx belongs to.
RILNE bool VBInCache(uint32 Idx)
{
	const uint32 WantedTag	=	Idx>>DXPSRAS_VBC_BlockBits;
	const uint32 StoredTag	=	DXPSRAS_VBC_CacheEntry[VBCacheID(Idx)];
	return StoredTag==WantedTag;
}
// Records that the cache block responsible for Idx now holds the vertex block
// that Idx belongs to. Only the tag is written, not the vertex data.
RILNE VOID VBUpdateCache(uint32 Idx)
{
	const uint32 NewTag	=	Idx>>DXPSRAS_VBC_BlockBits;
	DXPSRAS_VBC_CacheEntry[VBCacheID(Idx)]	=	NewTag;
}

// Fetches the block of DXPSRAS_VBC_BlockSize vertices that contains vertex Idx from
// main memory and writes one transformed qword per vertex to pDst.
//
// VertexStridePos	size of one source vertex in byte; the position is read as three
//									floats at the start of each vertex
// pDst						receives DXPSRAS_VBC_BlockSize results
// pMat						four qwords M0..M3; each result is M0*x + M1*y + M2*z + M3
//									(NOTE(review): assumes V_Madd(a,b,c) computes a*b+c - confirm)
// pVtxBufferPos		start of the vertex buffer in main memory
// Idx							any vertex index inside the wanted block; it is rounded down to the
//									block start
//
// The source is pulled in through up to four transfers of DXPSRAS_VBT_BlockSizeByte each
// into the VBT area of DXPSRAS_Buffer. All transfers are started first; afterwards each
// one is waited for right before its vertices are consumed.
template<uint32 VertexStridePos>
RILNE void CDXPSRDRasterizer::VertexAssembly(	 				qword* __restrict	pDst,
																								const	qword* __restrict	pMat,
																								const	void*	 __restrict	pVtxBufferPos,
																											uint32						Idx)
{
	SWIZZLEMASK1

	DXPSR_PROFILE_VASTALL(g_Stats.m_VBVertexAssembly)
	const	uint8* __restrict	pSrc	=	reinterpret_cast<const uint8*>(pVtxBufferPos);
	uint8*	__restrict	pTmp	=	reinterpret_cast<uint8*>(DXPSRAS_Buffer+DXPSRAS_VBT_Offset);

	//advance to the first vertex of the block, then round the address down to 16 byte;
	//Offset keeps the number of byte that were rounded away
	pSrc	+=	(Idx&~DXPSRAS_VBC_BlockSizeMask)*VertexStridePos;
	uint32 Offset	=	reinterpret_cast<const size_t>(pSrc)&15;
	pSrc	-=	Offset;

	//start one transfer per 16 byte of vertex stride (rounded up), each with its own sync id;
	//the cases fall through on purpose so that case N issues N transfers
	uint32 SyncID	=	DXPSRAS_VBT_TransID;
	switch((VertexStridePos+15)>>4)	//   div by sizeof(qword)
	{
		//shouldn't have a vertex size which is bigger than 64byte
		case	4:		memtransfer_from_main(pTmp,pSrc,DXPSRAS_VBT_BlockSizeByte,SyncID++);
								pSrc+=DXPSRAS_VBT_BlockSizeByte;
								pTmp+=DXPSRAS_VBT_BlockSizeByte;
		case	3:		memtransfer_from_main(pTmp,pSrc,DXPSRAS_VBT_BlockSizeByte,SyncID++);
								pSrc+=DXPSRAS_VBT_BlockSizeByte;
								pTmp+=DXPSRAS_VBT_BlockSizeByte;
		case	2:		memtransfer_from_main(pTmp,pSrc,DXPSRAS_VBT_BlockSizeByte,SyncID++);
								pSrc+=DXPSRAS_VBT_BlockSizeByte;
								pTmp+=DXPSRAS_VBT_BlockSizeByte;
		case	1:		memtransfer_from_main(pTmp,pSrc,DXPSRAS_VBT_BlockSizeByte,SyncID++);
		break;
	}


	//rewind to the first vertex inside the transfer area and to the first sync id
	pTmp	=	reinterpret_cast<uint8*>(DXPSRAS_Buffer+DXPSRAS_VBT_Offset)+Offset;
	SyncID=	DXPSRAS_VBT_TransID;

	register const qword	M0	=	pMat[0];
	register const qword	M1	=	pMat[1];
	register const qword	M2	=	pMat[2];
	register const qword	M3	=	pMat[3];

	if(VertexStridePos==24)
	{
		//24 byte stride: four vertices span exactly six qwords, so they are read with
		//whole-qword loads and the components are picked out by swizzles.
		//NOTE(review): the lane layout in the comments below assumes pTmp is 16 byte
		//aligned, i.e. Offset==0 - confirm that the vertex buffer start is always aligned


		//from here on Offset is the byte position of the last vertex of the group of four
		//that is processed next
		Offset+=VertexStridePos*3;
		for(uint32 a=0,BlockEnd	=	0;a<DXPSRAS_VBC_BlockSize;)
		{
			{
				//wait for the next transfer before touching its data
				DXPSR_PROFILE_VASTALL(g_Stats.m_VBVertexAssemblyStall)
				memtransfer_sync_id(SyncID++);
			}
			BlockEnd	+=	DXPSRAS_VBT_BlockSizeByte;
			do
			{
				const qword T0	=	*reinterpret_cast<qword*>(pTmp);		//x0 y0 z0  0
				const qword T1	=	*reinterpret_cast<qword*>(pTmp+16);	// 0  0 x1 y1
				const qword T2	=	*reinterpret_cast<qword*>(pTmp+32);	//z1  0 0   0
				const qword T3	=	*reinterpret_cast<qword*>(pTmp+48);	//x2 y2 z2  0
				const qword T4	=	*reinterpret_cast<qword*>(pTmp+64);	// 0  0 x3 y3
				const qword T5	=	*reinterpret_cast<qword*>(pTmp+80);	//z3  0 0   0
				pTmp	+=	96;	//4*VStride

				qword qX0		=	V_Swizzle(T0,xxxx);
				qword qY0		=	V_Swizzle(T0,yyyy);
				qword qZ0		=	V_Swizzle(T0,zzzz);
				qword qX1		=	V_Swizzle(T1,zzzz);
				qword qY1		=	V_Swizzle(T1,wwww);
				qword qZ1		=	V_Swizzle(T2,xxxx);
				qword qX2		=	V_Swizzle(T3,xxxx);
				qword qY2		=	V_Swizzle(T3,yyyy);
				qword qZ2		=	V_Swizzle(T3,zzzz);
				qword qX3		=	V_Swizzle(T4,zzzz);
				qword qY3		=	V_Swizzle(T4,wwww);
				qword qZ3		=	V_Swizzle(T5,xxxx);
				*pDst++	=	V_Madd(M0,qX0,V_Madd(M1,qY0,V_Madd(M2,qZ0,M3)));
				*pDst++	=	V_Madd(M0,qX1,V_Madd(M1,qY1,V_Madd(M2,qZ1,M3)));
				*pDst++	=	V_Madd(M0,qX2,V_Madd(M1,qY2,V_Madd(M2,qZ2,M3)));
				*pDst++	=	V_Madd(M0,qX3,V_Madd(M1,qY3,V_Madd(M2,qZ3,M3)));

				Offset	+=	VertexStridePos*4;
				a+=4;
			//keep going while vertices are left and the last vertex of the next group still
			//starts inside the data that has been waited for
			}WHILE(a<DXPSRAS_VBC_BlockSize && Offset<BlockEnd,1);
		}

	}
	else
	{
		//generic stride: read x,y,z of four vertices as scalars and splat them
		Offset+=VertexStridePos*3;
		for(uint32 a=0,BlockEnd	=	0;a<DXPSRAS_VBC_BlockSize;)
		{
			{
				//wait for the next transfer before touching its data
				DXPSR_PROFILE_VASTALL(g_Stats.m_VBVertexAssemblyStall)
				memtransfer_sync_id(SyncID++);
			}
			BlockEnd	+=	DXPSRAS_VBT_BlockSizeByte;
			do
			{
				const float X0	=	*reinterpret_cast<float*>(pTmp);
				const float Y0	=	*reinterpret_cast<float*>(pTmp+4);
				const float Z0	=	*reinterpret_cast<float*>(pTmp+8);
				pTmp	+=	VertexStridePos;
				const float X1	=	*reinterpret_cast<float*>(pTmp);
				const float Y1	=	*reinterpret_cast<float*>(pTmp+4);
				const float Z1	=	*reinterpret_cast<float*>(pTmp+8);
				pTmp	+=	VertexStridePos;
				const float X2	=	*reinterpret_cast<float*>(pTmp);
				const float Y2	=	*reinterpret_cast<float*>(pTmp+4);
				const float Z2	=	*reinterpret_cast<float*>(pTmp+8);
				pTmp	+=	VertexStridePos;
				const float X3	=	*reinterpret_cast<float*>(pTmp);
				const float Y3	=	*reinterpret_cast<float*>(pTmp+4);
				const float Z3	=	*reinterpret_cast<float*>(pTmp+8);
				pTmp	+=	VertexStridePos;
				qword qX0		=	V_F32Splat(X0);
				qword qY0		=	V_F32Splat(Y0);
				qword qZ0		=	V_F32Splat(Z0);
				qword qX1		=	V_F32Splat(X1);
				qword qY1		=	V_F32Splat(Y1);
				qword qZ1		=	V_F32Splat(Z1);
				qword qX2		=	V_F32Splat(X2);
				qword qY2		=	V_F32Splat(Y2);
				qword qZ2		=	V_F32Splat(Z2);
				qword qX3		=	V_F32Splat(X3);
				qword qY3		=	V_F32Splat(Y3);
				qword qZ3		=	V_F32Splat(Z3);
				*pDst++	=	V_Madd(M0,qX0,V_Madd(M1,qY0,V_Madd(M2,qZ0,M3)));
				*pDst++	=	V_Madd(M0,qX1,V_Madd(M1,qY1,V_Madd(M2,qZ1,M3)));
				*pDst++	=	V_Madd(M0,qX2,V_Madd(M1,qY2,V_Madd(M2,qZ2,M3)));
				*pDst++	=	V_Madd(M0,qX3,V_Madd(M1,qY3,V_Madd(M2,qZ3,M3)));

				Offset	+=	VertexStridePos*4;
				a+=4;
			}WHILE(a<DXPSRAS_VBC_BlockSize && Offset<BlockEnd,1);
		}
	}
}

RILNE void RasterizeTriangle(qword V0,qword V1,qword V2);

// Rasterizes an indexed triangle list whose vertices all fit into the vertex cache at once:
// first all VertexCount vertices are transformed into the vertex cache, then the indices
// are streamed in and every triangle is looked up directly by index (no cache tag checks).
//
// rViewMat				transformation applied to every vertex
// pIndices				16 bit index list in main memory
// IndexCount			number of indices
// pVtxBufferPos		vertex buffer in main memory (only read when Quickpath is false)
// VertexStridePos	not read in this function; the non-quick path always assembles with a
//									stride of 24
// VertexCount			number of vertices to transform
// Quickpath				true: the vertex cache is transformed in place, four components per
//									vertex (NOTE(review): presumably the caller has already placed the
//									untransformed vertices there - nothing in this function fills it)
//									false: the vertices are fetched and transformed block by block through
//									VertexAssembly<24>
RILNE void CDXPSRDRasterizer::Rasterize24FullCached(SPU_DOMAIN_LOCAL const	Matrix44A& rViewMat,SPU_DOMAIN_MAIN const	uint16* pIndices,uint32 IndexCount,SPU_DOMAIN_MAIN const	void* pVtxBufferPos,uint16 VertexStridePos,uint32 VertexCount,bool Quickpath)
{
	SWIZZLEMASK1

	Matrix44A ViewMat	=	rViewMat;

	const uint16*	pIBufferSrc	=	pIndices;
	uint16*	pIBufferDst	=	reinterpret_cast<uint16*>(DXPSRAS_Buffer+DXPSRAS_IBC_Offset);
	//not used below
	qword*	pVBufferDst	=	reinterpret_cast<qword*>(DXPSRAS_Buffer+DXPSRAS_VBC_Offset);
	
	//to assure aligned dma transfers: move the source back to a 16 byte boundary;
	//a is the number of indices skipped that way and becomes the first index to process
	uint32 a	=	(reinterpret_cast<const size_t>(pIBufferSrc)&15)/2;
	pIBufferSrc	-=	a;
	IndexCount	+=	a;

	//start fetching the first two index blocks, one into each half of the index cache
	memtransfer_from_main(pIBufferDst,											pIBufferSrc,DXPSRAS_IBC_BlockSizeByte,DXPSRAS_IBC_TransID);	pIBufferSrc+=DXPSRAS_IBC_BlockSize;
	memtransfer_from_main(pIBufferDst+DXPSRAS_IBC_BlockSize,pIBufferSrc,DXPSRAS_IBC_BlockSizeByte,DXPSRAS_IBC_TransID^1);pIBufferSrc+=DXPSRAS_IBC_BlockSize;

	uint32	BlockEnd	=	DXPSRAS_IBC_BlockSize;
	uint32	SyncID		=	DXPSRAS_IBC_TransID;
	qword* __restrict	pVBCache	=	reinterpret_cast<qword*>(DXPSRAS_Buffer+DXPSRAS_VBC_Offset);


#ifdef DXPSR_PROFILE_TIMINGS
	asm volatile("nop");
	uint32 T0	= rdtsc();
	asm volatile("nop");
#endif
	if(Quickpath)
	{
		qword* pMat	=	reinterpret_cast<qword*>(&ViewMat);
		register const qword	M0	=	pMat[0];
		register const qword	M1	=	pMat[1];
		register const qword	M2	=	pMat[2];
		register const qword	M3	=	pMat[3];

		//in-place transform, eight vertices per iteration; this a is local to the block and
		//hides the index counter declared above. The loop body runs at least once and
		//VertexCount is effectively rounded up to a multiple of eight
		qword* __restrict	pDst	=	pVBCache;
		uint32 a=0;
		do
		{
				qword& T0	=	pDst[a];
				qword& T1	=	pDst[a+1];
				qword& T2	=	pDst[a+2];
				qword& T3	=	pDst[a+3];
				qword& T4	=	pDst[a+4];
				qword& T5	=	pDst[a+5];
				qword& T6	=	pDst[a+6];
				qword& T7	=	pDst[a+7];
				a+=8;

				qword qX0		=	V_Swizzle(T0,xxxx);
				qword qY0		=	V_Swizzle(T0,yyyy);
				qword qZ0		=	V_Swizzle(T0,zzzz);
				qword qW0		=	V_Swizzle(T0,wwww);
				qword qX1		=	V_Swizzle(T1,xxxx);
				qword qY1		=	V_Swizzle(T1,yyyy);
				qword qZ1		=	V_Swizzle(T1,zzzz);
				qword qW1		=	V_Swizzle(T1,wwww);
				qword qX2		=	V_Swizzle(T2,xxxx);
				qword qY2		=	V_Swizzle(T2,yyyy);
				qword qZ2		=	V_Swizzle(T2,zzzz);
				qword qW2		=	V_Swizzle(T2,wwww);
				qword qX3		=	V_Swizzle(T3,xxxx);
				qword qY3		=	V_Swizzle(T3,yyyy);
				qword qZ3		=	V_Swizzle(T3,zzzz);
				qword qW3		=	V_Swizzle(T3,wwww);
				qword qX4		=	V_Swizzle(T4,xxxx);
				qword qY4		=	V_Swizzle(T4,yyyy);
				qword qZ4		=	V_Swizzle(T4,zzzz);
				qword qW4		=	V_Swizzle(T4,wwww);
				qword qX5		=	V_Swizzle(T5,xxxx);
				qword qY5		=	V_Swizzle(T5,yyyy);
				qword qZ5		=	V_Swizzle(T5,zzzz);
				qword qW5		=	V_Swizzle(T5,wwww);
				qword qX6		=	V_Swizzle(T6,xxxx);
				qword qY6		=	V_Swizzle(T6,yyyy);
				qword qZ6		=	V_Swizzle(T6,zzzz);
				qword qW6		=	V_Swizzle(T6,wwww);
				qword qX7		=	V_Swizzle(T7,xxxx);
				qword qY7		=	V_Swizzle(T7,yyyy);
				qword qZ7		=	V_Swizzle(T7,zzzz);
				qword qW7		=	V_Swizzle(T7,wwww);
				//unlike VertexAssembly the w component of the input scales M3 here
				T0	=	V_Madd(M0,qX0,V_Madd(M1,qY0,V_Madd(M2,qZ0,V_Mul(qW0,M3))));
				T1	=	V_Madd(M0,qX1,V_Madd(M1,qY1,V_Madd(M2,qZ1,V_Mul(qW1,M3))));
				T2	=	V_Madd(M0,qX2,V_Madd(M1,qY2,V_Madd(M2,qZ2,V_Mul(qW2,M3))));
				T3	=	V_Madd(M0,qX3,V_Madd(M1,qY3,V_Madd(M2,qZ3,V_Mul(qW3,M3))));
				T4	=	V_Madd(M0,qX4,V_Madd(M1,qY4,V_Madd(M2,qZ4,V_Mul(qW4,M3))));
				T5	=	V_Madd(M0,qX5,V_Madd(M1,qY5,V_Madd(M2,qZ5,V_Mul(qW5,M3))));
				T6	=	V_Madd(M0,qX6,V_Madd(M1,qY6,V_Madd(M2,qZ6,V_Mul(qW6,M3))));
				T7	=	V_Madd(M0,qX7,V_Madd(M1,qY7,V_Madd(M2,qZ7,V_Mul(qW7,M3))));
		}WHILE(a<VertexCount,1);
	}
	else
	{
		//fetch and transform the vertices one cache block at a time
		for(uint32 a=0;a<VertexCount;a+=DXPSRAS_VBC_BlockSize)
			VertexAssembly<24>(	&pVBCache[VBCacheID(a)<<DXPSRAS_VBC_BlockBits],
													reinterpret_cast<qword*>(&ViewMat),
													pVtxBufferPos,a);
	}
#ifdef DXPSR_PROFILE_TIMINGS
	asm volatile("nop");
	uint32 T1	= rdtsc();
	asm volatile("nop");
	//start minus end; NOTE(review): presumably rdtsc() counts downwards here - confirm
	g_Stats.m_TimeRasVertex	+=	T0-T1;
#endif


	//wait for the first index block
	memtransfer_sync_id(SyncID);

	//NOTE(review): when the inner triangle loop below stops at a block boundary, the a+=3
	//of this outer loop runs as well, so the triangle starting at that a does not appear to
	//be rasterized - confirm whether that is intended
	for(;a<IndexCount;a+=3)
	{
#ifdef DXPSR_PROFILE_TIMINGS
			asm volatile("nop");
			uint32 T0	= rdtsc();
			asm volatile("nop");
#endif
			//if we'll hit the next transfer block
			//NOTE(review): BlockEnd is only clamped to IndexCount inside this branch, so for an
			//index list shorter than one block the inner loop appears to run up to the end of
			//the first block - confirm
			IF(a+2>=BlockEnd,0)
			{
				//sync to it
				memtransfer_sync_id(SyncID^1);

				BlockEnd	+=	DXPSRAS_IBC_BlockSize;

				//if we left the last block and there is still some block left, start its transfer
				IF(BlockEnd<IndexCount,1)
				{
					memtransfer_from_main(pIBufferDst+(SyncID&1)*DXPSRAS_IBC_BlockSize,pIBufferSrc,DXPSRAS_IBC_BlockSizeByte,SyncID);
					pIBufferSrc+=DXPSRAS_IBC_BlockSize;
					SyncID^=1;
				}
				else
					BlockEnd	=	IndexCount;
			}

#ifdef DXPSR_PROFILE_TIMINGS
			asm volatile("nop");
			uint32 T1	= rdtsc();
			asm volatile("nop");
#endif
			//rasterize all triangles that lie completely inside the data available so far
			for(;a+2<BlockEnd;a+=3)
			{
				//the index cache is addressed as a ring over both halves
				uint16 Idx0	=	pIBufferDst[a&DXPSRAS_IBC_Mask];
				uint16 Idx1	=	pIBufferDst[(a+1)&DXPSRAS_IBC_Mask];
				uint16 Idx2	=	pIBufferDst[(a+2)&DXPSRAS_IBC_Mask];
				//all vertices were transformed above, so the index addresses the cache directly
				qword V0	=	pVBCache[Idx0];
				qword V1	=	pVBCache[Idx1];
				qword V2	=	pVBCache[Idx2];
				DXPSR_PROFILE_VBCACHE(g_Stats.m_VBCacheHit);
				DXPSR_PROFILE_VBCACHE(g_Stats.m_VBCacheHit);
				DXPSR_PROFILE_VBCACHE(g_Stats.m_VBCacheHit);

				RasterizeTriangle(	V0,V1,V2);
			}
#ifdef DXPSR_PROFILE_TIMINGS
			asm volatile("nop");
			uint32 T2	= rdtsc();
			asm volatile("nop");
			g_Stats.m_TimeRasVertex	+=	T0-T1;
			g_Stats.m_TimeRasTriangle	+=	T1-T2;
#endif
	}
}



// Rasterizes an indexed triangle list of arbitrary size. The indices are streamed through
// the double-buffered index cache; vertices are fetched and transformed on demand, one
// block of DXPSRAS_VBC_BlockSize vertices at a time, into the tagged vertex cache.
//
// rViewMat				transformation applied to every vertex
// pIndices				16 bit index list in main memory
// IndexCount			number of indices
// pVtxBufferPos		vertex buffer in main memory
// VertexStridePos	vertex stride in byte; it is only checked when TEST_STRIDE is defined,
//									otherwise vertices are always assembled with a stride of 24
RILNE void CDXPSRDRasterizer::Rasterize(SPU_DOMAIN_LOCAL const	Matrix44A& rViewMat,SPU_DOMAIN_MAIN const	uint16* pIndices,uint32 IndexCount,SPU_DOMAIN_MAIN const	void* pVtxBufferPos,uint16 VertexStridePos)
{
	//drop whatever a previous draw call left in the vertex cache
	VBCacheClear();

	Matrix44A ViewMat	=	rViewMat;

	const uint16*	pIBufferSrc	=	pIndices;
	uint16*	pIBufferDst	=	reinterpret_cast<uint16*>(DXPSRAS_Buffer+DXPSRAS_IBC_Offset);
	//not used below
	qword*	pVBufferDst	=	reinterpret_cast<qword*>(DXPSRAS_Buffer+DXPSRAS_VBC_Offset);
	
	//to assure aligned dma transfers: move the source back to a 16 byte boundary;
	//a is the number of indices skipped that way and becomes the first index to process
	uint32 a	=	(reinterpret_cast<const size_t>(pIBufferSrc)&15)/2;
	pIBufferSrc	-=	a;
	IndexCount	+=	a;

	//start fetching the first two index blocks, one into each half of the index cache
	memtransfer_from_main(pIBufferDst,											pIBufferSrc,DXPSRAS_IBC_BlockSizeByte,DXPSRAS_IBC_TransID);	pIBufferSrc+=DXPSRAS_IBC_BlockSize;
	memtransfer_from_main(pIBufferDst+DXPSRAS_IBC_BlockSize,pIBufferSrc,DXPSRAS_IBC_BlockSizeByte,DXPSRAS_IBC_TransID^1);pIBufferSrc+=DXPSRAS_IBC_BlockSize;

	uint32	BlockEnd	=	DXPSRAS_IBC_BlockSize;
	uint32	SyncID		=	DXPSRAS_IBC_TransID;
	qword* __restrict	pVBCache	=	reinterpret_cast<qword*>(DXPSRAS_Buffer+DXPSRAS_VBC_Offset);

	//wait for the first index block
	memtransfer_sync_id(SyncID);

	//one triangle per iteration
	for(;a<IndexCount;a+=3)
	{
#ifdef DXPSR_PROFILE_TIMINGS
			asm volatile("nop");
			uint32 T0	= rdtsc();
			asm volatile("nop");
#endif
			uint16	Idx0,Idx1,Idx2;

			//if we'll hit the next transfer block
			IF(a+2>=BlockEnd,0)
			{
				//sync to it
				memtransfer_sync_id(SyncID^1);
					
				BlockEnd	+=	DXPSRAS_IBC_BlockSize;

				//if we left the last block and there is still some block left, start its transfer
				//NOTE(review): for a triangle that straddles the boundary this transfer targets
				//the half that the index reads just below still take a and a+1 from - confirm
				//that the reads always complete before the new data arrives
				IF(BlockEnd<IndexCount,1)
				{
					memtransfer_from_main(pIBufferDst+(SyncID&1)*DXPSRAS_IBC_BlockSize,pIBufferSrc,DXPSRAS_IBC_BlockSizeByte,SyncID);
					pIBufferSrc+=DXPSRAS_IBC_BlockSize;
					SyncID^=1;
				}
			}

			//the index cache is addressed as a ring over both halves
			Idx0	=	pIBufferDst[a&DXPSRAS_IBC_Mask];
			Idx1	=	pIBufferDst[(a+1)&DXPSRAS_IBC_Mask];
			Idx2	=	pIBufferDst[(a+2)&DXPSRAS_IBC_Mask];

			//first vertex: on a cache miss tag the block and assemble it, then copy the vertex out
			IF(!VBInCache(Idx0),0)
			{
				VBUpdateCache(Idx0);
				IF_STRIDE(VertexStridePos==24)
				{
					VertexAssembly<24>(	&pVBCache[VBCacheID(Idx0)<<DXPSRAS_VBC_BlockBits],
															reinterpret_cast<qword*>(&ViewMat),
															pVtxBufferPos,Idx0);
				}
				else
				{
					WARN_UNSUPP_STRIDE
				}
				DXPSR_PROFILE_VBCACHE(g_Stats.m_VBCacheMiss)
			}
			else
			{
				DXPSR_PROFILE_VBCACHE(g_Stats.m_VBCacheHit);
			}
			//V0 is copied by value before the next vertex may replace the same cache block
			qword V0	=	pVBCache[Idx0&DXPSRAS_VBC_BlockMask];
			//second vertex, same procedure
			IF(!VBInCache(Idx1),0)
			{
				VBUpdateCache(Idx1);
				IF_STRIDE(VertexStridePos==24)
				{
					VertexAssembly<24>(	&pVBCache[VBCacheID(Idx1)<<DXPSRAS_VBC_BlockBits],
															reinterpret_cast<qword*>(&ViewMat),
															pVtxBufferPos,Idx1);
				}
				else
				{
					WARN_UNSUPP_STRIDE
				}
				DXPSR_PROFILE_VBCACHE(g_Stats.m_VBCacheMiss)
			}
			else
			{
				DXPSR_PROFILE_VBCACHE(g_Stats.m_VBCacheHit);
			}
			qword V1	=	pVBCache[Idx1&DXPSRAS_VBC_BlockMask];
			//third vertex, same procedure
			IF(!VBInCache(Idx2),0)
			{
				VBUpdateCache(Idx2);
				IF_STRIDE(VertexStridePos==24)
				{
					VertexAssembly<24>(	&pVBCache[VBCacheID(Idx2)<<DXPSRAS_VBC_BlockBits],
															reinterpret_cast<qword*>(&ViewMat),
															pVtxBufferPos,Idx2);
				}
				else
				{
					WARN_UNSUPP_STRIDE
				}
				DXPSR_PROFILE_VBCACHE(g_Stats.m_VBCacheMiss)
			}
			else
			{
				DXPSR_PROFILE_VBCACHE(g_Stats.m_VBCacheHit);
			}
			qword V2	=	pVBCache[Idx2&DXPSRAS_VBC_BlockMask];
#ifdef DXPSR_PROFILE_TIMINGS
			asm volatile("nop");
			uint32 T1	= rdtsc();
			asm volatile("nop");
#endif
			RasterizeTriangle(	V0,V1,V2);
#ifdef DXPSR_PROFILE_TIMINGS
			asm volatile("nop");
			uint32 T2	= rdtsc();
			asm volatile("nop");
			//start minus end; NOTE(review): presumably rdtsc() counts downwards here - confirm
			g_Stats.m_TimeRasVertex	+=	T0-T1;
			g_Stats.m_TimeRasTriangle	+=	T1-T2;
#endif
	}
}

// Marks both prefetch staging buffers and every z-buffer cache slot as empty
// by storing the all-ones tag.
RILNE void ZBCacheClear()
{
	const uint16 EmptyTag	=	(uint16)~0;

	DXPSRAS_ZB_CachePrefetched[1]	=	EmptyTag;
	DXPSRAS_ZB_CachePrefetched[0]	=	EmptyTag;

	//four slots per iteration
	for(uint32 First=0;First<DXPSRAS_ZB_BlockCount;First+=4)
	{
		uint16* pSlots	=	&DXPSRAS_ZB_CacheEntry[First];
		pSlots[3]	=	EmptyTag;
		pSlots[2]	=	EmptyTag;
		pSlots[1]	=	EmptyTag;
		pSlots[0]	=	EmptyTag;
	}
}

// Maps tile coordinates to a z-buffer cache slot: both coordinates are wrapped
// to the cache dimensions and combined row-major (y selects the row).
RILNE uint32 ZBCacheID(uint32 x,uint32 y)
{
	const uint32 Column	=	x&DXPSRAS_ZB_BlockCountMaskX;
	const uint32 Row		=	y&DXPSRAS_ZB_BlockCountMaskY;
	return (Row<<DXPSRAS_ZB_BlockCountBitsX)|Column;
}

// Returns the tile index of tile (x,y): y times the number of tiles per row,
// OR-ed with x. The result is in tiles, not scaled by the tile size.
RILNE uint32 ZBCacheBank(uint32 x,uint32 y)
{
	const uint32 TilesPerRow	=	DXPS_RASTERIZER_RESOLUTION>>DXPSRAS_ZB_BlockSizeBitsX;
	const uint32 RowStart			=	y*TilesPerRow;
	return x|RowStart;
}

// Looks up tile (x,y) in the z-buffer cache.
//
// pOut			always receives the cache slot assigned to the tile
// pIn			receives the same slot on a hit; on a miss it receives the staging buffer
//					into which the tile is being fetched from main memory
// NeedSync	set to true on a miss, i.e. when transfers with DXPSRAS_ZB_TransID were started
//
// On a miss the previous content of the slot is written back to main memory unless the
// slot was empty, and the slot is tagged with the new tile.
RILNE  void ZBCacheTile(uint32 x,uint32 y,SPU_DOMAIN_LOCAL tdDXPSRDepth*& pIn,SPU_DOMAIN_LOCAL tdDXPSRDepth*& pOut,bool& NeedSync)
{
	SWIZZLEMASK1
	SPU_DOMAIN_LOCAL tdDXPSRDepth*	pSlotBase		=	SPU_LOCAL_PTR(reinterpret_cast<tdDXPSRDepth*>(&DXPSRAS_Buffer[DXPSRAS_ZB_Offset]));
	SPU_DOMAIN_LOCAL tdDXPSRDepth*	pStaging		=	SPU_LOCAL_PTR(reinterpret_cast<tdDXPSRDepth*>(&DXPSRAS_Buffer[DXPSRAS_ZB_BufferOffset]));

	const uint32 Slot	=	ZBCacheID(x,y);
	const uint32 Tile	=	ZBCacheBank(x,y);
	SPU_DOMAIN_LOCAL tdDXPSRDepth*	pSlot	=	SPU_LOCAL_PTR(&pSlotBase[Slot*DXPSRAS_ZB_BlockSize]);

	NeedSync	=	DXPSRAS_ZB_CacheEntry[Slot]!=Tile;
	IF(!NeedSync,1)
	{
		//hit: read and write go to the cached slot
		DXPSR_PROFILE_ZBCACHE(g_Stats.m_ZBCacheHit);
		pIn		=	pSlot;
		pOut	=	pSlot;
		return;
	}

	//miss: fetch the wanted tile into the staging buffer ...
	DXPSR_PROFILE_ZBCACHE(g_Stats.m_ZBCacheMiss);
	memtransfer_from_main(pStaging,SPU_MAIN_PTR(g_pZBuffer+Tile*DXPSRAS_ZB_BlockSize),DXPSRAS_ZB_BlockSizeByte,DXPSRAS_ZB_TransID);

	//... and write the tile that occupied the slot back, unless the slot was empty
	IF(DXPSRAS_ZB_CacheEntry[Slot]!=(uint16)~0u,1)
	{
		memtransfer_to_main(SPU_MAIN_PTR(g_pZBuffer+((uint32)DXPSRAS_ZB_CacheEntry[Slot])*DXPSRAS_ZB_BlockSize),pSlot,DXPSRAS_ZB_BlockSizeByte,DXPSRAS_ZB_TransID);
	}

	DXPSRAS_ZB_CacheEntry[Slot]=Tile;

	pIn		=	pStaging;
	pOut	=	pSlot;
}


RILNE void RasterizeTriangle(qword V0,qword V1,qword V2)
{
	SWIZZLEMASK1

/*	qword W0123=V_Shuffle(V_Shuffle(V0,V1,wwww),V2,xzww);
	uint32 W0123SignMask	=	V_SignMaskF32(W0123);
	IF(W0123SignMask,1)
		return;
*/
	const	qword P0	=	V_Div(V0,V_Swizzle(V0,wwww));
	const	qword P1	=	V_Div(V1,V_Swizzle(V1,wwww));
	const	qword P2	=	V_Div(V2,V_Swizzle(V2,wwww));


	const	qword	qXY31	=	V_Sub(P2,P0);
	const	qword	qXY21	=	V_Sub(P1,P0);
	const qword	qX31	=	V_Swizzle(qXY31,xxxx);
	const qword	qY31	=	V_Swizzle(qXY31,yyyy);
	const qword	qX21	=	V_Swizzle(qXY21,xxxx);
	const qword	qY21	=	V_Swizzle(qXY21,yyyy);

	const qword iP0		=	V_F32ToS32(P0);
	const	qword	qiXY31	=	si_sf(V_F32ToS32(P2),iP0);
	const	qword	qiXY21	=	si_sf(V_F32ToS32(P1),iP0);
	const qword	qiY31	=	V_Swizzle(qiXY31,yyyy);
	const qword	qiY21	=	V_Swizzle(qiXY21,yyyy);

	const	int		DetCull	=	((short)si_to_int(qiXY21))*((short)si_to_int(qiY31 ))-
												((short)si_to_int(qiXY31))*((short)si_to_int(qiY21 ));

	//const	qword qDetCull	=	si_mpya(V_F32ToS32(qX21),
	//																			V_F32ToS32(qY31),
	//																			si_mpyh(V_F32ToS32(qX31),
	//																							V_F32ToS32(qY21)));
	const	qword qDet	=	V_Msub(qX21,qY31,V_Mul(qX31,qY21));

	const qword qZ1		=	V_Swizzle(V0,zzzz);
	const qword qW1		=	V_Swizzle(V0,wwww);

	qword qMax				=	V_Max(V_Max(P0,P1),P2);
	qword qMin				=	V_Min(V_Min(P0,P1),P2);

	const qword	qX1		=	V_Swizzle(P0,xxxx);
	const qword	qY1		=	V_Swizzle(P0,yyyy);

	//const	qword	qBSMA	=	(qword)(vec_float4){(float)DXPSRAS_ZB_BlockSizeMaskX,
	//																				(float)DXPSRAS_ZB_BlockSizeMaskY,0.f,0.f};
	const	qword	qBSMA	=	(qword)(vec_float4){1.f-FLT_EPSILON+(float)DXPSRAS_ZB_BlockSizeMaskX,
																					1.f-FLT_EPSILON+(float)DXPSRAS_ZB_BlockSizeMaskY,0.f,0.f};
	qMax							=	V_Add(qMax,qBSMA);					//Merged add for round to ceil and blocksize for round ceil  to blocksize
	qMin							=	V_Max(qMin,qwordEpsilon());
	qMax							=	V_Min(qMax,qword512());
	qMin							=	V_F32ToS32(qMin);
	qMax							=	V_F32ToS32(qMax);

	IF(DetCull>=0,1)
		return;


	const	qword qiDet	=	V_Rcp(qDet);
	const qword qV321	=	V_Mul(V_Shuffle(V_Sub(V1,V0),V_Sub(V2,V0),zwzw),qiDet);

	///vectorized masking
	const	qword	qBSM	=	(qword)(vec_int4){~DXPSRAS_ZB_BlockSizeMaskX,~DXPSRAS_ZB_BlockSizeMaskY,0,0};
	qMin							=	V_And(qMin,qBSM);
	qMax							=	V_And(qMax,qBSM);
	int32 Minx				=	V_QToS32(qMin);
	int32 Miny				=	V_QToS32(V_Swizzle(qMin,yyyy));
	int32 Maxx				=	V_QToS32(qMax);
	int32 Maxy				=	V_QToS32(V_Swizzle(qMax,yyyy));
	//int32 Minx				=	V_QToS32(qMin)&~DXPSRAS_ZB_BlockSizeMaskX;
	//int32 Miny				=	V_QToS32(V_Swizzle(qMin,yyyy))&~DXPSRAS_ZB_BlockSizeMaskY;
	//int32 Maxx				=	(V_QToS32(qMax)+DXPSRAS_ZB_BlockSizeMaskX)&~DXPSRAS_ZB_BlockSizeMaskX;
	//int32 Maxy				=	(V_QToS32(V_Swizzle(qMax,yyyy))+DXPSRAS_ZB_BlockSizeMaskY)&~DXPSRAS_ZB_BlockSizeMaskY;

	const qword qZ21	=	V_Swizzle(qV321,xxxx);
	const qword qW21	=	V_Swizzle(qV321,yyyy);
	const qword qZ31	=	V_Swizzle(qV321,zzzz);
	const qword qW31	=	V_Swizzle(qV321,wwww);


	//qword	qOrSelect[16]	=	{				{0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F},	//0 0 0 0
	//															{0xC0,0xC0,0xC0,0xC0,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F},	//1 0 0 0
	//															{0x00,0x01,0x02,0x03,0xC0,0xC0,0xC0,0xC0,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F},	//0 1 0 0
	//															{0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F},	//1 1 0 0
	//															{0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0xC0,0xC0,0xC0,0xC0,0x0C,0x0D,0x0E,0x0F},	//0 0 1 0
	//															{0xC0,0xC0,0xC0,0xC0,0x04,0x05,0x06,0x07,0xC0,0xC0,0xC0,0xC0,0x0C,0x0D,0x0E,0x0F},	//1 0 1 0
	//															{0x00,0x01,0x02,0x03,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0x0C,0x0D,0x0E,0x0F},	//0 1 1 0
	//															{0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0x0C,0x0D,0x0E,0x0F},	//1 1 1 0
	//															{0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0xC0,0xC0,0xC0,0xC0},	//0 0 0 1
	//															{0xC0,0xC0,0xC0,0xC0,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0xC0,0xC0,0xC0,0xC0},	//1 0 0 1
	//															{0x00,0x01,0x02,0x03,0xC0,0xC0,0xC0,0xC0,0x08,0x09,0x0A,0x0B,0xC0,0xC0,0xC0,0xC0},	//0 1 0 1
	//															{0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0x08,0x09,0x0A,0x0B,0xC0,0xC0,0xC0,0xC0},	//1 1 0 1
	//															{0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0},	//0 0 1 1
	//															{0xC0,0xC0,0xC0,0xC0,0x04,0x05,0x06,0x07,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0},	//1 0 1 1
	//															{0x00,0x01,0x02,0x03,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0},	//0 1 1 1
	//															{0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0}};	//1 1 1 1
	//qword	qOrSelect[16]	=	{				{0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F},	//0 0 0 0
	//															{0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0xC0,0xC0,0xC0,0xC0},	//1 0 0 0
	//															{0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0xC0,0xC0,0xC0,0xC0,0x0C,0x0D,0x0E,0x0F},	//0 1 0 0
	//															{0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0},	//1 1 0 0
	//															{0x00,0x01,0x02,0x03,0xC0,0xC0,0xC0,0xC0,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F},	//0 0 1 0
	//															{0x00,0x01,0x02,0x03,0xC0,0xC0,0xC0,0xC0,0x08,0x09,0x0A,0x0B,0xC0,0xC0,0xC0,0xC0},	//1 0 1 0
	//															{0x00,0x01,0x02,0x03,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0x0C,0x0D,0x0E,0x0F},	//0 1 1 0
	//															{0x00,0x01,0x02,0x03,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0},	//1 1 1 0
	//															{0xC0,0xC0,0xC0,0xC0,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F},	//0 0 0 1
	//															{0xC0,0xC0,0xC0,0xC0,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0xC0,0xC0,0xC0,0xC0},	//1 0 0 1
	//															{0xC0,0xC0,0xC0,0xC0,0x04,0x05,0x06,0x07,0xC0,0xC0,0xC0,0xC0,0x0C,0x0D,0x0E,0x0F},	//0 1 0 1
	//															{0xC0,0xC0,0xC0,0xC0,0x04,0x05,0x06,0x07,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0},	//1 1 0 1
	//															{0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F},	//0 0 1 1
	//															{0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0x08,0x09,0x0A,0x0B,0xC0,0xC0,0xC0,0xC0},	//1 0 1 1
	//															{0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0x0C,0x0D,0x0E,0x0F},	//0 1 1 1
	//															{0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0,0xC0}};	//1 1 1 1


	const qword CoarseX	=	(qword)(vec_float4){0.f,(float)DXPSRAS_ZB_BlockSizeX,0.f,(float)DXPSRAS_ZB_BlockSizeX};
	const qword CoarseY	=	(qword)(vec_float4){0.f,0.f,(float)DXPSRAS_ZB_BlockSizeY,(float)DXPSRAS_ZB_BlockSizeY};
	const qword qBSizeX	=	(qword)(vec_float4){(float)DXPSRAS_ZB_BlockSizeX,(float)DXPSRAS_ZB_BlockSizeX,(float)DXPSRAS_ZB_BlockSizeX,(float)DXPSRAS_ZB_BlockSizeX};

#ifdef DXPSR_PROFILE_TIMINGS
	asm volatile("nop");
	uint32 T0	= rdtsc();
	asm volatile("nop");
#endif
//	printf("Tri\n");
	for(int32 Y=Miny;Y<Maxy;Y+=DXPSRAS_ZB_BlockSizeY)
	{
		const	qword qY	=	V_F32Splat(Y);
		const qword yf0	=	V_Sub(qY,qY1);
		const qword yfC	=	V_Add(yf0,CoarseY);
		const qword yf1	=	V_Add(yf0,qword1());
		const qword yf2	=	V_Add(yf0,qword2());
		const qword yf3	=	V_Add(yf0,qword3());
		const qword yf4	=	V_Add(yf0,qword4());
		const qword yf5	=	V_Add(yf0,qword5());
		const qword yf6	=	V_Add(yf0,qword6());
		const qword yf7	=	V_Add(yf0,qword7());

		qword qX	=	V_Sub(V_F32Splat(Minx),qX1);
		for(int32 X=Minx;X<Maxx;X+=DXPSRAS_ZB_BlockSizeX,qX=V_Add(qX,qBSizeX))
		{
			//coarse test
			const qword xfC			=	V_Add(qX,CoarseX);
			const qword	uC			=	V_Msub(qY31,xfC,V_Mul(qX31,yfC));
			const qword	vC			=	V_Msub(qX21,yfC,V_Mul(qY21,xfC));
			const	qword	MaskUC	=	V_CmpGt(qwordZero(),uC);
			const	qword	MaskVC	=	V_CmpGt(qwordZero(),vC);
			const	qword	MaskUVC	=	V_CmpGt(V_Add(uC,vC),qDet);
			const qword MaskC		=	V_And(V_And(MaskUC,MaskVC),MaskUVC);
			IF(V_SignMaskS32(MaskUC)==0 | V_SignMaskS32(MaskVC)==0 | V_SignMaskS32(MaskUVC)==0 ,0)
			{
				DXPSR_PROFILE_ZBCACHE(g_Stats.m_ZBCoarseFail);
				continue;
			}
			SPU_DOMAIN_LOCAL tdDXPSRDepth* __restrict	pBufferIn;
			SPU_DOMAIN_LOCAL tdDXPSRDepth* __restrict	pBufferOut;

//			ZBCacheTile(X,Y,pBufferIn,pBufferOut,NeedSync);



			SPU_DOMAIN_LOCAL tdDXPSRDepth* __restrict	DXPSRAS_ZB_pCache		=	SPU_LOCAL_PTR(reinterpret_cast<tdDXPSRDepth*>(&DXPSRAS_Buffer[DXPSRAS_ZB_Offset]));

			uint32 x	=	X>>DXPSRAS_ZB_BlockSizeBitsX;
			uint32 y	=	Y>>DXPSRAS_ZB_BlockSizeBitsY;
			uint32 ID	=	ZBCacheID(x,y);
			uint32 Idx=	ZBCacheBank(x,y);
			pBufferIn	=	
			pBufferOut	=	SPU_LOCAL_PTR(&DXPSRAS_ZB_pCache[ID*DXPSRAS_ZB_BlockSize]);
			IF(DXPSRAS_ZB_CacheEntry[ID]!=Idx,0)
			{
				DXPSR_PROFILE_ZBCACHE(g_Stats.m_ZBCacheMiss);
#if defined(DXPSR_PROFILE_DETAILED)
				asm volatile("nop");
				uint32 T0	= rdtsc();
				asm volatile("nop");
#endif

				uint32	PrefetchIdx	=	Idx&1;
				SPU_DOMAIN_LOCAL tdDXPSRDepth* __restrict	DXPSRAS_ZB_pBuffer	=	SPU_LOCAL_PTR(reinterpret_cast<tdDXPSRDepth*>(&DXPSRAS_Buffer[DXPSRAS_ZB_BufferOffset]));
				pBufferIn	=	&DXPSRAS_ZB_pBuffer[PrefetchIdx*DXPSRAS_ZB_BlockSize];
				if(DXPSRAS_ZB_CachePrefetched[PrefetchIdx]!=Idx)
				{
					memtransfer_from_main(pBufferIn,SPU_MAIN_PTR(g_pZBuffer+Idx*DXPSRAS_ZB_BlockSize),DXPSRAS_ZB_BlockSizeByte,DXPSRAS_ZB_TransID+PrefetchIdx);

					IF(DXPSRAS_ZB_CacheEntry[ID]!=(uint16)~0u,1)
						memtransfer_to_main(SPU_MAIN_PTR(g_pZBuffer+((uint32)DXPSRAS_ZB_CacheEntry[ID])*DXPSRAS_ZB_BlockSize),pBufferOut,DXPSRAS_ZB_BlockSizeByte,DXPSRAS_ZB_TransID+PrefetchIdx);
				}

				DXPSRAS_ZB_CacheEntry[ID]=Idx;
				DXPSRAS_ZB_CachePrefetched[PrefetchIdx]	=	~0;

				ID+=1;
				Idx+=1;
				if(X+1<Maxx && DXPSRAS_ZB_CacheEntry[ID]!=Idx)
				{
					PrefetchIdx^=1;

					SPU_DOMAIN_LOCAL tdDXPSRDepth* __restrict	pBufferTmp	=	&DXPSRAS_ZB_pBuffer[PrefetchIdx*DXPSRAS_ZB_BlockSize];
					memtransfer_from_main(pBufferTmp,SPU_MAIN_PTR(g_pZBuffer+Idx*DXPSRAS_ZB_BlockSize),DXPSRAS_ZB_BlockSizeByte,DXPSRAS_ZB_TransID+PrefetchIdx);
					pBufferTmp	=	SPU_LOCAL_PTR(&DXPSRAS_ZB_pCache[ID*DXPSRAS_ZB_BlockSize]);
					IF(DXPSRAS_ZB_CacheEntry[ID]!=(uint16)~0u,1)
						memtransfer_to_main(SPU_MAIN_PTR(g_pZBuffer+((uint32)DXPSRAS_ZB_CacheEntry[ID])*DXPSRAS_ZB_BlockSize),pBufferTmp,DXPSRAS_ZB_BlockSizeByte,DXPSRAS_ZB_TransID+PrefetchIdx);

					DXPSRAS_ZB_CachePrefetched[PrefetchIdx]	=	Idx;
					PrefetchIdx^=1;
				}




				const qword xf0			=	V_Add(qX,qword0123());
				const qword xf1			=	V_Add(qX,qword4567());
				const qword xf0_31	=	V_Mul(xf0,qY31);
				const qword xf1_31	=	V_Mul(xf1,qY31);
				const qword xf0_21	=	V_Mul(xf0,qY21);
				const qword xf1_21	=	V_Mul(xf1,qY21);
				const qword	u00			=	V_NMsub(qX31,yf0,xf0_31);			const qword	u01			=	V_NMsub(qX31,yf0,xf1_31);
				const qword	u10			=	V_NMsub(qX31,yf1,xf0_31);			const qword	u11			=	V_NMsub(qX31,yf1,xf1_31);
				const qword	u20			=	V_NMsub(qX31,yf2,xf0_31);			const qword	u21			=	V_NMsub(qX31,yf2,xf1_31);
				const qword	u30			=	V_NMsub(qX31,yf3,xf0_31);			const qword	u31			=	V_NMsub(qX31,yf3,xf1_31);
				const qword	u40			=	V_NMsub(qX31,yf4,xf0_31);			const qword	u41			=	V_NMsub(qX31,yf4,xf1_31);
				const qword	u50			=	V_NMsub(qX31,yf5,xf0_31);			const qword	u51			=	V_NMsub(qX31,yf5,xf1_31);
				const qword	u60			=	V_NMsub(qX31,yf6,xf0_31);			const qword	u61			=	V_NMsub(qX31,yf6,xf1_31);
				const qword	u70			=	V_NMsub(qX31,yf7,xf0_31);			const qword	u71			=	V_NMsub(qX31,yf7,xf1_31);
				const qword	v00			=	V_Msub(qX21,yf0,xf0_21);			const qword	v01			=	V_Msub(qX21,yf0,xf1_21);
				const qword	v10			=	V_Msub(qX21,yf1,xf0_21);			const qword	v11			=	V_Msub(qX21,yf1,xf1_21);
				const qword	v20			=	V_Msub(qX21,yf2,xf0_21);			const qword	v21			=	V_Msub(qX21,yf2,xf1_21);
				const qword	v30			=	V_Msub(qX21,yf3,xf0_21);			const qword	v31			=	V_Msub(qX21,yf3,xf1_21);
				const qword	v40			=	V_Msub(qX21,yf4,xf0_21);			const qword	v41			=	V_Msub(qX21,yf4,xf1_21);
				const qword	v50			=	V_Msub(qX21,yf5,xf0_21);			const qword	v51			=	V_Msub(qX21,yf5,xf1_21);
				const qword	v60			=	V_Msub(qX21,yf6,xf0_21);			const qword	v61			=	V_Msub(qX21,yf6,xf1_21);
				const qword	v70			=	V_Msub(qX21,yf7,xf0_21);			const qword	v71			=	V_Msub(qX21,yf7,xf1_21);
				const qword Z00			=	V_Div(	V_Madd(u00,qZ21,V_Madd(v00,qZ31,qZ1)),V_Madd(u00,qW21,V_Madd(v00,qW31,qW1)));
				const qword Z01			=	V_Div(	V_Madd(u01,qZ21,V_Madd(v01,qZ31,qZ1)),V_Madd(u01,qW21,V_Madd(v01,qW31,qW1)));
				const qword Z20			=	V_Div(	V_Madd(u20,qZ21,V_Madd(v20,qZ31,qZ1)),V_Madd(u20,qW21,V_Madd(v20,qW31,qW1)));
				const qword Z21			=	V_Div(	V_Madd(u21,qZ21,V_Madd(v21,qZ31,qZ1)),V_Madd(u21,qW21,V_Madd(v21,qW31,qW1)));
				const qword Z40			=	V_Div(	V_Madd(u40,qZ21,V_Madd(v40,qZ31,qZ1)),V_Madd(u40,qW21,V_Madd(v40,qW31,qW1)));
				const qword Z41			=	V_Div(	V_Madd(u41,qZ21,V_Madd(v41,qZ31,qZ1)),V_Madd(u41,qW21,V_Madd(v41,qW31,qW1)));
				const qword Z60			=	V_Div(	V_Madd(u60,qZ21,V_Madd(v60,qZ31,qZ1)),V_Madd(u60,qW21,V_Madd(v60,qW31,qW1)));
				const qword Z61			=	V_Div(	V_Madd(u61,qZ21,V_Madd(v61,qZ31,qZ1)),V_Madd(u61,qW21,V_Madd(v61,qW31,qW1)));
				const qword Z70			=	V_Div(	V_Madd(u70,qZ21,V_Madd(v70,qZ31,qZ1)),V_Madd(u70,qW21,V_Madd(v70,qW31,qW1)));
				const qword Z71			=	V_Div(	V_Madd(u71,qZ21,V_Madd(v71,qZ31,qZ1)),V_Madd(u71,qW21,V_Madd(v71,qW31,qW1)));

	#if defined(DXPSR_FAST_ZINTERP)
				const qword qHALF		=	(qword)(vec_float4){0.5f,0.5f,0.5f,0.5f};
				const qword qTHIRD	=	(qword)(vec_float4){0.333f,0.333f,0.333f,0.333f};
				const qword q2THIRD	=	(qword)(vec_float4){0.667f,0.667f,0.667f,0.667f};
				const qword Z10			=	V_Madd(Z00,qHALF,V_Mul(Z20,qHALF));
				const qword Z11			=	V_Madd(Z01,qHALF,V_Mul(Z21,qHALF));
				const qword Z30			=	V_Madd(Z20,qHALF,V_Mul(Z40,qHALF));
				const qword Z31			=	V_Madd(Z21,qHALF,V_Mul(Z41,qHALF));
				const qword Z50			=	V_Madd(Z40,qHALF,V_Mul(Z60,qHALF));
				const qword Z51			=	V_Madd(Z41,qHALF,V_Mul(Z61,qHALF));
	#else
				const qword Z10			=	V_Div(	V_Madd(u10,qZ21,V_Madd(v10,qZ31,qZ1)),V_Madd(u10,qW21,V_Madd(v10,qW31,qW1)));
				const qword Z11			=	V_Div(	V_Madd(u11,qZ21,V_Madd(v11,qZ31,qZ1)),V_Madd(u11,qW21,V_Madd(v11,qW31,qW1)));
				const qword Z30			=	V_Div(	V_Madd(u30,qZ21,V_Madd(v30,qZ31,qZ1)),V_Madd(u30,qW21,V_Madd(v30,qW31,qW1)));
				const qword Z31			=	V_Div(	V_Madd(u31,qZ21,V_Madd(v31,qZ31,qZ1)),V_Madd(u31,qW21,V_Madd(v31,qW31,qW1)));
				const qword Z50			=	V_Div(	V_Madd(u50,qZ21,V_Madd(v50,qZ31,qZ1)),V_Madd(u50,qW21,V_Madd(v50,qW31,qW1)));
				const qword Z51			=	V_Div(	V_Madd(u51,qZ21,V_Madd(v51,qZ31,qZ1)),V_Madd(u51,qW21,V_Madd(v51,qW31,qW1)));
	#endif
				DXPSR_PROFILE_ZBCACHE(g_Stats.m_ZBCoarseFineTest);
				//fine test needed
				//u<0?	(v>-inf?-inf:0) :			->0xffffffff
				//			(v>   0?-inf:0)
				qword	Mask00	=	V_CmpGt(qDet,V_Add(u00,v00));			qword	Mask01	=	V_CmpGt(qDet,V_Add(u01,v01));
				qword	Mask10	=	V_CmpGt(qDet,V_Add(u10,v10));			qword	Mask11	=	V_CmpGt(qDet,V_Add(u11,v11));
				qword	Mask20	=	V_CmpGt(qDet,V_Add(u20,v20));			qword	Mask21	=	V_CmpGt(qDet,V_Add(u21,v21));
				qword	Mask30	=	V_CmpGt(qDet,V_Add(u30,v30));			qword	Mask31	=	V_CmpGt(qDet,V_Add(u31,v31));
				qword	Mask40	=	V_CmpGt(qDet,V_Add(u40,v40));			qword	Mask41	=	V_CmpGt(qDet,V_Add(u41,v41));
				qword	Mask50	=	V_CmpGt(qDet,V_Add(u50,v50));			qword	Mask51	=	V_CmpGt(qDet,V_Add(u51,v51));
				qword	Mask60	=	V_CmpGt(qDet,V_Add(u60,v60));			qword	Mask61	=	V_CmpGt(qDet,V_Add(u61,v61));
				qword	Mask70	=	V_CmpGt(qDet,V_Add(u70,v70));			qword	Mask71	=	V_CmpGt(qDet,V_Add(u71,v71));
				Mask00	=	V_CmpGt(u00,Mask00);			Mask01	=	V_CmpGt(u01,Mask01);
				Mask10	=	V_CmpGt(u10,Mask10);			Mask11	=	V_CmpGt(u11,Mask11);
				Mask20	=	V_CmpGt(u20,Mask20);			Mask21	=	V_CmpGt(u21,Mask21);
				Mask30	=	V_CmpGt(u30,Mask30);			Mask31	=	V_CmpGt(u31,Mask31);
				Mask40	=	V_CmpGt(u40,Mask40);			Mask41	=	V_CmpGt(u41,Mask41);
				Mask50	=	V_CmpGt(u50,Mask50);			Mask51	=	V_CmpGt(u51,Mask51);
				Mask60	=	V_CmpGt(u60,Mask60);			Mask61	=	V_CmpGt(u61,Mask61);
				Mask70	=	V_CmpGt(u70,Mask70);			Mask71	=	V_CmpGt(u71,Mask71);
				Mask00	=	V_CmpGt(v00,Mask00);			Mask01	=	V_CmpGt(v01,Mask01);
				Mask10	=	V_CmpGt(v10,Mask10);			Mask11	=	V_CmpGt(v11,Mask11);
				Mask20	=	V_CmpGt(v20,Mask20);			Mask21	=	V_CmpGt(v21,Mask21);
				Mask30	=	V_CmpGt(v30,Mask30);			Mask31	=	V_CmpGt(v31,Mask31);
				Mask40	=	V_CmpGt(v40,Mask40);			Mask41	=	V_CmpGt(v41,Mask41);
				Mask50	=	V_CmpGt(v50,Mask50);			Mask51	=	V_CmpGt(v51,Mask51);
				Mask60	=	V_CmpGt(v60,Mask60);			Mask61	=	V_CmpGt(v61,Mask61);
				Mask70	=	V_CmpGt(v70,Mask70);			Mask71	=	V_CmpGt(v71,Mask71);
				{
#if defined(DXPSR_PROFILE_DETAILED)
					DXPSR_PROFILE_ZBSTALL(g_Stats.m_ZBStall)
					asm volatile("nop");
					uint32 T0	= rdtsc();
					asm volatile("nop");
#endif
					memtransfer_sync_id(DXPSRAS_ZB_TransID+PrefetchIdx);
#if defined(DXPSR_PROFILE_DETAILED)
					asm volatile("nop");
					uint32 T1	= rdtsc();
					asm volatile("nop");
					DXPSRAS_GLOBAL_PERFCOUNTER_C0	+=	T0-T1;
					DXPSRAS_GLOBAL_PERFCOUNTER_C1++;
#endif
				}
				qword rZ00					=	*reinterpret_cast<qword*>(pBufferIn);
				qword rZ01					=	*reinterpret_cast<qword*>(pBufferIn+4);
				qword rZ10					=	*reinterpret_cast<qword*>(pBufferIn+8);
				qword rZ11					=	*reinterpret_cast<qword*>(pBufferIn+12);
				qword rZ20					=	*reinterpret_cast<qword*>(pBufferIn+16);
				qword rZ21					=	*reinterpret_cast<qword*>(pBufferIn+20);
				qword rZ30					=	*reinterpret_cast<qword*>(pBufferIn+24);
				qword rZ31					=	*reinterpret_cast<qword*>(pBufferIn+28);
				qword rZ40					=	*reinterpret_cast<qword*>(pBufferIn+32);
				qword rZ41					=	*reinterpret_cast<qword*>(pBufferIn+36);
				qword rZ50					=	*reinterpret_cast<qword*>(pBufferIn+40);
				qword rZ51					=	*reinterpret_cast<qword*>(pBufferIn+44);
				qword rZ60					=	*reinterpret_cast<qword*>(pBufferIn+48);
				qword rZ61					=	*reinterpret_cast<qword*>(pBufferIn+52);
				qword rZ70					=	*reinterpret_cast<qword*>(pBufferIn+56);
				qword rZ71					=	*reinterpret_cast<qword*>(pBufferIn+60);
				Mask00							=	V_Or(Mask00,V_CmpGt(Z00,rZ00));	Mask01							=	V_Or(Mask01,V_CmpGt(Z01,rZ01));
				Mask10							=	V_Or(Mask10,V_CmpGt(Z10,rZ10));	Mask11							=	V_Or(Mask11,V_CmpGt(Z11,rZ11));
				Mask20							=	V_Or(Mask20,V_CmpGt(Z20,rZ20));	Mask21							=	V_Or(Mask21,V_CmpGt(Z21,rZ21));
				Mask30							=	V_Or(Mask30,V_CmpGt(Z30,rZ30));	Mask31							=	V_Or(Mask31,V_CmpGt(Z31,rZ31));
				Mask40							=	V_Or(Mask40,V_CmpGt(Z40,rZ40));	Mask41							=	V_Or(Mask41,V_CmpGt(Z41,rZ41));
				Mask50							=	V_Or(Mask50,V_CmpGt(Z50,rZ50));	Mask51							=	V_Or(Mask51,V_CmpGt(Z51,rZ51));
				Mask60							=	V_Or(Mask60,V_CmpGt(Z60,rZ60));	Mask61							=	V_Or(Mask61,V_CmpGt(Z61,rZ61));
				Mask70							=	V_Or(Mask70,V_CmpGt(Z70,rZ70));	Mask71							=	V_Or(Mask71,V_CmpGt(Z71,rZ71));
				rZ00								=	V_Select(Z00,rZ00,Mask00);			rZ01								=	V_Select(Z01,rZ01,Mask01);
				rZ10								=	V_Select(Z10,rZ10,Mask10);			rZ11								=	V_Select(Z11,rZ11,Mask11);
				rZ20								=	V_Select(Z20,rZ20,Mask20);			rZ21								=	V_Select(Z21,rZ21,Mask21);
				rZ30								=	V_Select(Z30,rZ30,Mask30);			rZ31								=	V_Select(Z31,rZ31,Mask31);
				rZ40								=	V_Select(Z40,rZ40,Mask40);			rZ41								=	V_Select(Z41,rZ41,Mask41);
				rZ50								=	V_Select(Z50,rZ50,Mask50);			rZ51								=	V_Select(Z51,rZ51,Mask51);
				rZ60								=	V_Select(Z60,rZ60,Mask60);			rZ61								=	V_Select(Z61,rZ61,Mask61);
				rZ70								=	V_Select(Z70,rZ70,Mask70);			rZ71								=	V_Select(Z71,rZ71,Mask71);
				*reinterpret_cast<qword*>(pBufferOut)			=	rZ00;
				*reinterpret_cast<qword*>(pBufferOut+4)		=	rZ01;
				*reinterpret_cast<qword*>(pBufferOut+8)		=	rZ10;
				*reinterpret_cast<qword*>(pBufferOut+12)	=	rZ11;
				*reinterpret_cast<qword*>(pBufferOut+16)	=	rZ20;
				*reinterpret_cast<qword*>(pBufferOut+20)	=	rZ21;
				*reinterpret_cast<qword*>(pBufferOut+24)	=	rZ30;
				*reinterpret_cast<qword*>(pBufferOut+28)	=	rZ31;
				*reinterpret_cast<qword*>(pBufferOut+32)	=	rZ40;
				*reinterpret_cast<qword*>(pBufferOut+36)	=	rZ41;
				*reinterpret_cast<qword*>(pBufferOut+40)	=	rZ50;
				*reinterpret_cast<qword*>(pBufferOut+44)	=	rZ51;
				*reinterpret_cast<qword*>(pBufferOut+48)	=	rZ60;
				*reinterpret_cast<qword*>(pBufferOut+52)	=	rZ61;
				*reinterpret_cast<qword*>(pBufferOut+56)	=	rZ70;
				*reinterpret_cast<qword*>(pBufferOut+60)	=	rZ71;



#if defined(DXPSR_PROFILE_DETAILED)
					asm volatile("nop");
					uint32 T1	= rdtsc();
					asm volatile("nop");
					DXPSRAS_GLOBAL_PERFCOUNTER_S0	+=	T0-T1;
					DXPSRAS_GLOBAL_PERFCOUNTER_S1++;
#endif


			}
			else
			{
				DXPSR_PROFILE_ZBCACHE(g_Stats.m_ZBCacheHit);
#if defined(DXPSR_PROFILE_DETAILED)
				asm volatile("nop");
				uint32 T0	= rdtsc();
				asm volatile("nop");
#endif
				const qword xf0			=	V_Add(qX,qword0123());
				const qword xf1			=	V_Add(qX,qword4567());
				const qword xf0_31	=	V_Mul(xf0,qY31);
				const qword xf1_31	=	V_Mul(xf1,qY31);
				const qword xf0_21	=	V_Mul(xf0,qY21);
				const qword xf1_21	=	V_Mul(xf1,qY21);
				const qword	u00			=	V_NMsub(qX31,yf0,xf0_31);			const qword	u01			=	V_NMsub(qX31,yf0,xf1_31);
				const qword	u10			=	V_NMsub(qX31,yf1,xf0_31);			const qword	u11			=	V_NMsub(qX31,yf1,xf1_31);
				const qword	u20			=	V_NMsub(qX31,yf2,xf0_31);			const qword	u21			=	V_NMsub(qX31,yf2,xf1_31);
				const qword	u30			=	V_NMsub(qX31,yf3,xf0_31);			const qword	u31			=	V_NMsub(qX31,yf3,xf1_31);
				const qword	u40			=	V_NMsub(qX31,yf4,xf0_31);			const qword	u41			=	V_NMsub(qX31,yf4,xf1_31);
				const qword	u50			=	V_NMsub(qX31,yf5,xf0_31);			const qword	u51			=	V_NMsub(qX31,yf5,xf1_31);
				const qword	u60			=	V_NMsub(qX31,yf6,xf0_31);			const qword	u61			=	V_NMsub(qX31,yf6,xf1_31);
				const qword	u70			=	V_NMsub(qX31,yf7,xf0_31);			const qword	u71			=	V_NMsub(qX31,yf7,xf1_31);
				const qword	v00			=	V_Msub(qX21,yf0,xf0_21);			const qword	v01			=	V_Msub(qX21,yf0,xf1_21);
				const qword	v10			=	V_Msub(qX21,yf1,xf0_21);			const qword	v11			=	V_Msub(qX21,yf1,xf1_21);
				const qword	v20			=	V_Msub(qX21,yf2,xf0_21);			const qword	v21			=	V_Msub(qX21,yf2,xf1_21);
				const qword	v30			=	V_Msub(qX21,yf3,xf0_21);			const qword	v31			=	V_Msub(qX21,yf3,xf1_21);
				const qword	v40			=	V_Msub(qX21,yf4,xf0_21);			const qword	v41			=	V_Msub(qX21,yf4,xf1_21);
				const qword	v50			=	V_Msub(qX21,yf5,xf0_21);			const qword	v51			=	V_Msub(qX21,yf5,xf1_21);
				const qword	v60			=	V_Msub(qX21,yf6,xf0_21);			const qword	v61			=	V_Msub(qX21,yf6,xf1_21);
				const qword	v70			=	V_Msub(qX21,yf7,xf0_21);			const qword	v71			=	V_Msub(qX21,yf7,xf1_21);
				const qword Z00			=	V_Div(	V_Madd(u00,qZ21,V_Madd(v00,qZ31,qZ1)),V_Madd(u00,qW21,V_Madd(v00,qW31,qW1)));
				const qword Z01			=	V_Div(	V_Madd(u01,qZ21,V_Madd(v01,qZ31,qZ1)),V_Madd(u01,qW21,V_Madd(v01,qW31,qW1)));
				const qword Z20			=	V_Div(	V_Madd(u20,qZ21,V_Madd(v20,qZ31,qZ1)),V_Madd(u20,qW21,V_Madd(v20,qW31,qW1)));
				const qword Z21			=	V_Div(	V_Madd(u21,qZ21,V_Madd(v21,qZ31,qZ1)),V_Madd(u21,qW21,V_Madd(v21,qW31,qW1)));
				const qword Z40			=	V_Div(	V_Madd(u40,qZ21,V_Madd(v40,qZ31,qZ1)),V_Madd(u40,qW21,V_Madd(v40,qW31,qW1)));
				const qword Z41			=	V_Div(	V_Madd(u41,qZ21,V_Madd(v41,qZ31,qZ1)),V_Madd(u41,qW21,V_Madd(v41,qW31,qW1)));
				const qword Z60			=	V_Div(	V_Madd(u60,qZ21,V_Madd(v60,qZ31,qZ1)),V_Madd(u60,qW21,V_Madd(v60,qW31,qW1)));
				const qword Z61			=	V_Div(	V_Madd(u61,qZ21,V_Madd(v61,qZ31,qZ1)),V_Madd(u61,qW21,V_Madd(v61,qW31,qW1)));
				const qword Z70			=	V_Div(	V_Madd(u70,qZ21,V_Madd(v70,qZ31,qZ1)),V_Madd(u70,qW21,V_Madd(v70,qW31,qW1)));
				const qword Z71			=	V_Div(	V_Madd(u71,qZ21,V_Madd(v71,qZ31,qZ1)),V_Madd(u71,qW21,V_Madd(v71,qW31,qW1)));

	#if defined(DXPSR_FAST_ZINTERP)
				const qword qHALF		=	(qword)(vec_float4){0.5f,0.5f,0.5f,0.5f};
				const qword Z10			=	V_Madd(Z00,qHALF,V_Mul(Z20,qHALF));
				const qword Z11			=	V_Madd(Z01,qHALF,V_Mul(Z21,qHALF));
				const qword Z30			=	V_Madd(Z20,qHALF,V_Mul(Z40,qHALF));
				const qword Z31			=	V_Madd(Z21,qHALF,V_Mul(Z41,qHALF));
				const qword Z50			=	V_Madd(Z40,qHALF,V_Mul(Z60,qHALF));
				const qword Z51			=	V_Madd(Z41,qHALF,V_Mul(Z61,qHALF));
	#else
				const qword Z10			=	V_Div(	V_Madd(u10,qZ21,V_Madd(v10,qZ31,qZ1)),V_Madd(u10,qW21,V_Madd(v10,qW31,qW1)));
				const qword Z11			=	V_Div(	V_Madd(u11,qZ21,V_Madd(v11,qZ31,qZ1)),V_Madd(u11,qW21,V_Madd(v11,qW31,qW1)));
				const qword Z30			=	V_Div(	V_Madd(u30,qZ21,V_Madd(v30,qZ31,qZ1)),V_Madd(u30,qW21,V_Madd(v30,qW31,qW1)));
				const qword Z31			=	V_Div(	V_Madd(u31,qZ21,V_Madd(v31,qZ31,qZ1)),V_Madd(u31,qW21,V_Madd(v31,qW31,qW1)));
				const qword Z50			=	V_Div(	V_Madd(u50,qZ21,V_Madd(v50,qZ31,qZ1)),V_Madd(u50,qW21,V_Madd(v50,qW31,qW1)));
				const qword Z51			=	V_Div(	V_Madd(u51,qZ21,V_Madd(v51,qZ31,qZ1)),V_Madd(u51,qW21,V_Madd(v51,qW31,qW1)));
	#endif
				DXPSR_PROFILE_ZBCACHE(g_Stats.m_ZBCoarseFineTest);
				//fine test needed
				//u<0?	(v>-inf?-inf:0) :			->0xffffffff
				//			(v>   0?-inf:0)
				qword	Mask00	=	V_CmpGt(qDet,V_Add(u00,v00));			qword	Mask01	=	V_CmpGt(qDet,V_Add(u01,v01));
				qword	Mask10	=	V_CmpGt(qDet,V_Add(u10,v10));			qword	Mask11	=	V_CmpGt(qDet,V_Add(u11,v11));
				qword	Mask20	=	V_CmpGt(qDet,V_Add(u20,v20));			qword	Mask21	=	V_CmpGt(qDet,V_Add(u21,v21));
				qword	Mask30	=	V_CmpGt(qDet,V_Add(u30,v30));			qword	Mask31	=	V_CmpGt(qDet,V_Add(u31,v31));
				qword	Mask40	=	V_CmpGt(qDet,V_Add(u40,v40));			qword	Mask41	=	V_CmpGt(qDet,V_Add(u41,v41));
				qword	Mask50	=	V_CmpGt(qDet,V_Add(u50,v50));			qword	Mask51	=	V_CmpGt(qDet,V_Add(u51,v51));
				qword	Mask60	=	V_CmpGt(qDet,V_Add(u60,v60));			qword	Mask61	=	V_CmpGt(qDet,V_Add(u61,v61));
				qword	Mask70	=	V_CmpGt(qDet,V_Add(u70,v70));			qword	Mask71	=	V_CmpGt(qDet,V_Add(u71,v71));
				Mask00	=	V_CmpGt(u00,Mask00);			Mask01	=	V_CmpGt(u01,Mask01);
				Mask10	=	V_CmpGt(u10,Mask10);			Mask11	=	V_CmpGt(u11,Mask11);
				Mask20	=	V_CmpGt(u20,Mask20);			Mask21	=	V_CmpGt(u21,Mask21);
				Mask30	=	V_CmpGt(u30,Mask30);			Mask31	=	V_CmpGt(u31,Mask31);
				Mask40	=	V_CmpGt(u40,Mask40);			Mask41	=	V_CmpGt(u41,Mask41);
				Mask50	=	V_CmpGt(u50,Mask50);			Mask51	=	V_CmpGt(u51,Mask51);
				Mask60	=	V_CmpGt(u60,Mask60);			Mask61	=	V_CmpGt(u61,Mask61);
				Mask70	=	V_CmpGt(u70,Mask70);			Mask71	=	V_CmpGt(u71,Mask71);
				Mask00	=	V_CmpGt(v00,Mask00);			Mask01	=	V_CmpGt(v01,Mask01);
				Mask10	=	V_CmpGt(v10,Mask10);			Mask11	=	V_CmpGt(v11,Mask11);
				Mask20	=	V_CmpGt(v20,Mask20);			Mask21	=	V_CmpGt(v21,Mask21);
				Mask30	=	V_CmpGt(v30,Mask30);			Mask31	=	V_CmpGt(v31,Mask31);
				Mask40	=	V_CmpGt(v40,Mask40);			Mask41	=	V_CmpGt(v41,Mask41);
				Mask50	=	V_CmpGt(v50,Mask50);			Mask51	=	V_CmpGt(v51,Mask51);
				Mask60	=	V_CmpGt(v60,Mask60);			Mask61	=	V_CmpGt(v61,Mask61);
				Mask70	=	V_CmpGt(v70,Mask70);			Mask71	=	V_CmpGt(v71,Mask71);
				qword rZ00					=	*reinterpret_cast<qword*>(pBufferIn);
				qword rZ01					=	*reinterpret_cast<qword*>(pBufferIn+4);
				qword rZ10					=	*reinterpret_cast<qword*>(pBufferIn+8);
				qword rZ11					=	*reinterpret_cast<qword*>(pBufferIn+12);
				qword rZ20					=	*reinterpret_cast<qword*>(pBufferIn+16);
				qword rZ21					=	*reinterpret_cast<qword*>(pBufferIn+20);
				qword rZ30					=	*reinterpret_cast<qword*>(pBufferIn+24);
				qword rZ31					=	*reinterpret_cast<qword*>(pBufferIn+28);
				qword rZ40					=	*reinterpret_cast<qword*>(pBufferIn+32);
				qword rZ41					=	*reinterpret_cast<qword*>(pBufferIn+36);
				qword rZ50					=	*reinterpret_cast<qword*>(pBufferIn+40);
				qword rZ51					=	*reinterpret_cast<qword*>(pBufferIn+44);
				qword rZ60					=	*reinterpret_cast<qword*>(pBufferIn+48);
				qword rZ61					=	*reinterpret_cast<qword*>(pBufferIn+52);
				qword rZ70					=	*reinterpret_cast<qword*>(pBufferIn+56);
				qword rZ71					=	*reinterpret_cast<qword*>(pBufferIn+60);
#define	V_OrOdd(X,Y) V_Or(X,Y)
//#define	V_OrOdd(X,Y) si_shufb(Y,Y,*reinterpret_cast<qword*>(&reinterpret_cast<uint8*>(qOrSelect)[si_to_int(si_rotqbii(si_gb(X),4))]))
//#define	V_OrOdd(X,Y) si_shufb(Y,Y,qOrSelect[si_to_int(si_gb(X))])
				Mask00							=	V_OrOdd(Mask00,V_CmpGt(Z00,rZ00));	Mask01							=	V_OrOdd(Mask01,V_CmpGt(Z01,rZ01));
				Mask10							=	V_OrOdd(Mask10,V_CmpGt(Z10,rZ10));	Mask11							=	V_OrOdd(Mask11,V_CmpGt(Z11,rZ11));
				Mask20							=	V_OrOdd(Mask20,V_CmpGt(Z20,rZ20));	Mask21							=	V_OrOdd(Mask21,V_CmpGt(Z21,rZ21));
				Mask30							=	V_OrOdd(Mask30,V_CmpGt(Z30,rZ30));	Mask31							=	V_OrOdd(Mask31,V_CmpGt(Z31,rZ31));
				Mask40							=	V_OrOdd(Mask40,V_CmpGt(Z40,rZ40));	Mask41							=	V_OrOdd(Mask41,V_CmpGt(Z41,rZ41));
				Mask50							=	V_OrOdd(Mask50,V_CmpGt(Z50,rZ50));	Mask51							=	V_OrOdd(Mask51,V_CmpGt(Z51,rZ51));
				Mask60							=	V_OrOdd(Mask60,V_CmpGt(Z60,rZ60));	Mask61							=	V_OrOdd(Mask61,V_CmpGt(Z61,rZ61));
				Mask70							=	V_OrOdd(Mask70,V_CmpGt(Z70,rZ70));	Mask71							=	V_OrOdd(Mask71,V_CmpGt(Z71,rZ71));
				rZ00								=	V_Select(Z00,rZ00,Mask00);					rZ01								=	V_Select(Z01,rZ01,Mask01);
				rZ10								=	V_Select(Z10,rZ10,Mask10);					rZ11								=	V_Select(Z11,rZ11,Mask11);
				rZ20								=	V_Select(Z20,rZ20,Mask20);					rZ21								=	V_Select(Z21,rZ21,Mask21);
				rZ30								=	V_Select(Z30,rZ30,Mask30);					rZ31								=	V_Select(Z31,rZ31,Mask31);
				rZ40								=	V_Select(Z40,rZ40,Mask40);					rZ41								=	V_Select(Z41,rZ41,Mask41);
				rZ50								=	V_Select(Z50,rZ50,Mask50);					rZ51								=	V_Select(Z51,rZ51,Mask51);
				rZ60								=	V_Select(Z60,rZ60,Mask60);					rZ61								=	V_Select(Z61,rZ61,Mask61);
				rZ70								=	V_Select(Z70,rZ70,Mask70);					rZ71								=	V_Select(Z71,rZ71,Mask71);
				*reinterpret_cast<qword*>(pBufferOut)			=	rZ00;
				*reinterpret_cast<qword*>(pBufferOut+4)		=	rZ01;
				*reinterpret_cast<qword*>(pBufferOut+8)		=	rZ10;
				*reinterpret_cast<qword*>(pBufferOut+12)	=	rZ11;
				*reinterpret_cast<qword*>(pBufferOut+16)	=	rZ20;
				*reinterpret_cast<qword*>(pBufferOut+20)	=	rZ21;
				*reinterpret_cast<qword*>(pBufferOut+24)	=	rZ30;
				*reinterpret_cast<qword*>(pBufferOut+28)	=	rZ31;
				*reinterpret_cast<qword*>(pBufferOut+32)	=	rZ40;
				*reinterpret_cast<qword*>(pBufferOut+36)	=	rZ41;
				*reinterpret_cast<qword*>(pBufferOut+40)	=	rZ50;
				*reinterpret_cast<qword*>(pBufferOut+44)	=	rZ51;
				*reinterpret_cast<qword*>(pBufferOut+48)	=	rZ60;
				*reinterpret_cast<qword*>(pBufferOut+52)	=	rZ61;
				*reinterpret_cast<qword*>(pBufferOut+56)	=	rZ70;
				*reinterpret_cast<qword*>(pBufferOut+60)	=	rZ71;
#if defined(DXPSR_PROFILE_DETAILED)
				asm volatile("nop");
				uint32 T1	= rdtsc();
				asm volatile("nop");
				DXPSRAS_GLOBAL_PERFCOUNTER_F0	+=	T0-T1;
				DXPSRAS_GLOBAL_PERFCOUNTER_F1++;
#endif
			}



		}
	}
#if defined(DXPSR_PROFILE_DETAILED)
	asm volatile("nop");
	uint32 T1	= rdtsc();
	asm volatile("nop");
	DXPSRAS_GLOBAL_PERFCOUNTER_W0	+=	T0-T1;
	DXPSRAS_GLOBAL_PERFCOUNTER_W1++;
#endif
}

// Clears the depth buffer at g_pZBuffer to 1.0f for the viewport described by rJob.
//
// DXPSRAS_Buffer is first filled with 1.0f and is then pushed out block by block
// with memtransfer_to_main (every transfer uses ID 0). The function waits on
// memtransfer_sync_id(0) before returning.
// When DXPSR_PROFILE_TIMINGS is defined, the g_Stats counters listed below are
// zeroed first and the rdtsc() delta of the clear is added to g_Stats.m_TimeClear.
//
// rJob: only ViewPortSizeX()/ViewPortSizeY() are read, to size the clear.
void ClearZBuffer(CDXPSRJob& rJob)
{
#ifdef DXPSR_PROFILE_TIMINGS
	//start every profiling counter from zero
	g_Stats.m_ZCPassed							=	0;
	g_Stats.m_ZCCulled							=	0;
	g_Stats.m_ZCUpdate							=	0;
	g_Stats.m_VBVertexAssemblyStall	=	0;
	g_Stats.m_VBVertexAssembly			=	0;
	g_Stats.m_ZBCoarseFineTest			=	0;
	g_Stats.m_ZBCoarsePass					=	0;
	g_Stats.m_ZBCoarseFail					=	0;
	g_Stats.m_VBCacheMiss						=	0;
	g_Stats.m_VBCacheHit						=	0;
	g_Stats.m_ZBStall								=	0;
	g_Stats.m_ZBCacheHit						=	0;
	g_Stats.m_ZBCacheMiss						=	0;
	g_Stats.m_CountTriangles				=	0;
	g_Stats.m_CountDrawcalls				=	0;
	g_Stats.m_TimeFlush							=	0;
	g_Stats.m_TimeRasTriangle				=	0;
	g_Stats.m_TimeRasVertex					=	0;
	g_Stats.m_TimeClear							=	0;

	asm volatile("nop");
	uint32 TimeBegin	=	rdtsc();
	asm volatile("nop");
#endif

	//fill the source buffer with depth 1.0f, 16 qwords per iteration (hand unrolled)
	//NOTE(review): the fill covers DXPSRAS_BlockSize qwords while each transfer below
	//sends DXPSRAS_BlockSizeByte bytes - presumably the same amount, confirm at the defines
	const qword qDepthOne		=	(qword)(vec_float4){1.f,1.f,1.f,1.f};
	qword* __restrict pFill	=	reinterpret_cast<qword*>(DXPSRAS_Buffer);
	for(uint32 Base=0;Base<DXPSRAS_BlockSize;Base+=16)
	{
		pFill[Base+0xF]	=	qDepthOne;
		pFill[Base+0xE]	=	qDepthOne;
		pFill[Base+0xD]	=	qDepthOne;
		pFill[Base+0xC]	=	qDepthOne;
		pFill[Base+0xB]	=	qDepthOne;
		pFill[Base+0xA]	=	qDepthOne;
		pFill[Base+0x9]	=	qDepthOne;
		pFill[Base+0x8]	=	qDepthOne;
		pFill[Base+0x7]	=	qDepthOne;
		pFill[Base+0x6]	=	qDepthOne;
		pFill[Base+0x5]	=	qDepthOne;
		pFill[Base+0x4]	=	qDepthOne;
		pFill[Base+0x3]	=	qDepthOne;
		pFill[Base+0x2]	=	qDepthOne;
		pFill[Base+0x1]	=	qDepthOne;
		pFill[Base]			=	qDepthOne;
	}

	//number of blocks needed for the viewport's float depth values, rounded up;
	//the last block may reach past the viewport data
	//NOTE(review): presumably g_pZBuffer is padded to a whole block - verify at the allocation
	const uint32 BlockCount	=	(rJob.ViewPortSizeX()*rJob.ViewPortSizeY()*sizeof(float)+DXPSRAS_BlockSizeByte-1)/DXPSRAS_BlockSizeByte;

	//queue one transfer of the filled buffer per destination block, all on ID 0
	uint8* pDst					=	reinterpret_cast<uint8*>(g_pZBuffer);
	uint32 BlocksLeft		=	BlockCount;
	while(BlocksLeft!=0)
	{
		memtransfer_to_main(pDst,DXPSRAS_Buffer,DXPSRAS_BlockSizeByte,0);
		pDst+=DXPSRAS_BlockSizeByte;
		--BlocksLeft;
	}

	//wait on ID 0, the ID used by all transfers queued above
	memtransfer_sync_id(0);

#ifdef DXPSR_PROFILE_TIMINGS
	asm volatile("nop");
	uint32 TimeEnd	=	rdtsc();
	asm volatile("nop");
	//begin-end, the same operand order as the other profiling sites in this file
	g_Stats.m_TimeClear	+=	TimeBegin-TimeEnd;
#endif
}

void CopyZBuffer(CDXPSRJob& rJob)
{
	//ZBCacheFlush
	{
		SPU_DOMAIN_LOCAL tdDXPSRDepth*	DXPSRAS_ZB_pCache	=	SPU_LOCAL_PTR(reinterpret_cast<tdDXPSRDepth*>(&DXPSRAS_Buffer[DXPSRAS_ZB_Offset]));
		for(uint32 a=0;a<DXPSRAS_ZB_BlockCount;a++)
		{
			SPU_DOMAIN_LOCAL tdDXPSRDepth* pCache	=	SPU_LOCAL_PTR(&DXPSRAS_ZB_pCache[a*DXPSRAS_ZB_BlockSize]);
			IF(DXPSRAS_ZB_CacheEntry[a]!=(uint16)~0u,1)
				memtransfer_to_main(SPU_MAIN_PTR(g_pZBuffer+((uint32)DXPSRAS_ZB_CacheEntry[a])*DXPSRAS_ZB_BlockSize),pCache,DXPSRAS_ZB_BlockSizeByte,0);
		}
	}
#ifdef DXPSR_PROFILE_TIMINGS
		asm volatile("nop");
		uint32 T0	= rdtsc();
		asm volatile("nop");
#endif
		const uint32 ConvertSize		=	(rJob.ViewPortSizeX()*rJob.ViewPortSizeY()*sizeof(float)+DXPSRAS_BlockSizeByte-1)/DXPSRAS_BlockSizeByte;

		uint8*	pBufferSrc	=	reinterpret_cast<uint8*>(g_pZBuffer);
		uint8*	pBufferDst	=	reinterpret_cast<uint8*>(rJob.DepthBuffer());

#if defined(DXPSR_PROFILE_TIMINGS) || !defined(BYPASS_CACHE)
//		__spu_flush_cache_range(reinterpret_cast<uint32>(pBufferSrc), reinterpret_cast<uint32>(&pBufferSrc[ConvertSize]));
		__spu_flush_cache();
#endif
		
		memtransfer_from_main_fenced(DXPSRAS_Buffer,									pBufferSrc,DXPSRAS_BlockSizeByte,0);pBufferSrc+=DXPSRAS_BlockSizeByte;
		memtransfer_from_main(DXPSRAS_Buffer+DXPSRAS_BlockSizeByte,		pBufferSrc,DXPSRAS_BlockSizeByte,1);pBufferSrc+=DXPSRAS_BlockSizeByte;
		memtransfer_from_main(DXPSRAS_Buffer+DXPSRAS_BlockSizeByte*2,	pBufferSrc,DXPSRAS_BlockSizeByte,2);pBufferSrc+=DXPSRAS_BlockSizeByte;
		memtransfer_from_main(DXPSRAS_Buffer+DXPSRAS_BlockSizeByte*3,	pBufferSrc,DXPSRAS_BlockSizeByte,3);pBufferSrc+=DXPSRAS_BlockSizeByte;

		const qword qSELECT	=	{0x00,0x01,0x02,0x80,0x04,0x05,0x06,0x80,0x08,0x09,0x0A,0x80,0x0C,0x0D,0x0E,0x80};

		for(uint32 a=0;a<ConvertSize;a++)
		{
			//Sync dma to this Block
			const uint32 BufferID	=	a&3;
			qword*  __restrict pBufferTmpIn	=	reinterpret_cast<qword*>(DXPSRAS_Buffer+DXPSRAS_BlockSizeByte*BufferID);
			qword*  __restrict pBufferTmpOut=	reinterpret_cast<qword*>(DXPSRAS_Buffer+DXPSRAS_BlockSizeByte*(BufferID+4));
			memtransfer_sync_id(BufferID);

			for(uint32 xIn=0,xOut=0;xIn<DXPSRAS_BlockSize;xIn+=16,xOut+=2)
			{
				qword	R0	=	pBufferTmpIn[xIn+0x0];
				qword	R1	=	pBufferTmpIn[xIn+0x1];
				qword	R2	=	pBufferTmpIn[xIn+0x2];
				qword	R3	=	pBufferTmpIn[xIn+0x3];
				qword	R4	=	pBufferTmpIn[xIn+0x4];
				qword	R5	=	pBufferTmpIn[xIn+0x5];
				qword	R6	=	pBufferTmpIn[xIn+0x6];
				qword	R7	=	pBufferTmpIn[xIn+0x7];
				qword	R8	=	pBufferTmpIn[xIn+0x8];
				qword	R9	=	pBufferTmpIn[xIn+0x9];
				qword	RA	=	pBufferTmpIn[xIn+0xA];
				qword	RB	=	pBufferTmpIn[xIn+0xB];
				qword	RC	=	pBufferTmpIn[xIn+0xC];
				qword	RD	=	pBufferTmpIn[xIn+0xD];
				qword	RE	=	pBufferTmpIn[xIn+0xE];
				qword	RF	=	pBufferTmpIn[xIn+0xF];
				R0	=	V_F32ToU32Sat(R0);//mul by range, clamp to 0.f-1.f, convert to uint32
				R1	=	V_F32ToU32Sat(R1);
				R2	=	V_F32ToU32Sat(R2);
				R3	=	V_F32ToU32Sat(R3);
				R4	=	V_F32ToU32Sat(R4);
				R5	=	V_F32ToU32Sat(R5);
				R6	=	V_F32ToU32Sat(R6);
				R7	=	V_F32ToU32Sat(R7);
				R8	=	V_F32ToU32Sat(R8);
				R9	=	V_F32ToU32Sat(R9);
				RA	=	V_F32ToU32Sat(RA);
				RB	=	V_F32ToU32Sat(RB);
				RC	=	V_F32ToU32Sat(RC);
				RD	=	V_F32ToU32Sat(RD);
				RE	=	V_F32ToU32Sat(RE);
				RF	=	V_F32ToU32Sat(RF);
				R0	=	V_Swizzle(R0,qSELECT);	//shuffle to
				R1	=	V_Swizzle(R1,qSELECT);
				R2	=	V_Swizzle(R2,qSELECT);
				R3	=	V_Swizzle(R3,qSELECT);
				R4	=	V_Swizzle(R4,qSELECT);
				R5	=	V_Swizzle(R5,qSELECT);
				R6	=	V_Swizzle(R6,qSELECT);
				R7	=	V_Swizzle(R7,qSELECT);
				R8	=	V_Swizzle(R8,qSELECT);
				R9	=	V_Swizzle(R9,qSELECT);
				RA	=	V_Swizzle(RA,qSELECT);
				RB	=	V_Swizzle(RB,qSELECT);
				RC	=	V_Swizzle(RC,qSELECT);
				RD	=	V_Swizzle(RD,qSELECT);
				RE	=	V_Swizzle(RE,qSELECT);
				RF	=	V_Swizzle(RF,qSELECT);
				pBufferTmpOut[xOut+0x0]																	=	R0;	//unswizzle
				pBufferTmpOut[xOut+0x1]																	=	R1;
				pBufferTmpOut[xOut+0x0+DXPS_RASTERIZER_RESOLUTION/4]		=	R2;
				pBufferTmpOut[xOut+0x1+DXPS_RASTERIZER_RESOLUTION/4]		=	R3;
				pBufferTmpOut[xOut+0x0+DXPS_RASTERIZER_RESOLUTION/4*2]	=	R4;
				pBufferTmpOut[xOut+0x1+DXPS_RASTERIZER_RESOLUTION/4*2]	=	R5;
				pBufferTmpOut[xOut+0x0+DXPS_RASTERIZER_RESOLUTION/4*3]	=	R6;
				pBufferTmpOut[xOut+0x1+DXPS_RASTERIZER_RESOLUTION/4*3]	=	R7;
				pBufferTmpOut[xOut+0x0+DXPS_RASTERIZER_RESOLUTION/4*4]	=	R8;
				pBufferTmpOut[xOut+0x1+DXPS_RASTERIZER_RESOLUTION/4*4]	=	R9;
				pBufferTmpOut[xOut+0x0+DXPS_RASTERIZER_RESOLUTION/4*5]	=	RA;
				pBufferTmpOut[xOut+0x1+DXPS_RASTERIZER_RESOLUTION/4*5]	=	RB;
				pBufferTmpOut[xOut+0x0+DXPS_RASTERIZER_RESOLUTION/4*6]	=	RC;
				pBufferTmpOut[xOut+0x1+DXPS_RASTERIZER_RESOLUTION/4*6]	=	RD;
				pBufferTmpOut[xOut+0x0+DXPS_RASTERIZER_RESOLUTION/4*7]	=	RE;
				pBufferTmpOut[xOut+0x1+DXPS_RASTERIZER_RESOLUTION/4*7]	=	RF;
			}
			//prefetch a+3 Block
			IF(a+3<ConvertSize,1)
			{
				memtransfer_from_main(pBufferTmpIn,pBufferSrc,DXPSRAS_BlockSizeByte,BufferID);
				pBufferSrc+=DXPSRAS_BlockSizeByte;
			}
			memtransfer_to_main(pBufferDst,pBufferTmpOut,DXPSRAS_BlockSizeByte,BufferID);
			pBufferDst+=DXPSRAS_BlockSizeByte;
		}
		memtransfer_sync_id((ConvertSize-1)&3);
#ifdef DXPSR_PROFILE_TIMINGS
		asm volatile("nop");
		uint32 T1	= rdtsc();
		asm volatile("nop");
		g_Stats.m_TimeFlush	+=	T0-T1;

#if defined(DXPSR_PROFILE_VB)
		printf("DXPSRasterizer timming: %d %d %d %d = %d Tri:%d DP:%d VBC:%d/%d VBA:%d/%d\n",g_Stats.m_TimeClear,
																																													g_Stats.m_TimeRasVertex,
																																													g_Stats.m_TimeRasTriangle,
																																													g_Stats.m_TimeFlush,
																																													g_Stats.m_TimeClear+g_Stats.m_TimeRasVertex+g_Stats.m_TimeRasTriangle+g_Stats.m_TimeFlush,
																																													g_Stats.m_CountTriangles/3,g_Stats.m_CountDrawcalls,
																																													g_Stats.m_VBCacheHit,
																																													g_Stats.m_VBCacheMiss,
																																													g_Stats.m_VBVertexAssemblyStall,
																																													g_Stats.m_VBVertexAssembly);
#else
		printf("DXPSRasterizer timming: %d %d %d %d = %d ZB:%d/%d/%d Coarse:%d/%d/%d\n",g_Stats.m_TimeClear,
																																										g_Stats.m_TimeRasVertex,
																																										g_Stats.m_TimeRasTriangle,
																																										g_Stats.m_TimeFlush,
																																										g_Stats.m_TimeClear+g_Stats.m_TimeRasVertex+g_Stats.m_TimeRasTriangle+g_Stats.m_TimeFlush,
																																										g_Stats.m_ZBCacheHit,
																																										g_Stats.m_ZBCacheMiss,
																																										g_Stats.m_ZBStall,
																																										g_Stats.m_ZBCoarseFail,
																																										g_Stats.m_ZBCoarsePass,
																																										g_Stats.m_ZBCoarseFineTest);
#endif
#endif
		//artrium
		//SPU1: DXPSRasterizer timming: 811415 9098525 115360748 1390944 = 126661632	1.5833s
		//SPU1: DXPSRasterizer timming:   3966 9104411 114083745 1384181 = 124576303	1.5572s //dma-async-clear, 19. GB/s
		//SPU1: DXPSRasterizer timming:	  4034 9165727 112305036	 25701 = 121500498	1.5188s //pipelined float2uint + endianswap from mainmem2rsxmem	2*3GB/s
		//SPU1: DXPSRasterizer timming:   4055 7180192 110539954   26397 = 117750598  1.4719s	//dma-async-transfer for indices
		//SPU1: DXPSRasterizer timming:   4091 1749387 105984659   25851 = 107763988					//dma-async-transfer for indices and cache system, cachehits about 700k, misses 3k, perframe
		//SPU1: DXPSRasterizer timming:   3984 1430065 140649597   25792 = 142109438					//unrolling loops
		//SPU1: DXPSRasterizer timming:   4126 1239481  18498703   25796 =  19768106  0.2471s
		//SPU1: DXPSRasterizer timming:   4012 4385190   8644346   25794 =  13059342  0.1632s	
		//SPU1: DXPSRasterizer timming:   4169 2290072  36095627   25631 =  38415499  0.4802s 28.6ms for vertices
		//SPU1: DXPSRasterizer timming:   4146 2521997  28621508   25646 =  31173297  0.3896s	//inverse matrix by hand
		//SPU1: DXPSRasterizer timming:   4134 2507150 153730213   25625 = 156267122	1.9533s
		//SPU1: DXPSRasterizer timming:   4176 4395429  71296080   25649 =  75721334					//ZBuffer cache
		//SPU1: DXPSRasterizer timming:		4414 3701091	48502432	 25629 =  52233566					//cache optimizations
		//SPU1: DXPSRasterizer timming:   3901 3662944  45958254   25670 =  49650769	0.6206s	//cache optimizations
		//SPU1: DXPSRasterizer timming:   4215 4388985  27502065   25652 =  31920917
		//SPU1: DXPSRasterizer timming:   4287 4381323  18879703   25656 =  23290969
		//SPU1: DXPSRasterizer timming:   4122 4389746  19502319   25652 =  23921839	0.2990s	//vectorized pixel evaluation during rasterization
		//SPU1: DXPSRasterizer timming:   4134 3692467  12868425   25649 =  16590675  0.2074s	//unrolling rasterization in x
		//SPU1: DXPSRasterizer timming:   4133 3623826  11270244   25655 =  14923858	0.1865s	//unrolling rasterization in y
		//SPU1: DXPSRasterizer timming:   4834 3542745   9667679   25418 =  13240676  0.1655s	//delayed ZBuffer sync
		//SPU1: DXPSRasterizer timming:   4097 1262755   4907831   25371 =   6200054	0.07750	//coarse rasterizer
		//SPU1: DXPSRasterizer timming:   4319 1256770   4633181   25426 =   5919696	0.07400	//vertexcache size from 4*2kb -> 16*2kb
		//SPU1: DXPSRasterizer timming:   4292 1045158   4516122   25364 =   5590936	0.07156	//
		//SPU1: DXPSRasterizer timming:   4263 1022005   4498324   25494 =   5550086	0.06938	//
		//SPU1: DXPSRasterizer timming:   4286 1021891   4516990   25607 =   5568774
		//SPU1: DXPSRasterizer timming:   4289 1021444   4347154   25421 =   5398308	0.06748	//without trivial accept for quads, triangles are too tiny to be effective
		//SPU1: DXPSRasterizer timming:   5016 1010093   4307827   25354 =   5348290  0.06685	//
		//SPU1: DXPSRasterizer timming:   4937  990141   4181639   25358 =   5202075	0.0650s	//cached vertices transformed by inverse of last mat * current_mat
		//SPU1: DXPSRasterizer timming:   4910  991676   4119556   25381 =   5141523	0.0643s	//removed *this cache lookups by making memberfunctions globalfunctions
		//SPU1: DXPSRasterizer timming:   4980 1000865   4039150   25454 =   5070449	0.0634s	//by hand instruction shedulling
		//SPU1: DXPSRasterizer timming:   5215  999933   4023992   25383 =   5054523	0.0632s	//
		//SPU1: DXPSRasterizer timming:   4989  999357   3863998   25374 =   4893718	0.0612s	//merged compareGT and AND
		//SPU1: DXPSRasterizer timming:   4961  604702   4050173   25340 =   4685176  0.0586s	//doubled post transform cache size -> 3x less slow-path transforms
		//SPU1: DXPSRasterizer timming:   4952  650878   3390132   25365 =	 4071327	0.0509s	//zero area triangle and zero pixel triangle culling
		//SPU1: DXPSRasterizer timming:   4932  200660   3480089   25378 =   3711059	0.0464s	//doubling Vertex cache, reducing ZBuffer
		//SPU1: DXPSRasterizer timming:   4920  196277   3441691   25368 =   3668256	0.0459s	//uint16 ZB-cache IDs
		//SPU1: DXPSRasterizer timming:   4969  193655   3228077   25348 =   3452049	0.0432s	//zbuffer prefetch

			//Crysis area [shift+f6]
		//SPU1: DXPSRasterizer timming:		4811	285520	 4864743   25390 =   5180464	0.0648s	//
		//SPU1: DXPSRasterizer timming:   4951  192310   2112710   26679 =   2336650	0.0292s	//[Tri:280994 DP:569 9,6MTri, transform:116MTri] cached vertices transformed by inverse of last mat * current_mat
		//SPU1: DXPSRasterizer timming:   4793  189318   2023511   34425 =   2252047	0.0282s	//[9.98MTri/s] optimized backface culling
		//SPU1: DXPSRasterizer timming:   4799  192155   1981551   31735 =   2210240	0.0276s	//[10.2MTri/s] by hand instruction shedulling
		//SPU1: DXPSRasterizer timming:   4793  191407   1877089   34025 =   2107314	0.0263s
		//SPU1: DXPSRasterizer timming:   4743  148640   1923937   34060 =   2111380	0.0264s	//[transform: 151MTris] doubled post transform cache size -> 2x less slow-path transforms
		//SPU1: DXPSRasterizer timming:   4816  150214   1958283   34400 =   2147713	0.0268s	//zcull test
		//SPU1: DXPSRasterizer timming:   4831  145172   3879991   25375 =   4055369	0.0507s	//fixed post transform cache that was eating trianglse
		//SPU1: DXPSRasterizer timming:   4800  153020   3525531   29312 =   3712663	0.0464s	//zero area triangle and zero pixel triangle culling
		//SPU1: DXPSRasterizer timming:   4880  114528   3629536   25484 =	 3774428						//doubling Vertex cache, reducing ZBuffer
		//SPU1: DXPSRasterizer timming:   4860  112161   3628649   29190 =   3774860
		//SPU1: DXPSRasterizer timming:		4905	110997	 3615992   28804 =	 3760698					//zbuffer prefetch

		//urban area
		//SPU1: DXPSRasterizer timming:   5075 2328889  13678792   25368 =  16038124	0.2005s
		//SPU1: DXPSRasterizer timming:   4785 2021575  13323713   25391 =  15375464	0.1922s	//optimized backface culling
		//SPU1: DXPSRasterizer timming:   4914 2313237  13040090   25355 =  15383596	0.1923s	//merged compareGT and AND
		//SPU1: DXPSRasterizer timming:   5033 1444472  13102029   25436 =  14576970	0.1822s	//doubled post transform cache size -> 3x less slow-path transforms
		//SPU1: DXPSRasterizer timming:   4977 1559753  12316129   25332 =  13906191	0.1738s	//zero area triangle and zero pixel triangle culling
		//SPU1: DXPSRasterizer timming:   5038  968430  12765725   25360 =  13764553	0.1721s		//doubling Vertex cache, reducing ZBuffer
		//SPU1: DXPSRasterizer timming:   4860  969798  12786301   25364 =  13786323
		//SPU1: DXPSRasterizer timming:		5025	966008	11785221	 25355 =	12781609	0.1598s	//zbuffer prefetch

			//crysis 2
		//SPU1: DXPSRasterizer timming:   4999 1135695  16668551   25380 =  17834625	0,2229s	//Tri:353723 DP:288 ZC:205691 5864/430261
		//SPU1: DXPSRasterizer timming:   5016 1146850  10058777   25361 =  11236004	0.1405s	//disabled zcull
		//SPU1: DXPSRasterizer timming:   5045 1176269   9467729   25366 =  10674409	0.1334s	//fast z-interpolation
		//SPU1: DXPSRasterizer timming:   5061 1102033   9283416   25378 =  10415888  0.1302s	//tweajed z interpolation
		//SPU2: DXPSRasterizer timming:   4968  765516   8260709   25365 =   9056558	0.1132s //Tri:446655 DP:266 VBC:0/0 VBA:0/0

//16kb post transform cache
//artrium:SPU1: DXPSRasterizer timming: 4578 988141 397628 25356 = 1415703 Tri:250805 DP:147 VBC:595302/2283 VBA:117479/210922
//artrium:SPU1: Fast/Middle/Slow:22/61/64
//crysis:SPU1: DXPSRasterizer timming: 4794 194029 381625 36574 = 617022 Tri:280994 DP:569 VBC:58233/249 VBA:30872/58580
//crysis:SPU1: Fast/Middle/Slow:468/94/7
//urban:SPU1: DXPSRasterizer timming: 4883 2316674 959360 25386 = 3306303 Tri:611957 DP:435 VBC:1350715/5588 VBA:290579/542524
//urban:SPU1: Fast/Middle/Slow:76/220/139

//32kb post transform cache
//artrium:SPU1: DXPSRasterizer timming: 4714 644807 379446 32910 = 1061877 Tri:257791 DP:154 VBC:762589/1139 VBA:111998/207389
//artrium:SPU1: Fast/Middle/Slow:26/106/22
//crysis:SPU1: DXPSRasterizer timming: 4866 154617 378438 32869 = 570790 Tri:280994 DP:569 VBC:824427/141 VBA:31245/59060
//crysis:SPU1: Fast/Middle/Slow:467/99/3
//urban:SPU1: DXPSRasterizer timming: 4610 1550465 899592 34804 = 2489471 Tri:611957 DP:435 VBC:1809511/3365 VBA:277128/523482
//urban:SPU1: Fast/Middle/Slow:90/291/54

}

// Selects the drawcall to process after the one at dcVtxPosCur, preferring drawcalls that
// share its DXPSRAS_DCBuffer entry (the vertex position) so they are visited back to back.
//
//	dcVtxPosCur			in:  drawcall currently being processed; its slot is set to NULL (= used).
//									out: the drawcall selected next (left untouched when nothing is left).
//	dcVtxPosUnused	in:  first drawcall that has not been handed out yet, or dcVtxPosEnd.
//									out: advanced past used slots whenever the old value gets handed out.
//	dcVtxPosEnd			one-past-the-last drawcall position; never handed out as a drawcall.
//
// Returns the position of the next drawcall, or dcVtxPosEnd once every drawcall has been handed out.
RILNE uint32 CDXPSRDRasterizer::NextDCByVtxPos(uint32& dcVtxPosCur, uint32& dcVtxPosUnused, uint32 dcVtxPosEnd)
{
	const void* cpCurVtxPos = DXPSRAS_DCBuffer[dcVtxPosCur];
	DXPSRAS_DCBuffer[dcVtxPosCur] = NULL;//mark used

	//look for a later drawcall with the same vertex position
	//(end position is tested first so the slot at dcVtxPosEnd is never read)
	uint32 curCheckPos = NextDrawcallID(dcVtxPosCur);
	while(curCheckPos != dcVtxPosEnd && DXPSRAS_DCBuffer[curCheckPos] != cpCurVtxPos)
		curCheckPos = NextDrawcallID(curCheckPos);

	IF(curCheckPos != dcVtxPosEnd, 1)
	{
		dcVtxPosCur			= curCheckPos;
		IF(curCheckPos != dcVtxPosUnused,1)
			return curCheckPos;
		//the match is the first unused drawcall itself -> dcVtxPosUnused has to move on below
	}
	else
	{
		//no match left, fall back to the first unused drawcall
		IF(dcVtxPosUnused == dcVtxPosEnd, 0)
			return dcVtxPosEnd;//everything has been handed out
		dcVtxPosCur = dcVtxPosUnused;
	}

	//dcVtxPosCur now is the old dcVtxPosUnused, advance dcVtxPosUnused to the next unused slot.
	//Running into dcVtxPosEnd only means nothing is left AFTER dcVtxPosCur; dcVtxPosCur itself is
	//still pending and must be returned (returning dcVtxPosEnd here would silently drop it).
	curCheckPos = NextDrawcallID(dcVtxPosUnused);
	while(curCheckPos != dcVtxPosEnd && DXPSRAS_DCBuffer[curCheckPos] == NULL)//skip used ones
		curCheckPos = NextDrawcallID(curCheckPos);
	dcVtxPosUnused = curCheckPos;
	return dcVtxPosCur;
}

// Processes one rasterizer job: copies the job description to a local CDXPSRJob, clears the
// Z-buffer cache and Z-buffer, rasterizes every drawcall in [DrawcallStart(), DrawcallEnd())
// and finally calls CopyZBuffer for the job.
//
//	pJob2	job description in main memory (accessed through SPU_MAIN_PTR).
//
// Without BYPASS_CACHE the drawcalls are memcpy'd one at a time in NextDrawcallID order.
// With BYPASS_CACHE they are fetched with memtransfer_from_main into a two entry buffer and
// visited in the order chosen by NextDCByVtxPos (drawcalls sharing a vertex position first).
#if !defined(CRYCG_CM)
SPU_ENTRY(DXPSRasterize)
#endif
void CDXPSRDRasterizer::Draw(SPU_DOMAIN_MAIN CDXPSRJob* pJob2)
{
	//make the member buffer reachable through the g_pZBuffer global
	g_pZBuffer	=	m_Buffer;

	//local copy of the job description
	CDXPSRJob rJob _ALIGN(128);
#if defined(BYPASS_CACHE)
	//both transfers use id 0 and are waited for by memtransfer_sync_id(0) further down
	memtransfer_from_main(DXPSRAS_DCBuffer, m_DrawCallsVtxPos, sizeof(m_DrawCallsVtxPos), 0);
	memtransfer_from_main(&rJob,SPU_MAIN_PTR(pJob2),sizeof(CDXPSRJob),0);
#else
	memcpy(&rJob,SPU_MAIN_PTR(pJob2),sizeof(CDXPSRJob));
#endif

#ifdef DXPSR_PROFILE_TIMINGS
	//work on the global stats copy, written back to m_Stats at the end of this function
	memcpy(&g_Stats,&m_Stats,sizeof(SDXPSRasStats));
#endif

	ZBCacheClear();

#if defined(DXPSR_PROFILE_DETAILED)
	//reset the detailed perf counters printed at the end of this function
	DXPSRAS_GLOBAL_PERFCOUNTER_F0	=
	DXPSRAS_GLOBAL_PERFCOUNTER_F1	=
	DXPSRAS_GLOBAL_PERFCOUNTER_C0	=
	DXPSRAS_GLOBAL_PERFCOUNTER_C1	=
	DXPSRAS_GLOBAL_PERFCOUNTER_S0	=
	DXPSRAS_GLOBAL_PERFCOUNTER_S1	=
	DXPSRAS_GLOBAL_PERFCOUNTER_W0	=
	DXPSRAS_GLOBAL_PERFCOUNTER_W1	=	0;
#endif
#if defined(BYPASS_CACHE)
	//rJob and DXPSRAS_DCBuffer are read from here on
	memtransfer_sync_id(0);
#endif
	//a: drawcall position driving the loop below; drawCallEnd: one-past-the-last position
	uint32 a										= rJob.DrawcallStart();
	const uint32 drawCallEnd		= rJob.DrawcallEnd();
#if defined(BYPASS_CACHE)	
	//state for NextDCByVtxPos: current drawcall and first drawcall not handed out yet
	const uint32 dcVtxPosEnd		= drawCallEnd;
	uint32 dcVtxPosCur					= a;
	uint32 dcVtxPosUnused				= NextDrawcallID(a);

	//two drawcall slots: one is rasterized while the other one is being fetched
	CDXPSRDrawCall DrawCallBuf[2];
	uint32 curDrawCallBuf = 0;
	//start fetching the first drawcall, ClearZBuffer below runs before the sync
	memtransfer_from_main(DrawCallBuf,&m_DrawCalls[a],sizeof(CDXPSRDrawCall),curDrawCallBuf);
#endif
	ClearZBuffer(rJob);
#if defined(BYPASS_CACHE)	
	//first drawcall has to be complete before the loop reads it
	memtransfer_sync_id(curDrawCallBuf);
#endif
	//vertex buffer of the previous drawcall if it took the cached path, 0 otherwise
	const void* pBuffer=0;
//	uint32 Hit=0;
	//matrix of the previous cached-path drawcall (assigned after each Rasterize24FullCached call)
	Matrix44A ViewMat;
	//path counters, only consumed by the commented out printf below
	int Fast=0,Middle=0,Slow=0;
	WHILE(a!=drawCallEnd,1)
	{
#if !defined(BYPASS_CACHE)
		CDXPSRDrawCall DrawCall;
		memcpy(&DrawCall,&m_DrawCalls[a],sizeof(CDXPSRDrawCall));
		a=NextDrawcallID(a);
#else
		//a becomes the drawcall that follows the one rasterized in this iteration
		a=NextDCByVtxPos(dcVtxPosCur, dcVtxPosUnused, dcVtxPosEnd);

		//slot fetched during the previous iteration (or before the loop)
		//NOTE(review): the sync for this slot is commented out, so nothing in this function waits
		//for its transfer except for the very first drawcall - confirm it is guaranteed elsewhere
		CDXPSRDrawCall& __restrict DrawCall = DrawCallBuf[curDrawCallBuf];
//		memtransfer_sync_id(curDrawCallBuf);
		curDrawCallBuf ^= 1;
		//fetch the following drawcall into the other slot while this one is rasterized;
		//this is issued even when a==drawCallEnd, i.e. m_DrawCalls[drawCallEnd] is transferred too
		memtransfer_from_main(&DrawCallBuf[curDrawCallBuf],&m_DrawCalls[a],sizeof(CDXPSRDrawCall),curDrawCallBuf);
#endif
		//cached path: only for 24 byte vertex stride and a vertex count below
		//DXPSRAS_VBC_BlockCount*DXPSRAS_VBC_BlockSize
		IF(DrawCall.VertexStride()==24 && DrawCall.VertexCount()<DXPSRAS_VBC_BlockCount*DXPSRAS_VBC_BlockSize,1)
		{
			bool Quickpath;
			if(pBuffer==DrawCall.VtxBufferPos())
			{
				//same vertex buffer as the previous cached drawcall:
				//pass inverse(previous matrix)*current matrix together with Quickpath=true
				Quickpath	=	true;
				Matrix44A InvMat=ViewMat;
				InvMat.Invert();
				ViewMat	=	InvMat*DrawCall.ViewMat();
				Fast++;
			}
			else
			{
				//different vertex buffer: pass the drawcall matrix unchanged
				Quickpath	=	false;
				ViewMat	=	DrawCall.ViewMat();
				Middle++;
			}
			Rasterize24FullCached(SPU_LOCAL_REF(ViewMat),
														SPU_MAIN_PTR(DrawCall.IndexBuffer()),
														DrawCall.IndexCount(),
														SPU_MAIN_PTR(DrawCall.VtxBufferPos()),
														DrawCall.VertexStride(),
														DrawCall.VertexCount(),
														Quickpath);
			//remember matrix and vertex buffer for the Quickpath test of the next drawcall
			ViewMat	=	DrawCall.ViewMat();
			pBuffer	=	DrawCall.VtxBufferPos();
		}
		else
		{
			//generic path
			Slow++;
			Rasterize(SPU_LOCAL_REF(DrawCall.ViewMat()),
								SPU_MAIN_PTR(DrawCall.IndexBuffer()),
								DrawCall.IndexCount(),
								SPU_MAIN_PTR(DrawCall.VtxBufferPos()),
								DrawCall.VertexStride());
			//if(DrawCall.VertexStride()!=24)
			//	printf("VertStr%d\n",DrawCall.VertexStride());
			//reset so the next cached-path drawcall does not match pBuffer
			pBuffer	=	0;
		}
#ifdef DXPSR_PROFILE_TIMINGS
		g_Stats.m_CountDrawcalls++;
		//accumulates the index count, not a triangle count
		g_Stats.m_CountTriangles	+=	DrawCall.IndexCount();
#endif
	}
#if defined(BYPASS_CACHE)
	//NOTE(review): the last transfer issued in the loop targets DrawCallBuf on the stack and is
	//not waited for explicitly since the sync below is disabled - confirm CopyZBuffer covers it
//	memtransfer_sync_id(curDrawCallBuf);//need to sync last draw as it copies onto stack
#endif
	CopyZBuffer(rJob);

	//printf("%d %d=%f\n",Hit,g_Stats.m_CountDrawcalls,static_cast<float>(Hit)/static_cast<float>(g_Stats.m_CountDrawcalls));

	//printf("Fast/Middle/Slow:%d/%d/%d\n",Fast,Middle,Slow);
#if defined(DXPSR_PROFILE_DETAILED)
	//one line per counter pair (F/C/S/W), only printed when the second counter of the pair is non-zero
	if(DXPSRAS_GLOBAL_PERFCOUNTER_F1)
		printf("F %d %d %f %d\n",DXPSRAS_GLOBAL_PERFCOUNTER_F0,DXPSRAS_GLOBAL_PERFCOUNTER_F1,DXPSRAS_GLOBAL_PERFCOUNTER_F0*40.f/3.2f/1000000.f,DXPSRAS_GLOBAL_PERFCOUNTER_F0*40/DXPSRAS_GLOBAL_PERFCOUNTER_F1);
	if(DXPSRAS_GLOBAL_PERFCOUNTER_C1)
		printf("C %d %d %f %d\n",DXPSRAS_GLOBAL_PERFCOUNTER_C0,DXPSRAS_GLOBAL_PERFCOUNTER_C1,DXPSRAS_GLOBAL_PERFCOUNTER_C0*40.f/3.2f/1000000.f,DXPSRAS_GLOBAL_PERFCOUNTER_C0*40/DXPSRAS_GLOBAL_PERFCOUNTER_C1);
	if(DXPSRAS_GLOBAL_PERFCOUNTER_S1)
		printf("S %d %d %f %d\n",DXPSRAS_GLOBAL_PERFCOUNTER_S0,DXPSRAS_GLOBAL_PERFCOUNTER_S1,DXPSRAS_GLOBAL_PERFCOUNTER_S0*40.f/3.2f/1000000.f,DXPSRAS_GLOBAL_PERFCOUNTER_S0*40/DXPSRAS_GLOBAL_PERFCOUNTER_S1);
	if(DXPSRAS_GLOBAL_PERFCOUNTER_W1)
		printf("W %d %d %f %d\n",DXPSRAS_GLOBAL_PERFCOUNTER_W0,DXPSRAS_GLOBAL_PERFCOUNTER_W1,DXPSRAS_GLOBAL_PERFCOUNTER_W0*40.f/3.2f/1000000.f,DXPSRAS_GLOBAL_PERFCOUNTER_W0*40/DXPSRAS_GLOBAL_PERFCOUNTER_W1);
#endif
#ifdef DXPSR_PROFILE_TIMINGS
	//write the accumulated stats back to the member
	memcpy(&m_Stats,&g_Stats,sizeof(SDXPSRasStats));
#endif
}
#endif //__SPU__
#endif //CRY_DXPS_RASTERTHREAD
