////////////////////////////////////////////////////////////////////////////
//
//  Crytek Engine Source File.
//  Copyright (C), Crytek Studios, 2002.
// -------------------------------------------------------------------------
//  File name:   COcclusionCuller.h
//  Version:     v1.00
//  Created:     04/01/2008 by Michael Kopietz
//  Compilers:   Visual Studio.NET
//  Description: Occlusion Culler using hardware generated ZBuffer
// -------------------------------------------------------------------------
//  History:
//
////////////////////////////////////////////////////////////////////////////
#ifndef _ZBUFFERCULLER_H
#define _ZBUFFERCULLER_H

#if defined(XENON)

#define XENON_CULLER_VECTORIZED 1

#endif

#include <Cry_Math.h>
struct IRenderMesh;

// Storage type of one depth-buffer element ("zexel").
typedef	uint16			TZBZexel;
// Largest value a zexel can hold (all bits set). The shift is carried out in 64 bits so
// the expression does not overflow an int if TZBZexel is ever widened to 32 bits.
const uint64				TZB_MAXDEPTH	=	(static_cast<uint64>(1)<<(sizeof(TZBZexel)*8))-1;

// Equivalent to MIN(a, b), computed without a branch.
// The difference is formed in 64 bits: a plain int subtraction overflows when the
// operands are far apart (e.g. INT_MAX and -1), which flipped the selection mask and
// returned the larger value.
ILINE int min_branchless(int a, int b)
{
	const long long diff = static_cast<long long>(a) - static_cast<long long>(b);
	// Sign extend: all ones when a < b, zero otherwise.
	const int mask = static_cast<int>(diff >> 63);
	return (b & (~mask)) | (a & mask);
}


class CZBufferCuller : public Cry3DEngineBase
{
protected:
	bool											m_DebugFreez;
	// Depth buffer dimensions in zexels; a zexel is addressed as x + y*m_SizeX.
	uint32										m_SizeX;
	uint32										m_SizeY;
	// Float scale factors applied to the projected x, y and z in the scalar Rasterize().
	f32												m_fSizeX;
	f32												m_fSizeY;
	f32												m_fSizeZ;
	// Depth buffer storage, one TZBZexel per zexel.
	SPU_DOMAIN_LOCAL TZBZexel* m_ZBuffer;
	Matrix44									m_MatProj;
	Matrix44									m_MatView;
	Matrix44									m_MatViewProj;
	// Matrix whose four rows are loaded by the vectorized visibility tests to transform the box corners.
	Matrix44A									m_MatViewProjT;
	// Compared with the box bounds at the top of IsObjectVisible2() (presumably the camera position - confirm).
	Vec3											m_Position;
#if defined(PS3)
	// Main-memory addresses of the culler and of its depth buffer; recorded by the
	// SPU-side constructor and used by the SPU-side destructor for the write-back.
	CZBufferCuller*						m_pCullerPPU;
	TZBZexel*									m_ZBufferPPU;
	// Lower/upper limits the projected box extents are compared with in IsObjectVisible2().
	qword											m_VMin;
	qword											m_VMax;
#endif
#if defined(XENON)
	// Lower/upper limits the projected box extents are compared with, and clamped to, in IsObjectVisible().
	XMVECTOR									m_VMin;
	XMVECTOR									m_VMax;
#endif
	// Subtracted from the box's minimum depth in the scalar part of the Xenon IsObjectVisible().
	int32											m_Bias;
	uint32										m_RotationSafe;
	uint32										m_AccurateTest;
	// Limit the VAccumulate16() results are compared with in IsObjectVisible2().
	uint32										m_Treshold;

	f32												m_FrameTime;
	f32												m_FixedZFar;
	uint32										m_ObjectsTested;
	uint32										m_ObjectsTestedAndRejected;
	CCamera										m_Camera;
	int												m_OutdoorVisible;

	template<uint32 ROTATE,class T>
	bool											Rasterize(const T rVertices,const uint32	VCount)
														{
															int64	MinX=m_SizeX;
															int64	MaxX=0;
															int64	MinY=m_SizeY;
															int64	MaxY=0;
															int64 MinZ=TZB_MAXDEPTH;
															for(uint32 a=0;a<VCount;a++)
															{
																Vec4	V	=	rVertices[a];
																const f32 InvW	=	1.f/V.w;
																int64	X	=	static_cast<int64>((V.x*InvW*0.5f+0.5f)*m_fSizeX+0.5f);
																int64	Y	=	static_cast<int64>((V.y*InvW*0.5f+0.5f)*m_fSizeY+0.5f);
																int64 Z	=	static_cast<int64>(V.z*InvW*m_fSizeZ);
																if(X<MinX)
																	MinX=X;
																else
																if(X>MaxX)
																	MaxX=X;
																if(Y<MinY)
																	MinY=Y;
																else
																if(Y>MaxY)
																	MaxY=Y;
																if(Z<MinZ)
																	MinZ=Z;
															}
															if(MinX<0)
															{
																if(ROTATE==1)
																	return true;
																MinX=0;
															}
															if(MaxX>m_SizeX)
															{
																if(ROTATE==1)
																	return true;
																MaxX=m_SizeX;
															}
															if(MinY<0)
															{
																if(ROTATE==1)
																	return true;
																MinY=0;
															}
															if(MaxY>m_SizeY)
															{
																if(ROTATE==1)
																	return true;
																MaxY=m_SizeY;
															}
															if(ROTATE==2)
															{
																if(MinX>=m_SizeX ||	MinY>=m_SizeY || MaxX<0 ||	MaxX<0)
																	return true;
															}
															for(int64 y=MinY;y<MaxY;y++)
															for(int64 x=MinX;x<MaxX;x++)
																if(static_cast<int64>(m_ZBuffer[static_cast<int32>(x)+static_cast<int32>(y)*m_SizeX])>MinZ)
																	return true;
/*
															if(MinX<0)
																MinX=0;
															if(MaxX>m_SizeX-1)
																MaxX=m_SizeX-1;
															if(MinY<0)
																MinY=0;
															if(MaxY>m_SizeY-1)
																MaxY=m_SizeY-1;

															for(int64 x=MinX;x<MaxX;x++)
																if(m_ZBuffer[x+MinY*m_SizeX]<MinZ)
																	m_ZBuffer[x+MinY*m_SizeX]=MinZ;
															for(int64 x=MinX;x<MaxX;x++)
																if(m_ZBuffer[x+MaxY*m_SizeX]<MinZ)
																	m_ZBuffer[x+MaxY*m_SizeX]=MinZ;
															for(int64 y=MinY;y<MaxY;y++)
																if(m_ZBuffer[MinX+y*m_SizeX]<MinZ)
																	m_ZBuffer[MinX+y*m_SizeX]=MinZ;
															for(int64 y=MinY;y<MaxY;y++)
																if(m_ZBuffer[MaxX+y*m_SizeX]<MinZ)
																	m_ZBuffer[MaxX+y*m_SizeX]=MinZ;
*/
															return false;
														}



	// Box visibility tests, one per caller category plus a generic one. They are only
	// declared here. Each takes the box to test and an optional output pointer pResDest
	// that defaults to NULL.
	bool											IsBoxVisible_OCCLUDER(const AABB& objBox, uint32* const __restrict pResDest = NULL);
	bool											IsBoxVisible_OCEAN(const AABB& objBox, uint32* const __restrict pResDest = NULL);
	bool											IsBoxVisible_OCCELL(const AABB& objBox, uint32* const __restrict pResDest = NULL);
	bool											IsBoxVisible_OCCELL_OCCLUDER(const AABB& objBox, uint32* const __restrict pResDest = NULL);
	bool											IsBoxVisible_OBJECT(const AABB& objBox, uint32* const __restrict pResDest = NULL);
	bool											IsBoxVisible_OBJECT_TO_LIGHT(const AABB& objBox, uint32* const __restrict pResDest = NULL);
	bool											IsBoxVisible_TERRAIN_NODE(const AABB& objBox, uint32* const __restrict pResDest = NULL);
	bool											IsBoxVisible_PORTAL(const AABB& objBox, uint32* const __restrict pResDest = NULL);
	bool											IsBoxVisible(const AABB& objBox, uint32* const __restrict pResDest = NULL);


public:
#ifndef __SPU__
														CZBufferCuller();
														// Hands the depth buffer back to the aligned module allocator.
														~CZBufferCuller()
														{
															CryModuleMemalignFree(m_ZBuffer);
														}
#else
														// SPU-side constructor: copies the complete culler object found at
														// pPPUCuller over this instance and records the main-memory addresses
														// that the destructor needs for the write-back.
														CZBufferCuller(SPU_DOMAIN_MAIN const CZBufferCuller* pPPUCuller)
														{
															memtransfer_from_main(this, pPPUCuller, sizeof(*this), 0);
															// Complete the transfer before the copied members are read
															// (the argument presumably matches the 0 passed above - confirm).
															memtransfer_sync(0);
															// At this point m_ZBuffer still holds the value copied from the source object.
															m_ZBufferPPU = m_ZBuffer;
															m_pCullerPPU = (CZBufferCuller*) pPPUCuller;
														}
														// SPU-side destructor: writes the culler object and the local depth
														// buffer back to the addresses recorded by the constructor.
														~CZBufferCuller()
														{
															TZBZexel* pLocalZBuffer = m_ZBuffer;
															// Restore the recorded buffer address so the object that is written
															// back does not carry the local buffer pointer.
															m_ZBuffer = m_ZBufferPPU;
															memtransfer_to_main(m_pCullerPPU, this, sizeof(*this), 0);
															// SelRes() is defined elsewhere; presumably the buffer edge length - confirm.
															memtransfer_to_main(m_ZBufferPPU, pLocalZBuffer, sizeof(TZBZexel)*SelRes()*SelRes(), 0);
															memtransfer_sync(0);
														}
	// Redirects the culler to the depth buffer at pLocalZBuffer.
	void											ZBuffer(TZBZexel* pLocalZBuffer)
														{
															this->m_ZBuffer = pLocalZBuffer;
														}
#endif

	// start new frame (both functions are only declared here)
	void											BeginFrame(const CCamera& rCam);
	void											ReloadBuffer(const uint32 BufferID);

	// render into buffer
	// Both functions have empty bodies in this culler, so calling them does nothing.
	ILINE void								AddRenderMesh(IRenderMesh * pRM, Matrix34A* pTranRotMatrix, IMaterial * pMaterial, bool bOutdoorOnly, bool bCompletelyInFrustum,bool bNoCull){}
	ILINE void								AddHeightMap(const struct SRangeInfo & m_rangeInfo, float X1, float Y1, float X2, float Y2){}

	// test visibility
#if defined(XENON)
	// Permute tables used by IsObjectVisible() to blank out depth values that lie before
	// the start or after the end of the tested span inside an 8-zexel vector.
	// s_startPermuteMask is indexed with (x & 7), s_endPermuteMask with (span end & 7).
	static const XMVECTORU8	s_startPermuteMask[8];
	static const XMVECTORU8	s_endPermuteMask[8];

	// Xenon (VMX) box-versus-depth-buffer visibility test.
	//
	// Transforms the eight corners of objBox with m_MatViewProjT, derives the projected
	// minimum/maximum extents and compares the box's minimum depth with the depth values
	// stored inside that screen rectangle.
	//
	// ROTATETEST: 1     - report visible as soon as the projected x/y extents exceed
	//                     m_VMin/m_VMax,
	//             2     - report visible when the x/y extents lie completely outside
	//                     m_VMin/m_VMax,
	//             other - only clamp the extents.
	// Returns true when the box has to be treated as visible, false otherwise.
	template<uint32 ROTATETEST>
	ILINE bool								IsObjectVisible(const AABB& objBox)
	{
		//The majority of this code is a conversion of the PS3 vectorized code
		// Byte-permute patterns that replicate the y, z or w word of a vector.
		// Only qW is used in this function.
		static const	XMVECTORU8 qY			=	{0x04,0x05,0x06,0x07,0x04,0x05,0x06,0x07,0x04,0x05,0x06,0x07,0x04,0x05,0x06,0x07};
		static const	XMVECTORU8 qZ			=	{0x08,0x09,0x0A,0x0B,0x08,0x09,0x0A,0x0B,0x08,0x09,0x0A,0x0B,0x08,0x09,0x0A,0x0B};
		static const	XMVECTORU8 qW			=	{0x0C,0x0D,0x0E,0x0F,0x0C,0x0D,0x0E,0x0F,0x0C,0x0D,0x0E,0x0F,0x0C,0x0D,0x0E,0x0F};
		const XMVECTOR vZero = XMVectorZero();

		CryPrefetch(&m_MatViewProjT.m00);
		CryPrefetch(&m_MatViewProjT.m10);
		CryPrefetch(&m_MatViewProjT.m20);
		CryPrefetch(&m_MatViewProjT.m30);

		// Replicate each box coordinate across all four lanes.
		// NOTE(review): every load starts at a single float member. If
		// __loadunalignedvector reads a full 16 bytes, the loads near max.z reach past
		// the end of the AABB - confirm this is safe.
		const	XMVECTOR	MinX		=	XMVectorSplatX(__loadunalignedvector(&objBox.min.x));
		const	XMVECTOR	MinY		=	XMVectorSplatX(__loadunalignedvector(&objBox.min.y));
		const	XMVECTOR	MinZ		=	XMVectorSplatX(__loadunalignedvector(&objBox.min.z));
		const	XMVECTOR	MaxX		=	XMVectorSplatX(__loadunalignedvector(&objBox.max.x));
		const	XMVECTOR	MaxY		=	XMVectorSplatX(__loadunalignedvector(&objBox.max.y));
		const	XMVECTOR	MaxZ		=	XMVectorSplatX(__loadunalignedvector(&objBox.max.z));

		// Fetch the four rows of the matrix.
#ifdef XENON_INTRINSICS
		const XMVECTOR M0	=	reinterpret_cast<XMVECTOR*>(&m_MatViewProjT)[0];
		const XMVECTOR M1	=	reinterpret_cast<XMVECTOR*>(&m_MatViewProjT)[1];
		const XMVECTOR M2	=	reinterpret_cast<XMVECTOR*>(&m_MatViewProjT)[2];
		const XMVECTOR M3	=	reinterpret_cast<XMVECTOR*>(&m_MatViewProjT)[3];
#else
		const XMVECTOR M0	=	__loadunalignedvector(&m_MatViewProjT.m00);
		const XMVECTOR M1	=	__loadunalignedvector(&m_MatViewProjT.m10);
		const XMVECTOR M2	=	__loadunalignedvector(&m_MatViewProjT.m20);
		const XMVECTOR M3	=	__loadunalignedvector(&m_MatViewProjT.m30);
#endif

		// The eight box corners: Vn = M0*x + M1*y + M2*z + M3 for every min/max
		// combination. The partial ORs are interleaved with the transforms.
		XMVECTOR V0	=	XMVectorMultiplyAdd(M0,MinX,XMVectorMultiplyAdd(M1,MinY,XMVectorMultiplyAdd(M2,MinZ,M3)));
		XMVECTOR V1	=	XMVectorMultiplyAdd(M0,MinX,XMVectorMultiplyAdd(M1,MaxY,XMVectorMultiplyAdd(M2,MinZ,M3)));
		XMVECTOR V2	=	XMVectorMultiplyAdd(M0,MaxX,XMVectorMultiplyAdd(M1,MinY,XMVectorMultiplyAdd(M2,MinZ,M3)));
		XMVECTOR V3	=	XMVectorMultiplyAdd(M0,MaxX,XMVectorMultiplyAdd(M1,MaxY,XMVectorMultiplyAdd(M2,MinZ,M3)));
		const XMVECTOR V0or1 = __vor(V0,V1);
		XMVECTOR V4	=	XMVectorMultiplyAdd(M0,MinX,XMVectorMultiplyAdd(M1,MinY,XMVectorMultiplyAdd(M2,MaxZ,M3)));
		const XMVECTOR V2or3 = __vor(V2, V3);
		XMVECTOR V5	=	XMVectorMultiplyAdd(M0,MinX,XMVectorMultiplyAdd(M1,MaxY,XMVectorMultiplyAdd(M2,MaxZ,M3)));
		XMVECTOR V6	=	XMVectorMultiplyAdd(M0,MaxX,XMVectorMultiplyAdd(M1,MinY,XMVectorMultiplyAdd(M2,MaxZ,M3)));
		const XMVECTOR V4or5 = __vor(V4, V5);
		XMVECTOR V7	=	XMVectorMultiplyAdd(M0,MaxX,XMVectorMultiplyAdd(M1,MaxY,XMVectorMultiplyAdd(M2,MaxZ,M3)));

// 		XMVECTOR	Mask =__vor(
// 				__vor(__vor(V0,V1), __vor(V2,V3)),
// 				__vor(__vor(V4,V5), __vor(V6,V7)));

		// Bitwise OR of all eight corners with the w word replicated into every lane:
		// its sign bit is set when at least one corner has the sign bit set in w.
		XMVECTOR	Mask = XMVectorSplatW(__vor(
			__vor(V0or1, V2or3),
			__vor(V4or5, __vor(V6,V7))));

		// Replicate w of every corner; its reciprocal is used for the perspective divide.
		XMVECTOR V0W	=	__vperm(V0,V0,qW);
		XMVECTOR V1W	=	__vperm(V1,V1,qW);
		XMVECTOR V2W	=	__vperm(V2,V2,qW);
		XMVECTOR V3W	=	__vperm(V3,V3,qW);
		XMVECTOR V4W	=	__vperm(V4,V4,qW);
		XMVECTOR V5W	=	__vperm(V5,V5,qW);
		XMVECTOR V6W	=	__vperm(V6,V6,qW);
		XMVECTOR V7W	=	__vperm(V7,V7,qW);

#if 1
		//Use higher precision
		V0W = XMVectorReciprocal(V0W);
		V1W = XMVectorReciprocal(V1W);
		V2W = XMVectorReciprocal(V2W);
		V3W = XMVectorReciprocal(V3W);
		V4W = XMVectorReciprocal(V4W);
		V5W = XMVectorReciprocal(V5W);
		V6W = XMVectorReciprocal(V6W);
		V7W = XMVectorReciprocal(V7W);
#else
		V0W = __vrefp(V0W);
		V1W = __vrefp(V1W);
		V2W = __vrefp(V2W);
		V3W = __vrefp(V3W);
		V4W = __vrefp(V4W);
		V5W = __vrefp(V5W);
		V6W = __vrefp(V6W);
		V7W = __vrefp(V7W);
#endif

		// Perspective divide of every corner.
		V0	=	XMVectorMultiply(V0, V0W);
		V1	=	XMVectorMultiply(V1, V1W);
		V2	=	XMVectorMultiply(V2, V2W);
		V3	=	XMVectorMultiply(V3, V3W);
		V4	=	XMVectorMultiply(V4, V4W);
		V5	=	XMVectorMultiply(V5, V5W);
		V6	=	XMVectorMultiply(V6, V6W);
		V7	=	XMVectorMultiply(V7, V7W);

		// First level of the lane-wise min/max reduction over the projected corners.
		XMVECTOR MinV01	=	__vsel(V0,V1,__vcmpgtfp(V0,V1));
		XMVECTOR MinV23	=	__vsel(V2,V3,__vcmpgtfp(V2,V3));
		XMVECTOR MinV45	=	__vsel(V4,V5,__vcmpgtfp(V4,V5));
		XMVECTOR MinV67	=	__vsel(V6,V7,__vcmpgtfp(V6,V7));
		XMVECTOR MaxV01	=	__vsel(V0,V1,__vcmpgtfp(V1,V0));
		XMVECTOR MaxV23	=	__vsel(V2,V3,__vcmpgtfp(V3,V2));
		XMVECTOR MaxV45	=	__vsel(V4,V5,__vcmpgtfp(V5,V4));
		XMVECTOR MaxV67	=	__vsel(V6,V7,__vcmpgtfp(V7,V6));

		//The Vector -> int pipeline transfer is currently causing a load hit store
		//avoid possible nan

		// Signed integer compare 0 > Mask. All lanes of Mask hold the same word, so the
		// tested CR bit (presumably the "all lanes true" flag - confirm) fires when some
		// corner has the sign bit set in w; such a box is reported visible.
		unsigned int CR = 0;
		__vcmpgtswR(vZero, Mask, &CR);

		IF((CR & (1 << 7)), 0)
		{
			return true;
		}

		// Remaining levels of the min/max reduction.
		XMVECTOR MinV0123	=	__vsel(MinV01,MinV23,__vcmpgtfp(MinV01,MinV23));
		XMVECTOR MinV4567	=	__vsel(MinV45,MinV67,__vcmpgtfp(MinV45,MinV67));
		XMVECTOR MaxV0123	=	__vsel(MaxV01,MaxV23,__vcmpgtfp(MaxV23,MaxV01));
		XMVECTOR MaxV4567	=	__vsel(MaxV45,MaxV67,__vcmpgtfp(MaxV67,MaxV45));

		XMVECTOR MinV	=	__vsel(MinV0123,MinV4567,__vcmpgtfp(MinV0123,MinV4567));
		XMVECTOR MaxV	=	__vsel(MaxV0123,MaxV4567,__vcmpgtfp(MaxV4567,MaxV0123));

		// Lanes in which the extents fall below m_VMin or rise above m_VMax.
		XMVECTOR MinMinMask	=	__vcmpgtfp(m_VMin,MinV);
		XMVECTOR MaxMaxMask	=	__vcmpgtfp(MaxV,m_VMax);
		
		// Keeps only the x and y lanes of a comparison mask.
		static const XMVECTORU32 VMaskOffZW = {0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000};

		if(ROTATETEST==1)
		{
			// Visible if the x or y extent exceeds the limits. CR bit 5 is read as
			// "no lane greater", following the convention described in the scan loop below.
			XMVECTOR comp = __vand(__vor(MinMinMask,MaxMaxMask), VMaskOffZW);
			CR = 0;
			__vcmpgtuwR(comp, vZero, &CR);
		
			if(!(CR & (1 << 5)))
				return true;
		}

		if(ROTATETEST==2)
		{
			// Visible if the box lies completely outside the limits in x or y
			// (maximum below m_VMin or minimum above m_VMax).
			XMVECTOR _MinMinMask		=	__vcmpgtfp(m_VMin,MaxV);
			XMVECTOR _MaxMaxMask		=	__vcmpgtfp(MinV,m_VMax);
			
			XMVECTOR comp = __vand(__vor(_MinMinMask,_MaxMaxMask), VMaskOffZW);
			CR = 0;
			__vcmpgtuwR(comp, vZero, &CR);

			if(!(CR & (1 << 5)))
				return true;
		}

		// Clamp the extents to the limits.
		MinV	=	__vsel(MinV,m_VMin,MinMinMask);
		MaxV	=	__vsel(MaxV,m_VMax,MaxMaxMask);

		// Convert to unsigned integers: qMinZ with a scale of 2^16, MinV/MaxV unscaled.
		XMVECTOR	qMinZ		=	__vctuxs(MinV,16);

		MinV	=	__vctuxs(MinV,0);
		MaxV	=	__vctuxs(MaxV,0);

		// Store the vectors to the stack so the lanes can be read as scalar integers.
		XMVECTOR MinV_STACK, MaxV_STACK;

		__stvx_volatile(MinV, (volatile XMVECTOR *)&MinV_STACK, 0);
		__stvx_volatile(MaxV, (volatile XMVECTOR *)&MaxV_STACK, 0);
	
		const uint32	Maxy	=	XMVectorGetIntY(MaxV_STACK);
		const uint32	Maxx	=	XMVectorGetIntX(MaxV_STACK);
		const uint32	Minx	=	XMVectorGetIntX(MinV_STACK);
		const uint32	Miny	=	XMVectorGetIntY(MinV_STACK);
		
		const int32 startOffset = (Miny * m_SizeX) + Minx;
			
		int32 xDiff = Maxx - Minx;

		CryPrefetch(&m_ZBuffer[startOffset]);
		CryPrefetch(&m_ZBuffer[startOffset + 8]);

		// A rectangle of zero width covers no zexel and is reported as not visible.
		if(xDiff == 0)
			return false;
		else if(xDiff >= 4)
		{
			//The below is all old code, but potentially useful for debugging purposes.
			//uint16 MinZ16 = ((uint16)MAX(Minz, 0)); //We can occasionally end up with large negative numbers as the MinZ
			//Splat the Minimum Z value into all 8 elements of an 8 x u16 vector
			//XMVECTOR vMinZ = __vor(__lvlx(&MinZ16, 0), __lvrx(&MinZ16,16));
			//XMVECTOR vMinZSplat = __vsplth(vMinZ, 0);

			//NOTE: This is currently hardcoded as a bias as 1, ignoring the coverage buffer bias console variable. This is for
			//			performance reasons
			static const XMVECTORU8 vBias = {0x0, 0x1, 0x0, 0x01, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1};

			// Halfword 5 of qMinZ (presumably the low 16 bits of the z lane - confirm) is
			// replicated into all eight halfwords.
			// NOTE(review): XMVectorSubtract is normally a float-lane subtraction, while
			// both operands hold packed 16-bit integers here - confirm that the bias is
			// applied as intended.
			XMVECTOR vMinZSplat = XMVectorSubtract(__vsplth(qMinZ, 5), vBias);

			//Iterate over all lines
			for(uint32 y=Miny;y<Maxy;y++)
			{
				uint32 x=Minx;

				// Offset of the first zexel of row y.
				int yOffset = static_cast<int32>(y * m_SizeX);

				//Prefetch the start of the next line
				CryPrefetch(&m_ZBuffer[yOffset + m_SizeX + Minx]);

				//This should ensure that after the first iteration we are 16-byte aligned
				for(; x < Maxx; x = (x & ~(0x7)) + 8)
				{
					// Position of x inside its group of 8 zexels.
					const int startLookupIndex	= x & 0x7;

					const int index = static_cast<int32>(x) + yOffset;

					// First x of the 8-zexel group that contains x.
					const int alignedStartIndex = x - startLookupIndex;
					//This will be an aligned load, which is fine - we'll mask any problem areas off

					XMVECTOR vZBuffer = __lvx(&m_ZBuffer[index], 0);

					CryPrefetch(&m_ZBuffer[index + 8]);

					const XMVECTOR startPermuteMask = s_startPermuteMask[startLookupIndex];

					const int endLookupIndex = (min_branchless(Maxx, alignedStartIndex + 8) & 0x7);	//Value is from 0 to 7

					// Replace the zexels before the span start with zero.
					vZBuffer = __vperm(vZBuffer, vZero, startPermuteMask);

					const XMVECTOR endPermuteMask		= s_endPermuteMask[endLookupIndex];																	

					// Replace the zexels after the span end with zero.
					vZBuffer = __vperm(vZero, vZBuffer, endPermuteMask);

					CR = 0;
					__vcmpgtuhR(vZBuffer, vMinZSplat, &CR);

					//bit 5 on the CR is set if all of vZBuffer are less than or equal to vMinZSplat
					//  so the branch is taken if any of the vZBuffer values are greater than vMinZSplat
					if(!(CR & (1 << 5)))
					{
						return true;
					}
				}
			}
		}
		else
		{
			//If there are a low number of comparisons to be carried out, the overhead of the vector code
			//	is not worth it, so just use the scalar code

			// Unlike the vector path, this path applies m_Bias and uses the full 32-bit z lane.
			const uint32	UMinz	=	XMVectorGetIntZ(qMinZ);
			int32	Minz	=	((int)UMinz) - m_Bias;

			for(int64 y=Miny;y<Maxy;y++)
			{
				int32 yOffset = static_cast<int32>(y)*m_SizeX;

				// Prefetch the start of the next row.
				int32 prefetchOffset = yOffset + m_SizeX + Minx;
				
				CryPrefetch(&m_ZBuffer[prefetchOffset]);
				//CryPrefetch(&m_ZBuffer[prefetchOffset + xDiff]);
				CryPrefetch(&m_ZBuffer[prefetchOffset + 8]);

				for(int64 x=Minx;x<Maxx;x++)
				{
					int index = static_cast<int32>(x) + yOffset;
					CryPrefetch(&m_ZBuffer[index+16]);
					// Visible as soon as one stored depth is greater than the biased minimum depth.
					if(static_cast<int64>(m_ZBuffer[index])>Minz)
						return true;
				}
			}
		}		

		return false;
	}
#endif



#if defined(PS3)
	// Packs the sign bits of the four 32-bit lanes of V into one small integer
	// (callers compare the result with 0 and with 15).
	ILINE	uint32							VSignMask(qword V)
														{
															// Shifting every lane right by 31 (arithmetic) leaves the sign in the
															// lowest bit, which is the bit the gather collects. Gathering straight
															// from V would pick up bit 0 of each lane instead.
															const qword SignFill	=	__si_rotmai(V,-31);
															const qword Gathered	=	__si_gb(SignFill);
															return __si_to_int(Gathered);
														}

	// Horizontal add of the 16-bit elements of V: three shuffle+add steps fold the
	// vector onto its first halfword, which is then extracted from the first word.
	ILINE	uint32							VAccumulate16(qword V)
														{
															// Step 1 brings bytes 8..15 down, step 2 replicates bytes 4..7,
															// step 3 replicates bytes 2..3.
															const qword FoldUpperHalf	= (qword){0x08,0x09,0x0A,0x0B, 0x0C,0x0D,0x0E,0x0F, 0x08,0x09,0x0A,0x0B, 0x0C,0x0D,0x0E,0x0F};
															const qword FoldSecondWord	= (qword){0x04,0x05,0x06,0x07, 0x04,0x05,0x06,0x07, 0x04,0x05,0x06,0x07, 0x04,0x05,0x06,0x07};
															const qword FoldSecondHalf	= (qword){0x02,0x03,0x02,0x03, 0x02,0x03,0x02,0x03, 0x02,0x03,0x02,0x03, 0x02,0x03,0x02,0x03};

															qword Sum	=	__si_ah(V,__si_shufb(V,V,FoldUpperHalf));
															Sum				=	__si_ah(Sum,__si_shufb(Sum,Sum,FoldSecondWord));
															Sum				=	__si_ah(Sum,__si_shufb(Sum,Sum,FoldSecondHalf));

															// The total sits in the upper 16 bits of the first word.
															return __si_to_int(Sum)>>16;
														}
	// Lane-wise a/b, computed as a multiplied by a reciprocal estimate of b.
	// The __SPU__ build adds one refinement step to the estimate.
	ILINE qword								VDiv(qword a,qword b)
														{
#ifdef __SPU__
															const qword Estimate	=	__si_fi(b,__si_frest(b));
															// Residual = 0x3F80 (upper halfword constant) - b*Estimate,
															// refined reciprocal = Estimate + Residual*Estimate.
															const qword Residual	=	__si_fnms(b,Estimate,si_ilhu(0x3F80));
															const qword InvB			=	__si_fma(Residual,Estimate,Estimate);
#else
															const qword InvB			=	__si_fi(b,__si_frest(b));
#endif
															return __si_fm(a,InvB);
														}

	// PS3 triangle test against the depth buffer.
	//
	// Walks the whole buffer in 16x16 tiles and, for the zexels covered by the triangle
	// V0,V1,V2 (homogeneous vertices), compares the interpolated depth with the stored one.
	// Returns a vector of 16-bit accumulators updated from the per-zexel
	// "stored depth > triangle depth" masks (see the note at the accumulation line).
	// An all-zero vector is returned right away when VSignMask(Det4) is non-zero.
	qword											Rasterize(const qword V0,const qword V1,const qword V2)
														{
															// Byte-shuffle patterns. The name lists the source word of every result
															// lane; indices from 0x10 upwards select from the second operand.
															const qword xxxx 				= (qword){0x00,0x01,0x02,0x03, 0x00,0x01,0x02,0x03, 0x10,0x11,0x12,0x13, 0x10,0x11,0x12,0x13};
															const qword yyyy 				= (qword){0x04,0x05,0x06,0x07, 0x04,0x05,0x06,0x07, 0x14,0x15,0x16,0x17, 0x14,0x15,0x16,0x17};
															const qword zzzz 				= (qword){0x08,0x09,0x0A,0x0B, 0x08,0x09,0x0A,0x0B, 0x18,0x19,0x1A,0x1B, 0x18,0x19,0x1A,0x1B};
															const qword wwww 				= (qword){0x0C,0x0D,0x0E,0x0F, 0x0C,0x0D,0x0E,0x0F, 0x1C,0x1D,0x1E,0x1F, 0x1C,0x1D,0x1E,0x1F};
															const qword zwzw 				= (qword){0x08,0x09,0x0A,0x0B, 0x0C,0x0D,0x0E,0x0F, 0x18,0x19,0x1A,0x1B, 0x1C,0x1D,0x1E,0x1F};
															const qword ywxx 				= (qword){0x04,0x05,0x06,0x07, 0x0C,0x0D,0x0E,0x0F, 0x10,0x11,0x12,0x13, 0x10,0x11,0x12,0x13};
															const qword xyww 				= (qword){0x00,0x01,0x02,0x03, 0x04,0x05,0x06,0x07, 0x1C,0x1D,0x1E,0x1F, 0x1C,0x1D,0x1E,0x1F};
															const qword xzww 				= (qword){0x00,0x01,0x02,0x03, 0x08,0x09,0x0A,0x0B, 0x1C,0x1D,0x1E,0x1F, 0x1C,0x1D,0x1E,0x1F};
															const qword wxyy 				= (qword){0x0C,0x0D,0x0E,0x0F, 0x00,0x01,0x02,0x03, 0x14,0x15,0x16,0x17, 0x14,0x15,0x16,0x17};
															const	qword qSPLAT			=	(qword){0x00,0x01,0x02,0x03, 0x00,0x01,0x02,0x03, 0x00,0x01,0x02,0x03, 0x00,0x01,0x02,0x03};
															//const	qword ZInterleave	=	(qword){0x02,0x03,0x06,0x07, 0x0A,0x0B,0x0E,0x0F, 0x12,0x13,0x16,0x17, 0x1A,0x1B,0x1E,0x1F};
															// Picks bytes 0 and 1 of every word of both operands (8 halfwords in total).
															const	qword ZInterleave	=	(qword){0x00,0x01,0x04,0x05, 0x08,0x09,0x0C,0x0D, 0x10,0x11,0x14,0x15, 0x18,0x19,0x1C,0x1D};
															const qword qFFFF				=	(qword)(vec_uint4){~0,~0,~0,~0};

															// Sum of three products built from the x, y and w components of the
															// vertices (presumably the determinant of the (x,y,w) vertex matrix,
															// i.e. the triangle orientation - confirm).
															qword Det4	=	__si_fm(__si_shufb(V0,V0,xyww),__si_fms(__si_shufb(V2,V2,wxyy),__si_shufb(V1,V1,ywxx),__si_fm(__si_shufb(V1,V1,wxyy),__si_shufb(V2,V2,ywxx))));
															Det4			=	__si_fa(__si_fa(__si_shufb(Det4,Det4,xxxx),__si_shufb(Det4,Det4,yyyy)),__si_shufb(Det4,Det4,zzzz));

															// Nothing is counted when any lane of Det4 has its sign bit set.
															qword Count				=	(qword)(vec_int4){0,0,0,0};
															if(VSignMask(Det4)!=0)
																return Count;

															// The scan always covers the complete buffer.
															int32 MinX	=	0;
															int32 MinY	=	0;
															int32 MaxX	=	m_SizeX;
															int32 MaxY	=	m_SizeY;


															// Vywxx, Vwxyy and Vxzww are not referenced again in this function.
															const	qword	Vywxx		=	__si_shufb(V1,V1,ywxx);
															const	qword	Vwxyy		=	__si_shufb(V1,V1,wxyy);
															const	qword	Vxzww		=	__si_shufb(V1,V1,xzww);
															// Coefficient vectors of the three per-zexel functions evaluated below as
															// (x coefficient)*X + (y coefficient)*Y + (z coefficient).
															const	qword	InvMV0	=	__si_fms(__si_shufb(V1,V1,ywxx),__si_shufb(V2,V2,wxyy),__si_fm(__si_shufb(V1,V1,wxyy),__si_shufb(V2,V2,ywxx)));
															const	qword	InvMV1	=	__si_fms(__si_shufb(V0,V0,wxyy),__si_shufb(V2,V2,ywxx),__si_fm(__si_shufb(V0,V0,ywxx),__si_shufb(V2,V2,wxyy)));
															const	qword	InvMV2	=	__si_fms(__si_shufb(V0,V0,ywxx),__si_shufb(V1,V1,wxyy),__si_fm(__si_shufb(V0,V0,wxyy),__si_shufb(V1,V1,ywxx)));

															const	qword	V0x	=	__si_shufb(InvMV0,InvMV0,xxxx);
															const	qword	V0y	=	__si_shufb(InvMV0,InvMV0,yyyy);
															const	qword	V0z	=	__si_shufb(InvMV0,InvMV0,zzzz);
															const	qword	V1x	=	__si_shufb(InvMV1,InvMV1,xxxx);
															const	qword	V1y	=	__si_shufb(InvMV1,InvMV1,yyyy);
															const	qword	V1z	=	__si_shufb(InvMV1,InvMV1,zzzz);
															const	qword	V2x	=	__si_shufb(InvMV2,InvMV2,xxxx);
															const	qword	V2y	=	__si_shufb(InvMV2,InvMV2,yyyy);
															const	qword	V2z	=	__si_shufb(InvMV2,InvMV2,zzzz);

															// z and w of every vertex, replicated across the lanes.
															const	qword	Z0	=	__si_shufb(V0,V0,zzzz);
															const	qword	Z1	=	__si_shufb(V1,V1,zzzz);
															const	qword	Z2	=	__si_shufb(V2,V2,zzzz);
															const	qword	W0	=	__si_shufb(V0,V0,wwww);
															const	qword	W1	=	__si_shufb(V1,V1,wwww);
															const	qword	W2	=	__si_shufb(V2,V2,wwww);


															// Tile edge length and the lane offsets of the four tile corners / of
															// eight consecutive zexels.
															const uint32 STEPi		=	16;
															const float STEPf			=	static_cast<float>(STEPi)-1.f;
															const qword SWIZZLEY	=	(qword)(vec_float4){0.f,STEPf,0.f,STEPf};
															const qword SWIZZLEX	=	(qword)(vec_float4){0.f,0.f,STEPf,STEPf};
															const qword SWIZZLEx0	=	(qword)(vec_float4){0.f,1.f,2.f,3.f};
															const qword SWIZZLEx1	=	(qword)(vec_float4){4.f,5.f,6.f,7.f};

															for(int32 Y=MinY;Y<MaxY;Y+=STEPi)
															{
																// NOTE(review): qSPLAT is a shuffle pattern but is added here (and in
																// the X4 line below) as float data - looks unintended, confirm.
																const qword VY		=	__si_from_float(static_cast<float>(Y));
																const qword	Y4		=	__si_fa(__si_fa(__si_shufb(VY,VY,xxxx),qSPLAT),SWIZZLEY);
																const qword	TV0y	=	__si_fma(V0y,Y4,V0z);
																const qword	TV1y	=	__si_fma(V1y,Y4,V1z);
																const qword	TV2y	=	__si_fma(V2y,Y4,V2z);
																for(int32 X=MinX;X<MaxX;X+=STEPi)
																{
																	const qword VX		=	__si_from_float(static_cast<float>(X));
																	const qword	X4		=	__si_fa(__si_fa(__si_shufb(VX,VX,xxxx),qSPLAT),SWIZZLEX);
																	const qword	PX		=	__si_fma(V0x,X4,TV0y);
																	const qword	PY		=	__si_fma(V1x,X4,TV1y);
																	const qword	PZ		=	__si_fma(V2x,X4,TV2y);
																	// Skip the tile when one of the three functions is negative at all
																	// four tile corners (bitwise | is used on the comparison results).
																	if((VSignMask(PX)==15 | VSignMask(PY)==15) | (VSignMask(PZ)==15))
																		continue;
																	for(int32 y=Y,YE=Y+STEPi;y<YE;y++)
																	{
																		// These locals shadow the tile-level VY/TV0y/TV1y/TV2y above.
																		const qword VY		=	__si_from_float(static_cast<float>(y));
																		const qword	y4		=	__si_shufb(VY,VY,qSPLAT);
																		const qword	TV0y	=	__si_fma(V0y,y4,V0z);
																		const qword	TV1y	=	__si_fma(V1y,y4,V1z);
																		const qword	TV2y	=	__si_fma(V2y,y4,V2z);

																		// Eight zexels per iteration: lanes of *0 are x..x+3, lanes of *1 are x+4..x+7.
																		for(int32 x=X,XE=X+STEPi;x<XE;x+=8)
																		{
																			const qword VX				=	__si_from_float(static_cast<float>(x));
																			const qword x4				=	__si_shufb(VX,VX,qSPLAT);
																			const qword	X0				=	__si_fa(x4,SWIZZLEx0);
																			const qword	X1				=	__si_fa(x4,SWIZZLEx1);
																			const qword	Px0				=	__si_fma(V0x,X0,TV0y);
																			const qword	Py0				=	__si_fma(V1x,X0,TV1y);
																			const qword	Pz0				=	__si_fma(V2x,X0,TV2y);
																			const qword	Px1				=	__si_fma(V0x,X1,TV0y);
																			const qword	Py1				=	__si_fma(V1x,X1,TV1y);
																			const qword	Pz1				=	__si_fma(V2x,X1,TV2y);
																			// The sign bit of a lane is set when any of the three functions is negative there.
																			qword 			Mask0			=	__si_or(__si_or(Px0,Py0),Pz0);
																			qword 			Mask1			=	__si_or(__si_or(Px1,Py1),Pz1);
																			// Skip when that is the case for all eight zexels.
																			IF((VSignMask(__si_and(Mask0,Mask1)))==15,0)
																				continue;
																			// Turn the sign bits into full-lane masks and invert them:
																			// all ones where none of the three functions is negative.
																			Mask0	=	__si_rotmai(Mask0,-31);
																			Mask1	=	__si_rotmai(Mask1,-31);
																			Mask0	=	__si_xor(Mask0,qFFFF);
																			Mask1	=	__si_xor(Mask1,qFFFF);

																			// The eight stored depth values are read as one vector.
																			// NOTE(review): this presumably relies on the buffer and the
																			// row pitch being 16-byte aligned - confirm.
																			volatile int32	Idx		=	x+y*m_SizeX;
																			qword& ZBuf	=	*reinterpret_cast<qword*>(&m_ZBuffer[Idx]);
																			// Interpolated w and z/w per zexel.
																			volatile qword	w0		=	__si_fma(W0,Px0,__si_fma(W1,Py0,__si_fm(W2,Pz0)));
																			volatile qword	z0		=	VDiv(__si_fma(Z0,Px0,__si_fma(Z1,Py0,__si_fm(Z2,Pz0))),w0);
																			volatile qword	w1		=	__si_fma(W0,Px1,__si_fma(W1,Py1,__si_fm(W2,Pz1)));
																			volatile qword	z1		=	VDiv(__si_fma(Z0,Px1,__si_fma(Z1,Py1,__si_fm(Z2,Pz1))),w1);

																			// Convert to unsigned fixed point and keep the upper 16 bits of
																			// every lane, giving eight 16-bit depth values.
																			volatile qword zi0		=	__si_cfltu(z0,32u);
																			volatile qword zi1		=	__si_cfltu(z1,32u);
																			volatile qword zi		=	__si_shufb(zi0,zi1,ZInterleave);

																			// Halfword mask: stored depth greater than the triangle depth,
																			// restricted to the zexels selected by Mask0/Mask1.
																			qword Mask	=	__si_shufb(Mask0,Mask1,ZInterleave);
																			qword ZMask	=	__si_clgth(ZBuf,zi);
																			ZMask	=	__si_and(ZMask,Mask);
																			// NOTE(review): the SPU sfh instruction computes (second operand -
																			// first operand). If __si_sfh follows that, this evaluates
																			// ZMask - Count rather than Count - ZMask - confirm the
																			// accumulation is as intended.
																			Count	=	__si_sfh(Count,ZMask);
																		}
																	}
																}
															}
															return Count;
														}











	template<uint32 ROTATETEST>
	ILINE bool								IsObjectVisible2(AABB objBox)
														{
															
															/*const float BoxExtendX	=	(objBox.max.x-objBox.min.x)*0.1f+0.1f;
															const float BoxExtendY	=	(objBox.max.y-objBox.min.y)*0.1f+0.1f;
															const float BoxExtendZ	=	(objBox.max.z-objBox.min.z)*0.1f+0.1f;
															objBox.min.x	-=	BoxExtendX;
															objBox.min.y	-=	BoxExtendY;
															objBox.min.z	-=	BoxExtendZ;
															objBox.max.x	+=	BoxExtendX;
															objBox.max.y	+=	BoxExtendY;
															objBox.max.z	+=	BoxExtendZ;
															*/

															if(	m_Position.x>=objBox.min.x && m_Position.x<=objBox.max.x &&
																	m_Position.y>=objBox.min.y && m_Position.x<=objBox.max.y &&
																	m_Position.z>=objBox.min.z && m_Position.x<=objBox.max.z)
																	return true;

															const	qword qSPLAT	=	{0x00,0x01,0x02,0x03,0x00,0x01,0x02,0x03,0x00,0x01,0x02,0x03,0x00,0x01,0x02,0x03};
															const	qword qY			=	{0x04,0x05,0x06,0x07,0x04,0x05,0x06,0x07,0x04,0x05,0x06,0x07,0x04,0x05,0x06,0x07};
															const	qword qZ			=	{0x08,0x09,0x0A,0x0B,0x08,0x09,0x0A,0x0B,0x08,0x09,0x0A,0x0B,0x08,0x09,0x0A,0x0B};
															const	qword qW			=	{0x0C,0x0D,0x0E,0x0F,0x0C,0x0D,0x0E,0x0F,0x0C,0x0D,0x0E,0x0F,0x0C,0x0D,0x0E,0x0F};

															const	qword	MinX		=	__si_shufb(__si_from_float(objBox.min.x),__si_from_float(objBox.min.x),qSPLAT);
															const	qword	MinY		=	__si_shufb(__si_from_float(objBox.min.y),__si_from_float(objBox.min.y),qSPLAT);
															const	qword	MinZ		=	__si_shufb(__si_from_float(objBox.min.z),__si_from_float(objBox.min.z),qSPLAT);
															const	qword	MaxX		=	__si_shufb(__si_from_float(objBox.max.x),__si_from_float(objBox.max.x),qSPLAT);
															const	qword	MaxY		=	__si_shufb(__si_from_float(objBox.max.y),__si_from_float(objBox.max.y),qSPLAT);
															const	qword	MaxZ		=	__si_shufb(__si_from_float(objBox.max.z),__si_from_float(objBox.max.z),qSPLAT);

															const qword M0			=	reinterpret_cast<qword*>(&m_MatViewProjT)[0];
															const qword M1			=	reinterpret_cast<qword*>(&m_MatViewProjT)[1];
															const qword M2			=	reinterpret_cast<qword*>(&m_MatViewProjT)[2];
															const qword M3			=	reinterpret_cast<qword*>(&m_MatViewProjT)[3];


															const qword Vert0	=	__si_fma(M0,MinX,__si_fma(M1,MinY,__si_fma(M2,MinZ,M3)));
															const qword Vert1	=	__si_fma(M0,MinX,__si_fma(M1,MaxY,__si_fma(M2,MinZ,M3)));
															const qword Vert2	=	__si_fma(M0,MaxX,__si_fma(M1,MinY,__si_fma(M2,MinZ,M3)));
															const qword Vert3	=	__si_fma(M0,MaxX,__si_fma(M1,MaxY,__si_fma(M2,MinZ,M3)));
															const qword Vert4	=	__si_fma(M0,MinX,__si_fma(M1,MinY,__si_fma(M2,MaxZ,M3)));
															const qword Vert5	=	__si_fma(M0,MinX,__si_fma(M1,MaxY,__si_fma(M2,MaxZ,M3)));
															const qword Vert6	=	__si_fma(M0,MaxX,__si_fma(M1,MinY,__si_fma(M2,MaxZ,M3)));
															const qword Vert7	=	__si_fma(M0,MaxX,__si_fma(M1,MaxY,__si_fma(M2,MaxZ,M3)));

															const qword	Mask	=	__si_or(__si_or(__si_or(Vert0,Vert1),
																																	__si_or(Vert2,Vert3)),
																													__si_or(__si_or(Vert4,Vert5),
																																	__si_or(Vert6,Vert7)));

															const int NearestW      =     __si_to_int(__si_shufb(Mask,Mask,qW));
															IF(NearestW<0,0)
																return true;

															if(ROTATETEST)
															{
																const qword V0W	=	__si_shufb(Vert0,Vert0,qW);
																const qword V1W	=	__si_shufb(Vert1,Vert1,qW);
																const qword V2W	=	__si_shufb(Vert2,Vert2,qW);
																const qword V3W	=	__si_shufb(Vert3,Vert3,qW);
																const qword V4W	=	__si_shufb(Vert4,Vert4,qW);
																const qword V5W	=	__si_shufb(Vert5,Vert5,qW);
																const qword V6W	=	__si_shufb(Vert6,Vert6,qW);
																const qword V7W	=	__si_shufb(Vert7,Vert7,qW);

																const qword V0	=	__si_fm(Vert0,__si_fi(V0W,__si_frest(V0W)));
																const qword V1	=	__si_fm(Vert1,__si_fi(V1W,__si_frest(V1W)));
																const qword V2	=	__si_fm(Vert2,__si_fi(V2W,__si_frest(V2W)));
																const qword V3	=	__si_fm(Vert3,__si_fi(V3W,__si_frest(V3W)));
																const qword V4	=	__si_fm(Vert4,__si_fi(V4W,__si_frest(V4W)));
																const qword V5	=	__si_fm(Vert5,__si_fi(V5W,__si_frest(V5W)));
																const qword V6	=	__si_fm(Vert6,__si_fi(V6W,__si_frest(V6W)));
																const qword V7	=	__si_fm(Vert7,__si_fi(V7W,__si_frest(V7W)));
																const qword MinV01	=	__si_selb(V0,V1,__si_fcgt(V0,V1));
																const qword MinV23	=	__si_selb(V2,V3,__si_fcgt(V2,V3));
																const qword MinV45	=	__si_selb(V4,V5,__si_fcgt(V4,V5));
																const qword MinV67	=	__si_selb(V6,V7,__si_fcgt(V6,V7));
																const qword MaxV01	=	__si_selb(V0,V1,__si_fcgt(V1,V0));
																const qword MaxV23	=	__si_selb(V2,V3,__si_fcgt(V3,V2));
																const qword MaxV45	=	__si_selb(V4,V5,__si_fcgt(V5,V4));
																const qword MaxV67	=	__si_selb(V6,V7,__si_fcgt(V7,V6));

																const qword MinV0123	=	__si_selb(MinV01,MinV23,__si_fcgt(MinV01,MinV23));
																const qword MinV4567	=	__si_selb(MinV45,MinV67,__si_fcgt(MinV45,MinV67));
																const qword MaxV0123	=	__si_selb(MaxV01,MaxV23,__si_fcgt(MaxV23,MaxV01));
																const qword MaxV4567	=	__si_selb(MaxV45,MaxV67,__si_fcgt(MaxV67,MaxV45));

																const qword MinV	=	__si_selb(MinV0123,MinV4567,__si_fcgt(MinV0123,MinV4567));
																const qword MaxV	=	__si_selb(MaxV0123,MaxV4567,__si_fcgt(MaxV4567,MaxV0123));

																const qword MinMinMask	=	__si_fcgt(m_VMin,MinV);
																const qword MaxMaxMask	=	__si_fcgt(MaxV,m_VMax);

																if(ROTATETEST==1)
																{
																	int BoundMask			=	__si_to_int(__si_gb(__si_or(MinMinMask,MaxMaxMask)));
																	if((BoundMask>>2)!=0)
																		return true;
																}
																if(ROTATETEST==2)
																{
																	qword MinMinMask	=	__si_fcgt(m_VMin,MaxV);
																	qword MaxMaxMask	=	__si_fcgt(MinV,m_VMax);
																	int BoundMask			=	__si_to_int(__si_gb(__si_or(MinMinMask,MaxMaxMask)));
																	if((BoundMask>>2)!=0)
																		return true;
																}
															}



															qword qCount;
															qCount =								Rasterize(Vert2,Vert3,Vert7);
															if(VAccumulate16(qCount)>m_Treshold)
																return true;
															qCount = __si_ah(qCount,Rasterize(Vert2,Vert7,Vert6));
															if(VAccumulate16(qCount)>m_Treshold)
																return true;
															qCount = __si_ah(qCount,Rasterize(Vert1,Vert0,Vert4));
															if(VAccumulate16(qCount)>m_Treshold)
																return true;
															qCount = __si_ah(qCount,Rasterize(Vert1,Vert4,Vert5));
															if(VAccumulate16(qCount)>m_Treshold)
																return true;
															qCount = __si_ah(qCount,Rasterize(Vert3,Vert1,Vert5));
															if(VAccumulate16(qCount)>m_Treshold)
																return true;
															qCount = __si_ah(qCount,Rasterize(Vert3,Vert5,Vert7));
															if(VAccumulate16(qCount)>m_Treshold)
																return true;
															qCount = __si_ah(qCount,Rasterize(Vert0,Vert2,Vert6));
															if(VAccumulate16(qCount)>m_Treshold)
																return true;
															qCount = __si_ah(qCount,Rasterize(Vert0,Vert6,Vert4));
															if(VAccumulate16(qCount)>m_Treshold)
																return true;
															qCount = __si_ah(qCount,Rasterize(Vert5,Vert4,Vert6));
															if(VAccumulate16(qCount)>m_Treshold)
																return true;
															qCount = __si_ah(qCount,Rasterize(Vert5,Vert6,Vert7));
															if(VAccumulate16(qCount)>m_Treshold)
																return true;
															qCount = __si_ah(qCount,Rasterize(Vert0,Vert1,Vert3));
															if(VAccumulate16(qCount)>m_Treshold)
																return true;
															qCount = __si_ah(qCount,Rasterize(Vert0,Vert3,Vert2));
															return VAccumulate16(qCount)>0?true:false;
														}

	// Tests an axis-aligned box against the depth buffer m_ZBuffer (PS3 vector path).
	// The eight box corners are transformed with the four rows of m_MatViewProjT, divided by
	// their w component and reduced to a per-component minimum/maximum. That range is clamped
	// to m_VMin/m_VMax and the zexels inside the resulting rectangle are compared against the
	// minimum z of the box.
	// ROTATETEST adds an early-out on the unclamped bounds:
	//   1 - return true when the bounds exceed m_VMin/m_VMax in one of the tested lanes
	//   2 - return true when the bounds lie completely beyond m_VMin/m_VMax in one of the tested lanes
	//   any other value - no additional test
	// Returns true when the box must be treated as visible, false when the rectangle has zero
	// width or no covered zexel is greater than the box's minimum z.
	// NOTE(review): comments on the __si_* calls assume they mirror the SPU si_* intrinsics of
	// the same name - confirm against the platform mapping header.
	template<uint32 ROTATETEST>
	ILINE bool								IsObjectVisible(const AABB& objBox)
														{
															// byte patterns passed to __si_shufb: each one repeats the byte indices of a single
															// 32-bit lane (bytes 0-3, 4-7, 8-11, 12-15) for all four words
															const	qword qSPLAT	=	{0x00,0x01,0x02,0x03,0x00,0x01,0x02,0x03,0x00,0x01,0x02,0x03,0x00,0x01,0x02,0x03};
															const	qword qY			=	{0x04,0x05,0x06,0x07,0x04,0x05,0x06,0x07,0x04,0x05,0x06,0x07,0x04,0x05,0x06,0x07};
															const	qword qZ			=	{0x08,0x09,0x0A,0x0B,0x08,0x09,0x0A,0x0B,0x08,0x09,0x0A,0x0B,0x08,0x09,0x0A,0x0B};
															const	qword qW			=	{0x0C,0x0D,0x0E,0x0F,0x0C,0x0D,0x0E,0x0F,0x0C,0x0D,0x0E,0x0F,0x0C,0x0D,0x0E,0x0F};

															// each scalar bound of the box, shuffled with the qSPLAT pattern
															const	qword	MinX		=	__si_shufb(__si_from_float(objBox.min.x),__si_from_float(objBox.min.x),qSPLAT);
															const	qword	MinY		=	__si_shufb(__si_from_float(objBox.min.y),__si_from_float(objBox.min.y),qSPLAT);
															const	qword	MinZ		=	__si_shufb(__si_from_float(objBox.min.z),__si_from_float(objBox.min.z),qSPLAT);
															const	qword	MaxX		=	__si_shufb(__si_from_float(objBox.max.x),__si_from_float(objBox.max.x),qSPLAT);
															const	qword	MaxY		=	__si_shufb(__si_from_float(objBox.max.y),__si_from_float(objBox.max.y),qSPLAT);
															const	qword	MaxZ		=	__si_shufb(__si_from_float(objBox.max.z),__si_from_float(objBox.max.z),qSPLAT);

															// m_MatViewProjT viewed as four consecutive qwords
															const qword M0	=	reinterpret_cast<qword*>(&m_MatViewProjT)[0];
															const qword M1	=	reinterpret_cast<qword*>(&m_MatViewProjT)[1];
															const qword M2	=	reinterpret_cast<qword*>(&m_MatViewProjT)[2];
															const qword M3	=	reinterpret_cast<qword*>(&m_MatViewProjT)[3];




															// all eight min/max corner combinations: M0*x + M1*y + M2*z + M3
															// (assuming __si_fma(a,b,c) computes a*b+c)
															qword V0	=	__si_fma(M0,MinX,__si_fma(M1,MinY,__si_fma(M2,MinZ,M3)));
															qword V1	=	__si_fma(M0,MinX,__si_fma(M1,MaxY,__si_fma(M2,MinZ,M3)));
															qword V2	=	__si_fma(M0,MaxX,__si_fma(M1,MinY,__si_fma(M2,MinZ,M3)));
															qword V3	=	__si_fma(M0,MaxX,__si_fma(M1,MaxY,__si_fma(M2,MinZ,M3)));
															qword V4	=	__si_fma(M0,MinX,__si_fma(M1,MinY,__si_fma(M2,MaxZ,M3)));
															qword V5	=	__si_fma(M0,MinX,__si_fma(M1,MaxY,__si_fma(M2,MaxZ,M3)));
															qword V6	=	__si_fma(M0,MaxX,__si_fma(M1,MinY,__si_fma(M2,MaxZ,M3)));
															qword V7	=	__si_fma(M0,MaxX,__si_fma(M1,MaxY,__si_fma(M2,MaxZ,M3)));

															// bitwise OR of all corners: the sign bit of the w lane is set as soon as any
															// corner has the sign bit set in w
															qword	Mask	=	__si_or(__si_or(__si_or(V0,V1),
																												__si_or(V2,V3)),
																									__si_or(__si_or(V4,V5),
																												__si_or(V6,V7)));
														//	const float NearestW	=	si_to_float(__si_shufb(Mask,Mask,qW));
															// the w lane is read as an integer instead of a float to avoid a possible NaN;
															// a negative value makes the box count as visible without further testing
															const int NearestW      =     __si_to_int(__si_shufb(Mask,Mask,qW));//avoid possible nan
															IF(NearestW<0,0)
																return true;

														//	qword	Verts[8]	=	{V0,V1,V2,V3,V4,V5,V6,V7};
														//	return Rasterize2(reinterpret_cast<Vec4*>(Verts),8);
															// w of every corner, shuffled with the qW pattern
															qword V0W	=	__si_shufb(V0,V0,qW);
															qword V1W	=	__si_shufb(V1,V1,qW);
															qword V2W	=	__si_shufb(V2,V2,qW);
															qword V3W	=	__si_shufb(V3,V3,qW);
															qword V4W	=	__si_shufb(V4,V4,qW);
															qword V5W	=	__si_shufb(V5,V5,qW);
															qword V6W	=	__si_shufb(V6,V6,qW);
															qword V7W	=	__si_shufb(V7,V7,qW);


															// hack: si_fi is not mapped at all for the PPU, so each corner is divided by its w
															// with divf4 instead of the si_frest/si_fi sequence kept commented out below
															V0	=	(qword)divf4((vec_float4)V0,(vec_float4)V0W);
															V1	=	(qword)divf4((vec_float4)V1,(vec_float4)V1W);
															V2	=	(qword)divf4((vec_float4)V2,(vec_float4)V2W);
															V3	=	(qword)divf4((vec_float4)V3,(vec_float4)V3W);
															V4	=	(qword)divf4((vec_float4)V4,(vec_float4)V4W);
															V5	=	(qword)divf4((vec_float4)V5,(vec_float4)V5W);
															V6	=	(qword)divf4((vec_float4)V6,(vec_float4)V6W);
															V7	=	(qword)divf4((vec_float4)V7,(vec_float4)V7W);
															//V0	=	si_fm(V0,si_fi(V0W,si_frest(V0W)));
															//V1	=	si_fm(V1,si_fi(V1W,si_frest(V1W)));
															//V2	=	si_fm(V2,si_fi(V2W,si_frest(V2W)));
															//V3	=	si_fm(V3,si_fi(V3W,si_frest(V3W)));
															//V4	=	si_fm(V4,si_fi(V4W,si_frest(V4W)));
															//V5	=	si_fm(V5,si_fi(V5W,si_frest(V5W)));
															//V6	=	si_fm(V6,si_fi(V6W,si_frest(V6W)));
															//V7	=	si_fm(V7,si_fi(V7W,si_frest(V7W)));
															// pairwise compare/select reduction of the eight corners, first level
															// (presumably __si_selb(a,b,m) takes b where m is set, giving min resp. max)
															qword MinV01	=	__si_selb(V0,V1,__si_fcgt(V0,V1));
															qword MinV23	=	__si_selb(V2,V3,__si_fcgt(V2,V3));
															qword MinV45	=	__si_selb(V4,V5,__si_fcgt(V4,V5));
															qword MinV67	=	__si_selb(V6,V7,__si_fcgt(V6,V7));
															qword MaxV01	=	__si_selb(V0,V1,__si_fcgt(V1,V0));
															qword MaxV23	=	__si_selb(V2,V3,__si_fcgt(V3,V2));
															qword MaxV45	=	__si_selb(V4,V5,__si_fcgt(V5,V4));
															qword MaxV67	=	__si_selb(V6,V7,__si_fcgt(V7,V6));

															// second level of the reduction
															qword MinV0123	=	__si_selb(MinV01,MinV23,__si_fcgt(MinV01,MinV23));
															qword MinV4567	=	__si_selb(MinV45,MinV67,__si_fcgt(MinV45,MinV67));
															qword MaxV0123	=	__si_selb(MaxV01,MaxV23,__si_fcgt(MaxV23,MaxV01));
															qword MaxV4567	=	__si_selb(MaxV45,MaxV67,__si_fcgt(MaxV67,MaxV45));

															// final per-component minimum and maximum over all eight corners
															qword MinV	=	__si_selb(MinV0123,MinV4567,__si_fcgt(MinV0123,MinV4567));
															qword MaxV	=	__si_selb(MaxV0123,MaxV4567,__si_fcgt(MaxV4567,MaxV0123));

															// lanes in which the bounds exceed the limits: m_VMin > MinV resp. MaxV > m_VMax
															qword MinMinMask	=	__si_fcgt(m_VMin,MinV);
															qword MaxMaxMask	=	__si_fcgt(MaxV,m_VMax);

															if(ROTATETEST==1)
															{
																// bounds exceed the limits -> visible
																// (BoundMask>>2 presumably keeps only the x/y lanes - confirm __si_gb bit order)
																int BoundMask			=	__si_to_int(__si_gb(__si_or(MinMinMask,MaxMaxMask)));
																if((BoundMask>>2)!=0)
																	return true;
															}
															if(ROTATETEST==2)
															{
																// these two locals shadow the outer masks for this scope only; here the whole
																// range has to lie beyond a limit (m_VMin > MaxV or MinV > m_VMax) -> visible
																qword MinMinMask	=	__si_fcgt(m_VMin,MaxV);
																qword MaxMaxMask	=	__si_fcgt(MinV,m_VMax);
																int BoundMask			=	__si_to_int(__si_gb(__si_or(MinMinMask,MaxMaxMask)));
																if((BoundMask>>2)!=0)
																	return true;
															}

															// clamp the bounds to m_VMin/m_VMax using the outer masks
															MinV	=	__si_selb(MinV,m_VMin,MinMinMask);
															MaxV	=	__si_selb(MaxV,m_VMax,MaxMaxMask);

															// minimum z converted to an unsigned integer with a scale argument of 16
															// (presumably a 2^16 scale matching the 16 bit TZBZexel - confirm), read from the z lane
															qword	qMinZ		=	__si_cfltu(MinV,16);
															int32	Minz	=	__si_to_int(__si_shufb(qMinZ,qMinZ,qZ));
														//	qword	qMinZ		=	__si_cfltu(__si_selb(MinV,qZLimit,__si_fcgt(qZLimit,MinV)),16);
														//	qword	Minz	=	__si_shufb(qMinZ,qMinZ,qSPLATh);

															// bounds converted to unsigned integers with a scale argument of 0
															MinV	=	__si_cfltu(MinV,0);
															MaxV	=	__si_cfltu(MaxV,0);

															// x is taken directly from the converted vectors (presumably the first lane),
															// y through the qY shuffle
															uint32	Minx	=	__si_to_int(MinV);
															uint32	Maxx	=	__si_to_int(MaxV);
															uint32	Miny	=	__si_to_int(__si_shufb(MinV,MinV,qY));
															uint32	Maxy	=	__si_to_int(__si_shufb(MaxV,MaxV,qY));
															// zero-width rectangle: reported as not visible
															if(Minx==Maxx)
																return false;

														//	Minx	&=	7u;
															// turn the x range into linear m_ZBuffer indices of row Miny; both are advanced
															// by m_SizeX per row in the loop below
															Minx	+=	Miny*m_SizeX;
															Maxx	+=	Miny*m_SizeX;
														//	Minz	-=	m_Bias;
															// scan rows [Miny,Maxy) and columns [Minx,Maxx); the first zexel greater than
															// Minz makes the box visible
															for(uint32 y=Miny;y<Maxy;y++,Minx+=m_SizeX,Maxx+=m_SizeX)
															for(uint32 x=Minx;x<Maxx;x++)
														//		if(static_cast<int64>(m_ZBuffer[x+y*m_SizeX])>Minz)
																if(m_ZBuffer[x]>Minz)
														//		if(__si_to_int(si_gbh(si_cgth(*reinterpret_cast<qword*>(&m_ZBuffer[x]),Minz)))!=0)
																	return true;

															// no zexel in the rectangle was greater than Minz
															return false;
														}
#endif

// Occlusion query entry point for an axis-aligned box.
// PS3 / vectorized XENON builds: the box goes to the vectorized test instantiated for the
// current m_RotationSafe value (0, 1, anything else -> 2); on PS3 the IsObjectVisible2
// variant is used instead when m_AccurateTest is set. eOcclusionObjectType and pRetVal are
// not used on this path.
// Other builds: the call is forwarded to the IsBoxVisible_* function that matches
// eOcclusionObjectType; an unhandled type asserts and reports the box as visible.
// fDistance is not used on either path.
ILINE bool IsObjectVisible(const AABB& objBox, EOcclusionObjectType eOcclusionObjectType, float fDistance, uint32* pRetVal = NULL)
{
	bool bVisible = true;

#if defined(PS3) || (defined(XENON) && XENON_CULLER_VECTORIZED)
	//TODOMK make accurate test after bounding rect returns 1
#if defined(PS3)
	if(m_AccurateTest)
	{
		switch(m_RotationSafe)
		{
		case 0:		bVisible = IsObjectVisible2<0>(objBox);	break;
		case 1:		bVisible = IsObjectVisible2<1>(objBox);	break;
		default:	bVisible = IsObjectVisible2<2>(objBox);	break;
		}
	}
	else
#endif
	{
		switch(m_RotationSafe)
		{
		case 0:		bVisible = IsObjectVisible<0>(objBox);	break;
		case 1:		bVisible = IsObjectVisible<1>(objBox);	break;
		default:	bVisible = IsObjectVisible<2>(objBox);	break;
		}
	}
#else
	switch(eOcclusionObjectType)
	{
	case eoot_OCCLUDER:
		bVisible = IsBoxVisible_OCCLUDER(objBox, pRetVal);
		break;
	case eoot_OCEAN:
		bVisible = IsBoxVisible_OCEAN(objBox, pRetVal);
		break;
	case eoot_OCCELL:
		bVisible = IsBoxVisible_OCCELL(objBox, pRetVal);
		break;
	case eoot_OCCELL_OCCLUDER:
		bVisible = IsBoxVisible_OCCELL_OCCLUDER(objBox, pRetVal);
		break;
	case eoot_OBJECT:
		bVisible = IsBoxVisible_OBJECT(objBox, pRetVal);
		break;
	case eoot_OBJECT_TO_LIGHT:
		bVisible = IsBoxVisible_OBJECT_TO_LIGHT(objBox, pRetVal);
		break;
	case eoot_TERRAIN_NODE:
		bVisible = IsBoxVisible_TERRAIN_NODE(objBox, pRetVal);
		break;
	case eoot_PORTAL:
		bVisible = IsBoxVisible_PORTAL(objBox, pRetVal);
		break;
	default:
		// unknown type: complain in debug builds, stay conservative (visible)
		assert(!"Undefined occluder type");
		break;
	}
#endif

	return bVisible;
}

	// Shadow caster query: no test is performed, every caster is reported as visible.
	bool											IsShadowcasterVisible(const AABB& objBox,Vec3 rExtrusionDir)
														{
															return true;
														}
	// Draws the culler's content to the screen for debugging purposes.
	// Only declared here; the definition is not part of this header section.
	// NOTE(review): the meaning of nStep is not visible from the declaration - see the definition.
	void											DrawDebug(int32 nStep);

	// Tree update hook; this culler has nothing to update, so the body is empty.
	void											UpdateDepthTree()
														{
														}

	// Read-only access to the camera stored in m_Camera.
	const CCamera&						GetCamera() const
														{
															return m_Camera;
														}

	//set the scissor for clipping (0.f|0.f to 1.f|1.f)
/*	void											Scissor(f32 TopLeftX,f32 TopLeftY,f32 BottomRightX,f32 BottomRightY)
														{
															m_TopLeftX			=	TopLeftX*2.f-1.f;
															m_TopLeftY			=	TopLeftY*2.f-1.f;
															m_BottomRightX	=	BottomRightX*2.f-1.f;
															m_BottomRightY	=	BottomRightY*2.f-1.f;
														}
*/

	// Memory statistics hook taking the ICrySizer to report to.
	// Only declared here; the definition is not part of this header section.
	void											GetMemoryUsage(ICrySizer * pSizer) const;

	// Stores fTime in m_FrameTime (read back through GetFrameTime).
	void											SetFrameTime(f32 fTime){m_FrameTime=fTime;}
	// Returns the value last stored in m_FrameTime.
	ILINE f32									GetFrameTime()
														{
															return m_FrameTime;
														}
	bool											IsOutdooVisible(){return m_OutdoorVisible==1;}
	// Written-triangle counter: the setter ignores its argument, the getter always returns 0.
	void											TrisWritten(int32) {}
	int32											TrisWritten() const { return 0; }
	// Written-object counter: the setter ignores its argument, the getter always returns 0.
	void											ObjectsWritten(int32) {}
	int32											ObjectsWritten() const { return 0; }
	// Tested-triangle counter; always returns 0.
	int32											TrisTested() const { return 0; }
	// Returns m_ObjectsTested converted to int32.
	int32											ObjectsTested() const { return static_cast<int32>(m_ObjectsTested); }
	// Returns m_ObjectsTestedAndRejected converted to int32.
	int32											ObjectsTestedAndRejected() const { return static_cast<int32>(m_ObjectsTestedAndRejected); }
	// Returns the buffer width m_SizeX converted to int32.
	int32											SelRes() const { return static_cast<int32>(m_SizeX); }
	// Returns the value stored in m_FixedZFar.
	float											FixedZFar() const
														{
															return m_FixedZFar;
														}
	// Near distance in meters; constant 0.
	float											GetZNearInMeters() const { return 0.0f; }
	// Far distance in meters; constant 1024.
	float											GetZFarInMeters() const { return 1024.0f; }

} _ALIGN(128);

// Box test used for eoot_TERRAIN_NODE; forwards both arguments unchanged to IsBoxVisible.
ILINE bool CZBufferCuller::IsBoxVisible_TERRAIN_NODE(const AABB& objBox, uint32* const __restrict pResDest)
{
	const bool bVisible = IsBoxVisible(objBox, pResDest);
	return bVisible;
}

// Box test used for eoot_OCCELL_OCCLUDER; forwards both arguments unchanged to IsBoxVisible.
ILINE bool CZBufferCuller::IsBoxVisible_OCCELL_OCCLUDER(const AABB& objBox, uint32* const __restrict pResDest)
{
	const bool bVisible = IsBoxVisible(objBox, pResDest);
	return bVisible;
}

// Box test used for eoot_OCCLUDER; forwards both arguments unchanged to IsBoxVisible.
ILINE bool CZBufferCuller::IsBoxVisible_OCCLUDER(const AABB& objBox, uint32* const __restrict pResDest)
{
	const bool bVisible = IsBoxVisible(objBox, pResDest);
	return bVisible;
}

// Box test used for eoot_OCEAN; forwards both arguments unchanged to IsBoxVisible.
ILINE bool CZBufferCuller::IsBoxVisible_OCEAN(const AABB& objBox, uint32* const __restrict pResDest)
{
	const bool bVisible = IsBoxVisible(objBox, pResDest);
	return bVisible;
}

// Box test used for eoot_OCCELL. While the e_CoverageBufferDebugFreeze cvar is set the box
// is reported as visible without testing; otherwise the call is forwarded to IsBoxVisible.
ILINE bool CZBufferCuller::IsBoxVisible_OCCELL(const AABB& objBox, uint32* const __restrict pResDest)
{
	if(!GetCVars()->e_CoverageBufferDebugFreeze)
	{
		return IsBoxVisible(objBox, pResDest);
	}

	return true;
}

// Box test used for eoot_OBJECT; forwards both arguments unchanged to IsBoxVisible.
ILINE bool CZBufferCuller::IsBoxVisible_OBJECT(const AABB& objBox, uint32* const __restrict pResDest)
{
	const bool bVisible = IsBoxVisible(objBox, pResDest);
	return bVisible;
}

// Box test used for eoot_OBJECT_TO_LIGHT; forwards both arguments unchanged to IsBoxVisible.
ILINE bool CZBufferCuller::IsBoxVisible_OBJECT_TO_LIGHT(const AABB& objBox, uint32* const __restrict pResDest)
{
	const bool bVisible = IsBoxVisible(objBox, pResDest);
	return bVisible;
}

// Box test used for eoot_PORTAL; forwards both arguments unchanged to IsBoxVisible.
ILINE bool CZBufferCuller::IsBoxVisible_PORTAL(const AABB& objBox, uint32* const __restrict pResDest)
{
	const bool bVisible = IsBoxVisible(objBox, pResDest);
	return bVisible;
}

#endif 
