/*
	relocatable and on demand linkable version of non inline libgcm calls
*/

#if defined(PS3)
#if defined(__SPU__)

//define eCryModule as eCryM_Launcher unless it has already been defined
#ifndef eCryModule
	#define eCryModule eCryM_Launcher
#endif

//enables the USE_FIXED_WRAP code paths in HandlePPUCommandBufferChanges and cellGcmFlush
#define USE_FIXED_WRAP

//redefine SPU_DEBUG_BREAK_ALWAYS with an empty replacement list at this point of the file
//NOTE(review): the headers included further down may provide their own definition - confirm which one is active
#undef SPU_DEBUG_BREAK_ALWAYS
#define SPU_DEBUG_BREAK_ALWAYS

//command buffer offsets, read from the SPU page directory info block at G_SPU_PAGE_DIR_INFO
#define GCM_CMD_INJECTION_OFFSET (((NSPU::SPageDirInfo*)(void*)G_SPU_PAGE_DIR_INFO)->gcmInjectBufOff)
#define GCM_CMD_START_OFFSET (((NSPU::SPageDirInfo*)(void*)G_SPU_PAGE_DIR_INFO)->gcmCmdResetOffset)

//unless SPU asserts are disabled, map the libgcm checking macros onto assert
#if !defined(_NO_SPU_ASSERT)
	#undef CELL_GCM_RESERVE
	//asserts that the currently filled local size plus a stays below LOCAL_SPU_CMD_BUF_SIZE
	//NOTE(review): the argument a is not parenthesized in the expansion
	#define CELL_GCM_RESERVE(a) assert((uint32)a + (uint32)GetGcmSPUData()->contextData.current - ((uint32)GetGcmSPUData()->contextData.begin + GetGcmSPUData()->localContextOffset) < LOCAL_SPU_CMD_BUF_SIZE)

	#undef CELL_GCM_ASSERT
	#define CELL_GCM_ASSERT(a) assert(a)

	#undef CELL_GCM_ASSERTS
	//the message argument is dropped, only the condition is checked
	#define CELL_GCM_ASSERTS(a,mess) assert(a)
#endif

#include <CryModuleDefs.h>
#include <platform.h>
#include "../Memory.h"
#include "../SPUUtilities.h"
#include "../Cache/Cache_spu.h"
#include "../gcm_mapping.h"
#include <cell/gcm_spu.h>

using namespace cell::Gcm::UnsafeInline;

//uncomment to replace assert with the printing/halting variant below
//#define FORCE_ASSERT
#ifdef FORCE_ASSERT
	#undef assert
	#define assert(cond) \
	do \
	{ \
		if (__builtin_expect(!(cond), 0)) \
	{ \
		printf("assert: %s:%d\n",__FILE__, __LINE__); \
		SPU_DEBUG_HALT; \
	} \
	} while (false)
#endif

//value used in this file to mark a report value that has not been written yet (all bits set)
#ifndef INVALID_Z_COUNT
	#define INVALID_Z_COUNT ((uint32)~0)
#endif

//base address of the RSX range as stored in the page directory info, and the size of that range (256 MB)
#define RSX_ADDRESS_BASE (((NSPU::SPageDirInfo*)(void*)G_SPU_PAGE_DIR_INFO)->gcmRsxBaseAddress)//0xC0000000
#define RSX_RANGE (256 << 20)

//maps a byte offset inside the main command buffer to its segment index
#define Segment(Idx) (((Idx)/GCM_CMD_SEGMENTSIZE)&GCM_CMD_SEGMENTMASK)

//Issues MFC PUT commands that copy from local store address cSource to effective address cDest.
//All commands are queued on tag g_scDMAListTag (the same tag is used for the ps transfer, so pay
//attention when changing it). The function only enqueues the transfers, it does not wait for them.
//	cDest		- destination effective address (only the low 32 bit, MFC_EAL, are programmed here)
//	cSource	- source address in local store
//	cSize		- number of bytes; 0 returns immediately
//Sizes >= 16 are sent in pieces of at most 16 KB and require a 16 byte aligned destination (asserted).
//Sizes below 16 are sent as one, two (cSize > 4) or three (cSize == 12) separate 4 byte transfers.
//Source and destination must share the same low 4 address bits (asserted).
static void MemcpyMainAllSize(const uint32 cDest, const uint32 cSource, const uint32 cSize)
{
	IF(cSize == 0, false)
		return;
	assert(((uint32)cDest & 0xF) == ((uint32)cSource & 0xF));
	//select the tag group once, it stays set for all commands issued below
	si_wrch(MFC_TagID,si_from_uint(g_scDMAListTag));
	IF(cSize >= 16, true)
	{
		assert(((uint32)cDest & 0xF) == 0);
		//signed on purpose: the counter goes to or below zero after the last piece
		int sizeLeft = (int)cSize;
		uint32 curDest = (uint32)cDest;
		uint32 curSource = cSource;
		do
		{
			si_wrch(MFC_LSA,si_from_uint(curSource));
			si_wrch(MFC_EAL,si_from_uint(curDest));
			//last piece carries the remainder, all others the full 16 KB
			si_wrch(MFC_Size,si_from_uint((sizeLeft>16*1024)?16*1024 : sizeLeft));
			si_wrch(MFC_Cmd,si_from_uint(MFC_PUT_CMD));//initiate transfer
			sizeLeft	-= 16*1024;//transfer 16 KB-wise
			curSource	+= 16*1024;
			curDest		+= 16*1024;
		}
		WHILE(sizeLeft > 0, 0);
		return;
	}
	//small transfer: first 4 bytes are always sent
	si_wrch(MFC_LSA,si_from_ptr((volatile void*)cSource));
	si_wrch(MFC_EAL,si_from_uint(cDest));
	si_wrch(MFC_Size,si_from_uint(4));
	si_wrch(MFC_Cmd,si_from_uint(MFC_PUT_CMD));//start asynchronous transfer back
	//second word for sizes above 4 bytes
	IF(cSize > 4, 1)
	{
		si_wrch(MFC_LSA,si_from_ptr((volatile void*)(uintptr_t)((uint32)cSource+4)));
		si_wrch(MFC_EAL,si_from_uint(cDest+4));
		si_wrch(MFC_Size,si_from_uint(4));
		si_wrch(MFC_Cmd,si_from_uint(MFC_PUT_CMD));//start asynchronous transfer back
	}
	//third word only for exactly 12 bytes
	IF(cSize == 12, 0)
	{
		si_wrch(MFC_LSA,si_from_ptr((volatile void*)(uintptr_t)((uint32)cSource+8)));
		si_wrch(MFC_EAL,si_from_uint(cDest+8));
		si_wrch(MFC_Size,si_from_uint(4));
		si_wrch(MFC_Cmd,si_from_uint(MFC_PUT_CMD));//start asynchronous transfer back
	}
}

//Rewinds the local command buffer so that recording restarts at the beginning of the local buffer.
//The start position keeps the low 4 address bits of the PPU context's current pointer, so the local
//copy and the main memory copy share the same offset inside a 16 byte line.
ILINE void ResetLocalGcmContext()
{
	CellGcmSPUData* const __restrict pData = GetGcmSPUData();
	const uint32 cPPUCurrent = (uint32)(void*)pData->pGlobalPPUContext->current;
	const uint32 cLineOffset = cPPUCurrent & 15;
	pData->localContextOffset = cLineOffset;
	pData->contextData.current = (uint32_t*)(pData->pLocalCmdBuffer + cLineOffset);
}

//Fetches the current PPU gcm context from main memory into the SPU side copy and re-bases the local
//command buffer onto the low 4 address bits of the PPU context's current pointer.
//Commands already recorded locally are kept: if the new offset differs from the old one they are
//moved to the new start position, otherwise they stay in place.
//On the first invocation (reportAreaLocSet == 0) it additionally records the zpass pixel count enable
//and the report location (main memory) commands.
void cellGcmUpdateGlobalPPUContext()
{
	//transfer current PPU context here to set the local one to the same offset
	//also incorporates ResetLocalGcmContext
	const uint32 cEA = ((NSPU::SPageDirInfo*)(void*)G_SPU_PAGE_DIR_INFO)->gcmGlobalPPUContext;
	//not aligned on qw address: fetch from the 16 byte aligned address below cEA and address the
	//context inside the fetched area by the low 4 bits of cEA
	CellGcmSPUData* const __restrict pGcmSPUData = GetGcmSPUData();
	MemcpyLS(pGcmSPUData->globalPPUContextMem, cEA & ~15, sizeof(CellGcmLocalContextData)*2, g_scMemCpyTempTag);
	pGcmSPUData->pGlobalPPUContext = (CellGcmContextData*)(void*)&((uint8*)pGcmSPUData->globalPPUContextMem)[cEA & 15];
	SyncMemory(g_scMemCpyTempTag);//sync transfer
	//segment the PPU write position currently lies in (offset relative to begin - CELL_GCM_INIT_STATE_OFFSET)
	pGcmSPUData->lastUsedSegment = Segment((uint32)pGcmSPUData->pGlobalPPUContext->current-((uint32)pGcmSPUData->pGlobalPPUContext->begin - CELL_GCM_INIT_STATE_OFFSET));
	//if the command buffer has not been flushed yet, just update main memory addresses if last 4 bits fit
	//this relies on the fact that the order of commands does not matter
	const uint32 cLocalContextOffset		= pGcmSPUData->localContextOffset;
	const uint32 cNewLocalContextOffset = (uint32)(void*)pGcmSPUData->pGlobalPPUContext->current;
	pGcmSPUData->localContextOffset			= cNewLocalContextOffset & 15;
	//true if commands have been recorded since the local context was last reset
	IF(pGcmSPUData->contextData.current	!= (uint32_t*)&pGcmSPUData->pLocalCmdBuffer[cLocalContextOffset], 0)
	{
		IF((cLocalContextOffset & 15) != (pGcmSPUData->localContextOffset & 15), 0)
		{
			//need to recopy the command buffer
			const uint32 cFilledSize = 	(uint32)pGcmSPUData->contextData.current - ((uint32)pGcmSPUData->contextData.begin + cLocalContextOffset);
			//temporary copy since old and new location overlap
			//NOTE(review): variable length array on the stack, sized by the amount of recorded commands
			uint32 localCopy[cFilledSize/4];
			uint32 *pLocalCmdBuf = (uint32_t*)&pGcmSPUData->pLocalCmdBuffer[cLocalContextOffset];
			for(uint32 i=0; i<cFilledSize/4; ++i)
				localCopy[i] = pLocalCmdBuf[i];
			pLocalCmdBuf = (uint32_t*)&pGcmSPUData->pLocalCmdBuffer[pGcmSPUData->localContextOffset];
			for(uint32 i=0; i<cFilledSize/4; ++i)
				pLocalCmdBuf[i] = localCopy[i];
			pGcmSPUData->contextData.current	= (uint32_t*)&pGcmSPUData->pLocalCmdBuffer[pGcmSPUData->localContextOffset + cFilledSize];
		}
	}
	else	
		pGcmSPUData->contextData.current	= (uint32_t*)&pGcmSPUData->pLocalCmdBuffer[pGcmSPUData->localContextOffset];//nothing recorded: just restart at the new offset
	IF(pGcmSPUData->reportAreaLocSet == 0, 0)
	{
		//set report location for 1st invocation
		cell::Gcm::UnsafeInline::cellGcmSetZpassPixelCountEnable((CellGcmContextData*)&pGcmSPUData->contextData, CELL_GCM_TRUE);
		cell::Gcm::UnsafeInline::cellGcmSetReportLocation((CellGcmContextData*)&pGcmSPUData->contextData, CELL_GCM_LOCATION_MAIN);
		pGcmSPUData->reportAreaLocSet = 1;
	}
}

void cellGcmAddRSXWaitTicks(const uint32 cStart, const uint32 cTicks)
{
	GetGcmSPUData()->rsxWaitTime += (cStart - cTicks);
}

void cellGcmAddPerfTicks0(const uint32 cTicks)
{
	GetGcmSPUData()->perfTime0 += cTicks;
}

void cellGcmAddPerfTicks1(const uint32 cTicks)
{
	GetGcmSPUData()->perfTime1 += cTicks;
}

void cellGcmAddPerfTicks2(const uint32 cTicks)
{
	GetGcmSPUData()->perfTime2 += cTicks;
}

void cellGcmAddPerfTicks3(const uint32 cTicks)
{
	GetGcmSPUData()->perfTime3 += cTicks;
}

void cellGcmAddRSXStallTicks(const uint32 cStart, const uint32 cTicks)
{
	GetGcmSPUData()->rsxWaitTime += (cStart - cTicks);
}

//Returns the accumulated RSX wait time and clears the counter.
uint32 cellGcmGetAndResetRSXWaitTicks()
{
	CellGcmSPUData* const pData = GetGcmSPUData();
	const uint32 cAccumulated = pData->rsxWaitTime;
	pData->rsxWaitTime = 0;
	return cAccumulated;
}

//Returns the value of performance counter 0 and clears it.
uint32 cellGcmGetAndResetPerfTicks0()
{
	CellGcmSPUData* const pData = GetGcmSPUData();
	const uint32 cAccumulated = pData->perfTime0;
	pData->perfTime0 = 0;
	return cAccumulated;
}

//Returns the value of performance counter 1 and clears it.
uint32 cellGcmGetAndResetPerfTicks1()
{
	CellGcmSPUData* const pData = GetGcmSPUData();
	const uint32 cAccumulated = pData->perfTime1;
	pData->perfTime1 = 0;
	return cAccumulated;
}

//Returns the value of performance counter 2 and clears it.
uint32 cellGcmGetAndResetPerfTicks2()
{
	CellGcmSPUData* const pData = GetGcmSPUData();
	const uint32 cAccumulated = pData->perfTime2;
	pData->perfTime2 = 0;
	return cAccumulated;
}

//Returns the value of performance counter 3 and clears it.
uint32 cellGcmGetAndResetPerfTicks3()
{
	CellGcmSPUData* const pData = GetGcmSPUData();
	const uint32 cAccumulated = pData->perfTime3;
	pData->perfTime3 = 0;
	return cAccumulated;
}

//Writes the locally recorded command words back to the PPU command buffer in main memory and
//advances the current pointer of the PPU context (the new value is also transferred to main memory).
//If the write position would come within one segment of the end of the main buffer, a jump command to
//GCM_CMD_START_OFFSET is appended to the local commands before they are written back.
//Returns true if that jump command was appended, false otherwise (also if nothing was recorded).
//All transfers are queued on g_scDMAListTag; this function does not wait for them.
ILINE const bool HandlePPUCommandBufferChanges()
{
	//calculate new current-of PPU command buffer, pay attention that it might reach the end
	//it is assumed that the size is a multiple of 16
	CellGcmSPUData* const __restrict pGcmSPUData = GetGcmSPUData();
	const uint32 cLocalContextOffset = pGcmSPUData->localContextOffset;
	//bytes recorded locally since the last reset
	uint32 filledSize = (uint32)pGcmSPUData->contextData.current - ((uint32)pGcmSPUData->contextData.begin + cLocalContextOffset);
	IF(filledSize == 0, 0)
		return false;
//#if !defined(_NO_SPU_ASSERT) || defined(FORCE_ASSERT)
	//overflow check is always compiled in (the guarding #if is commented out)
	IF((uint32)pGcmSPUData->contextData.current > (uint32)pGcmSPUData->contextData.end, 0)
	{
		printf("cmd buf overflow (%d kb)\n",((uint32)pGcmSPUData->contextData.end - (uint32)pGcmSPUData->contextData.begin) >> 10);
		SPU_DEBUG_BREAK_ALWAYS;
	}
//#endif
	CellGcmContextData *const pGlobPPUContext = pGcmSPUData->pGlobalPPUContext;
	uint8 *const __restrict pLocalCMDBuf = pGcmSPUData->pLocalCmdBuffer;
	//NOTE(review): cGlobPPUEnd is not used anywhere below
	const uint32 cGlobPPUEnd		= (uint32)(void*)pGlobPPUContext->end;
	const uint32 cGlobPPUCur		= (uint32)(void*)pGlobPPUContext->current;
	const uint32 cMainBufStart = (uint32)pGcmSPUData->pGlobalPPUContext->begin - CELL_GCM_INIT_STATE_OFFSET;
	//set jump command to command buffer start if we have to switch to begin
	bool wrapping = false;
	uint32 bytesJumpAdded = 0;
#ifdef USE_FIXED_WRAP
	//fixed variant: the bytes about to be written are included in the end-of-buffer test
	IF((cGlobPPUCur - cMainBufStart + filledSize)+GCM_CMD_SEGMENTSIZE >= GCM_CMD_SIZE, 0)
#else
	IF((cGlobPPUCur - cMainBufStart)+GCM_CMD_SEGMENTSIZE >= GCM_CMD_SIZE, 0)
#endif
	{
		const uint32 cCurPrev = (uint32)pGcmSPUData->contextData.current;
		cell::Gcm::UnsafeInline::cellGcmSetJumpCommand((CellGcmContextData*)&pGcmSPUData->contextData, (std::uint32_t)GCM_CMD_START_OFFSET);
		//size of the jump command is measured, not assumed
		bytesJumpAdded = (uint32)pGcmSPUData->contextData.current - cCurPrev;
		filledSize += bytesJumpAdded;
		wrapping = true;
#ifndef USE_FIXED_WRAP
		bytesJumpAdded = 0;
#endif
	}
	//transfer all command buffer changes back, pay attention that 4 byte boundaries are handled correctly
	//three steps: head up to the next 16 byte boundary, 16 byte multiple in the middle, tail below 16 bytes
	uint32 bytesLeft = filledSize;
	const uint32 cBytesFirst = (bytesLeft>(16-cLocalContextOffset))?(16-cLocalContextOffset) : bytesLeft;
	uint32 curPPUEA = (uint32)(void*)pGlobPPUContext->current;
	MemcpyMainAllSize(curPPUEA, (uint32)&pLocalCMDBuf[cLocalContextOffset], cBytesFirst);
	bytesLeft -= cBytesFirst;
	uint32 bytesCopied = cBytesFirst;
	curPPUEA += cBytesFirst;
	IF(bytesLeft >= 16, 1)
	{
		const uint32 cToCopy = bytesLeft & ~15;
		MemcpyMainAllSize(curPPUEA, (uint32)&pLocalCMDBuf[cLocalContextOffset + bytesCopied], cToCopy);
		bytesCopied += cToCopy;
		curPPUEA	  += cToCopy;
		bytesLeft   -= cToCopy;
	}
	if(bytesLeft)
	{
		MemcpyMainAllSize(curPPUEA, (uint32)&pLocalCMDBuf[cLocalContextOffset + bytesCopied], bytesLeft);
		curPPUEA += bytesLeft;
	}
	//update PPU gcm context (current), transfer back fenced
	//with USE_FIXED_WRAP the new current points in front of the appended jump command
	//NOTE(review): the transfer below uses MemcpyMain, not a fenced/barrier variant - confirm the comment above
	pGlobPPUContext->current = (uint32*)(void*)(curPPUEA - bytesJumpAdded);
	const uint32 cEA = ((NSPU::SPageDirInfo*)(void*)G_SPU_PAGE_DIR_INFO)->gcmGlobalPPUContext;
	//not aligned on qw address
	//only the 4 byte current member is written back
	MemcpyMain
	(
		cEA + offsetof(CellGcmLocalContextData, current), 
		(NSPU::TAddrLS)((uint8*)pGlobPPUContext + offsetof(CellGcmLocalContextData, current)), 
		4, g_scDMAListTag
	);
	return wrapping;
}

//Returns the local PS buffer registered in cellGcmInitLocalGcmContext.
uint8* cellGcmGetPSBuf()
{
	CellGcmSPUData* const pData = GetGcmSPUData();
	return (uint8*)pData->pLocalPSBuffer;
}

//Returns the local VS buffer registered in cellGcmInitLocalGcmContext.
uint8* cellGcmGetVSBuf()
{
	CellGcmSPUData* const pData = GetGcmSPUData();
	return (uint8*)pData->pLocalVSBuffer;
}

//Starts the transfer of PS ucode from main memory into the local PS buffer on tag g_scDMAUCodeTag.
//The transfer is not waited for here; cellGcmSyncUCodeLS waits on that tag.
//	pMainUCode	- source address in main memory
//	cUCodeSize	- number of bytes to transfer
//	cOff				- byte offset into the local PS buffer at which the ucode is placed
//Returns the local store destination address.
//A ucode size above the local PS buffer size is only reported via printf, the transfer is issued anyway.
uint8* cellGcmCpyUCodeLS(void* const __restrict pMainUCode, const uint32 cUCodeSize, const uint32 cOff)
{
	CellGcmSPUData* const __restrict pData = GetGcmSPUData();
	uint8* const __restrict pTarget = pData->pLocalPSBuffer + cOff;
	//size check is always compiled in, it does not take cOff into account
	IF(cUCodeSize > pData->localPSBufferSize, 0)
	{
		printf("PS ucode(size: %d KB) too large(max %d KB)\n",cUCodeSize>>10, pData->localPSBufferSize>>10);
	}
	MemcpyLS((void*)pTarget, (uint32)pMainUCode, cUCodeSize, g_scDMAUCodeTag);
	return pTarget;
}

//Starts the transfer of vertex program code from main memory into the local VS buffer on tag
//g_scDMAVertexCodeTag. The transfer is not waited for here; cellGcmCpySyncVertexCode waits on that tag.
//	pMainUCode	- source address in main memory
//	cUCodeSize	- number of bytes to transfer
//	cOff				- byte offset into the local VS buffer at which the code is placed
//Returns the local store destination address.
//A code size above the local VS buffer size is only reported via printf, the transfer is issued anyway.
uint8* cellGcmCpyVertexCodeLS(void* const __restrict pMainUCode, const uint32 cUCodeSize, const uint32 cOff)
{
	CellGcmSPUData* const __restrict pData = GetGcmSPUData();
	uint8* const __restrict pTarget = pData->pLocalVSBuffer + cOff;
	//size check is always compiled in, it does not take cOff into account
	IF(cUCodeSize > pData->localVSBufferSize, 0)
	{
		printf("VS code(size: %d KB) too large(max %d KB)\n",cUCodeSize>>10, pData->localVSBufferSize>>10);
	}
	MemcpyLargeLS((void*)pTarget, (uint32)pMainUCode, cUCodeSize, g_scDMAVertexCodeTag, false);
	return pTarget;
}

void cellGcmCpyUCodeMain
(
	void* const __restrict pRSXLoc, 
	const void* const __restrict cpMainUCode, 
	const uint32 cUCodeSize
)
{
	//copy back from local store to RSX and main in parallel
	CellGcmSPUData* const __restrict pGcmSPUData = GetGcmSPUData();
	uint8* const __restrict pLSDest = pGcmSPUData->pLocalPSBuffer;
	MemcpyMainBarrier((uint32)pRSXLoc, (void*)pLSDest, cUCodeSize, g_scDMAListTag);//make sure u code transfer has finished
	MemcpyMain((uint32)cpMainUCode, (void*)pLSDest, cUCodeSize, g_scDMAListTag);
#if !defined(_NO_SPU_ASSERT) || defined(FORCE_ASSERT)
	IF(pGcmSPUData->psBufferTransferInFlight, 0)
	{
		printf("cellGcmCpyUCodeMain already in progress\n");
		SPU_DEBUG_BREAK_ALWAYS;
	}
	pGcmSPUData->psBufferTransferInFlight = 1;
#endif
}

//Copies cUCodeSize bytes of ucode from the local store address cpLSUCode to pRSXLoc.
//The transfer is issued with the barrier variant on g_scDMAListTag and is not waited for here.
//Unlike cellGcmCpyUCodeMain there is only this one destination and the in-flight flag is not touched.
//	pRSXLoc			- destination effective address
//	cpLSUCode		- source address in local store
//	cUCodeSize	- number of bytes to transfer
void cellGcmCpyUCodeMainFromLS
(
	void* const __restrict pRSXLoc, 
	const void* const __restrict cpLSUCode, 
	const uint32 cUCodeSize
)
{
	//the transfer function takes a non const source pointer, the data is only read
	MemcpyMainBarrier((uint32)pRSXLoc, (void*)cpLSUCode, cUCodeSize, g_scDMAListTag);
}

//Initializes the SPU side gcm data and registers it via SetGcmSPUData.
//	ppThisContext	- receives the address of the local context (pGcmSPUData->contextData)
//	pGcmSPUData		- SPU gcm data to initialize
//	pLocalCmdBuf	- local command buffer, cLocalCmdSize bytes
//	pLocalPSBuf		- local PS buffer, cLocalPSSize bytes
//	pLocalVSBuf		- local VS buffer, cLocalVSSize bytes
//The local context starts empty at offset 0 of the command buffer, all counters, segment handles
//and state flags are cleared. Before that the RSX address range is passed to __cache_range_write_async.
void cellGcmInitLocalGcmContext
(
	CellGcmLocalContextData *__restrict* ppThisContext,
	CellGcmSPUData *__restrict pGcmSPUData,
	uint8_t *__restrict pLocalCmdBuf, 
	const uint32_t cLocalCmdSize, 
	uint8_t *__restrict pLocalPSBuf,
	const uint32_t cLocalPSSize,
	uint8_t *__restrict pLocalVSBuf,
	const uint32_t cLocalVSSize
)
{
	//mark whole rsx memory as async
	const uint32 cRsxBase = RSX_ADDRESS_BASE;
	__cache_range_write_async(cRsxBase, cRsxBase + RSX_RANGE);
	//(currently not referenced below)
	const NSPU::SPageDirInfo& __restrict crPageInfo = *((NSPU::SPageDirInfo*)(void*)G_SPU_PAGE_DIR_INFO);

	SetGcmSPUData(pGcmSPUData);
	*ppThisContext = &pGcmSPUData->contextData;
	pGcmSPUData->lastUsedSegment = 0;

	//local command buffer and the context recording into it
	pGcmSPUData->pLocalCmdBuffer = pLocalCmdBuf;
	pGcmSPUData->contextData.begin = (uint32_t *)pLocalCmdBuf;
	pGcmSPUData->contextData.end = (uint32_t *)(pLocalCmdBuf + cLocalCmdSize);
	pGcmSPUData->contextData.callback = NULL;

	//shader staging buffers
	pGcmSPUData->pLocalPSBuffer = pLocalPSBuf;
	pGcmSPUData->localPSBufferSize = cLocalPSSize;
	pGcmSPUData->pLocalVSBuffer = pLocalVSBuf;
	pGcmSPUData->localVSBufferSize = cLocalVSSize;
#if !defined(_NO_SPU_ASSERT) || defined(FORCE_ASSERT)
	pGcmSPUData->psBufferTransferInFlight = 0;
#endif

	//recording starts at the very beginning of the local buffer
	pGcmSPUData->localContextOffset = 0;
	pGcmSPUData->contextData.current = (uint32_t*)pGcmSPUData->pLocalCmdBuffer;

	//timing counters
	pGcmSPUData->rsxWaitTime = 0;
	pGcmSPUData->perfTime0 = 0;
	pGcmSPUData->perfTime1 = 0;
	pGcmSPUData->perfTime2 = 0;
	pGcmSPUData->perfTime3 = 0;

	for(uint32 seg = 0; seg < GCM_CMD_SEGMENTCOUNT; ++seg)
	{
		pGcmSPUData->SegmentHandle[seg] = 0;
	}
	pGcmSPUData->reportAreaLocSet = 0;
	pGcmSPUData->rsxPushOffset = 0;
}

//Converts an address inside the RSX range [RSX_ADDRESS_BASE, RSX_ADDRESS_BASE + RSX_RANGE) into
//the offset relative to RSX_ADDRESS_BASE.
//	address	- address to convert
//	offset	- receives the offset; left untouched if the address is outside the range
//Returns CELL_OK on success, -1 (after printing a message) if the address is outside the range.
int32_t cellGcmAddressToOffset(const void* address, uint32_t *__restrict offset)
{
	const uint32 cEA = (uint32)address;
	IF(cEA >= RSX_ADDRESS_BASE && cEA < (RSX_ADDRESS_BASE + RSX_RANGE), true)
	{
		//memory resides on RSX
		*offset = cEA - RSX_ADDRESS_BASE;
		return CELL_OK;
	}
	//error reporting is always compiled in
	printf("cellGcmAddressToOffset: bad offset for ea=0x%08x\n",cEA);
	SPU_DEBUG_BREAK_ALWAYS;
	return -1;
}

//location is always supposed to be main memory
/*uint64 cellGcmGetTimeStampLocation(const uint32 index)
{
	const uint32 cReportBaseEA = (((NSPU::SPageDirInfo*)(void*)G_SPU_PAGE_DIR_INFO)->reportEA);
	uint8 cacheLine[128] _ALIGN(128);
	const uint32 cReportEA = cReportBaseEA + index * 16;
	const uint32 cReportEAAligned = cReportEA & ~127;
	MemcpyLS((void*)cacheLine, cReportEAAligned, 128, g_scMemCpyTempTag);
	SyncMemory(g_scMemCpyTempTag);
	return *((uint64*)&cacheLine[cReportEA & 127]);
}
*/

//Waits until the RSX has passed the stamp identified by Handle or the report value belonging to the
//handle has been written.
//	Handle					- stamp handle to wait for
//	rZWriteCount		- receives the (truncated to 32 bit) report value read for the handle's report slot,
//										INVALID_Z_COUNT on timeout
//	recordWaitTime	- if true and SUPP_SPU_FRAME_STATS is defined, the waited ticks are accumulated
//	isResourceSync	- selects cellGcmAddRSXStallTicks (true) or cellGcmAddRSXWaitTicks (false) for the above
//	timeOutMs				- timeout in milliseconds, expected to be non negative
//Returns the first 64 bit value of the line fetched from gcmCountDeviceThreadEA, or ~0 on timeout.
//The tick counter is treated as counting down, as everywhere in this file (elapsed = start - now).
//The timeout is evaluated on the elapsed ticks in unsigned arithmetic, so it stays correct if the
//counter passes zero while waiting.
uint64 cellGcmSyncToRSX(const uint64 Handle, uint32& rZWriteCount, bool recordWaitTime, bool isResourceSync, int timeOutMs = 500/*ms*/)
{
	const NSPU::SPageDirInfo& __restrict crPageInfo = *((NSPU::SPageDirInfo*)(void*)G_SPU_PAGE_DIR_INFO);
	//fetch the line holding the device thread count (1st uint64) and the RSX swap stamp (2nd uint64)
	uint8 cacheLineSyncMan[128] _ALIGN(128);
	SPUSyncAtomicDCache();
	mfc_prep((void*)cacheLineSyncMan, crPageInfo.gcmCountDeviceThreadEA);
	mfc_getllar_again();
	mfc_read_atomic_status();
	uint64 *const __restrict pCountDeviceThreadLS		= (uint64*)cacheLineSyncMan;
	uint64 *const __restrict pSwapRSXLS	= pCountDeviceThreadLS + 1;
	uint64 curSwapRSX = *pSwapRSXLS;
	//locate the 16 byte report slot of the handle inside its 128 byte line
	const uint32 Idx	=	StampOffset(Handle);
	uint8 cacheLine[128] _ALIGN(128);
	const uint32 cReportBaseEA = (((NSPU::SPageDirInfo*)(void*)G_SPU_PAGE_DIR_INFO)->reportEA);
	const uint32 cReportEA = cReportBaseEA + Idx * 16;
	const uint32 cReportEAAligned = cReportEA & ~127;
	volatile CellGcmReportData *const __restrict pReport = (CellGcmReportData*)&cacheLine[cReportEA & 127];
	//stamp already passed: read the report once and return without waiting
	if(curSwapRSX >= Handle)
	{
		mfc_prep((void*)cacheLine, cReportEAAligned);
		mfc_getllar_again();
		mfc_read_atomic_status();
		rZWriteCount = (uint32)pReport->value;
		return *pCountDeviceThreadLS;
	}
	
	const uint32 cWaitStart = rdtsc();
	//timeout budget in ticks (79800 ticks per ms), computed unsigned
	const uint32 cTimeOutTicks = (uint32)timeOutMs * 79800u;

#if !defined(_NO_SPU_ASSERT) || defined(FORCE_ASSERT)
	uint32 iterCount = 0;
#endif

	do
	{
		mfc_prep((void*)cacheLine, cReportEAAligned);
		mfc_getllar_again();
		mfc_read_atomic_status();
		rZWriteCount = (uint32)pReport->value;
#if !defined(_NO_SPU_ASSERT) || defined(FORCE_ASSERT)
		IF(++iterCount > 4096*128, 0)
		{
			printf("deadlock in cellGcmFlush/cellGcmSyncToRSX: exceeded max iteration count\n");
			SPU_DEBUG_BREAK_ALWAYS;
		}
#endif
		IF(rZWriteCount==INVALID_Z_COUNT, 0)//otherwise while condition cant be true
		{
			//report not written yet: refresh the swap stamp for the loop condition
			mfc_prep((void*)cacheLineSyncMan, crPageInfo.gcmCountDeviceThreadEA);
			mfc_getllar_again();
			mfc_read_atomic_status();
			curSwapRSX = *((volatile uint64*)pSwapRSXLS);//force reload
		}
		//elapsed ticks since the wait started; the unsigned difference is valid across a counter wrap
		IF((uint32)(cWaitStart - rdtsc()) > cTimeOutTicks, 0)
		{
			rZWriteCount = INVALID_Z_COUNT;
			printf("Timeout(%d) in cellGcmSyncToRSX\n",timeOutMs );
			return ~0;
		}
	}
	WHILE(rZWriteCount==INVALID_Z_COUNT && curSwapRSX < Handle, 0);
#if defined(SUPP_SPU_FRAME_STATS)
	if(recordWaitTime)
	{
		if(isResourceSync)
			cellGcmAddRSXStallTicks(cWaitStart, rdtsc());
		else
			cellGcmAddRSXWaitTicks(cWaitStart, rdtsc());
	}
#endif
	return *pCountDeviceThreadLS;
}

void cellGcmCpySyncVertexCode()
{
	const uint32 cStart = rdtsc();
	SyncMemory(g_scDMAVertexCodeTag);
	cellGcmAddPerfTicks1(cStart - rdtsc());
}

void cellGcmSyncUCodeLS()
{
	const uint32 cStart = rdtsc();
	SyncMemory(g_scDMAUCodeTag);
	cellGcmAddPerfTicks0(cStart - rdtsc());
}

//Flushes the locally recorded commands to the PPU command buffer and moves the RSX put pointer.
//Steps:
//	- (asserts enabled) verify that the PPU has not advanced its current pointer in the meantime
//	- append a call command if GCM_CMD_START_OFFSET differs from CELL_GCM_INIT_STATE_OFFSET
//	- write the commands back (HandlePPUCommandBufferChanges)
//	- write the new put pointer to the control register in main memory
//	- when the write position entered another segment, sync to the RSX on the handle of the segment after it
//	- (USE_FIXED_WRAP) after a wrap, reset the PPU current pointer and the put pointer to the buffer start
//	- reset the local context
//The DMA transfers issued on g_scDMAListTag are not waited for here (sync is done outside flush).
void cellGcmFlush()
{
	//inlined: uint32* cellGcmGetControlRegister() (to transfer current rsx put pointer here)
	const uint32 cControlRegEA = ((NSPU::SPageDirInfo*)(void*)G_SPU_PAGE_DIR_INFO)->gcmGlobalPPUControlReg;
	CellGcmSPUData* const __restrict pGcmSPUData = GetGcmSPUData();
#if !defined(_NO_SPU_ASSERT) || defined(FORCE_ASSERT)
	//re-read the PPU context from main memory and compare its current pointer with the local copy
	const uint32 cEA = ((NSPU::SPageDirInfo*)(void*)G_SPU_PAGE_DIR_INFO)->gcmGlobalPPUContext;
	//sizeof/2 uint32 elements = 2*sizeof bytes, matching the transfer size below
	uint32 buf[sizeof(CellGcmLocalContextData)/2] _ALIGN(16);
	MemcpyLS(buf, cEA & ~15, sizeof(CellGcmLocalContextData)*2, g_scMemCpyTempTag);
	CellGcmLocalContextData* testPPU = (CellGcmLocalContextData*)(void*)&((uint8*)buf)[cEA & 15];
	SyncMemory(g_scMemCpyTempTag);
	const uint32 cTestPPUCurrent = (uint32)testPPU->current;
	const uint32 cGlobalPPUContextCurrent = (uint32)pGcmSPUData->pGlobalPPUContext->current;
	if(cTestPPUCurrent != cGlobalPPUContextCurrent)
	{
		const uint32 cWrittenBytes = cTestPPUCurrent - cGlobalPPUContextCurrent;
		printf("cmd buffer corrupted from PPU (PPU current=0x%08x  %d bytes written by PPU):\n   ",cTestPPUCurrent,cWrittenBytes);
		//copy written bytes here to allow investigation:
		//NOTE(review): variable length array sized by cWrittenBytes, which is an unsigned difference and
		//becomes very large if the PPU pointer lies below the local copy - consider bounding the dump
		uint8 writtenBytes[cWrittenBytes + 32] _ALIGN(16);
		MemcpyLS(writtenBytes, cGlobalPPUContextCurrent & ~15, (cWrittenBytes + 32) & ~15, g_scMemCpyTempTag);
		SyncMemory(g_scMemCpyTempTag);
		uint32 *pCurCmd = (uint32*)&writtenBytes[cGlobalPPUContextCurrent & 15];
		for(uint32 i=0; i<(cWrittenBytes/4); ++i)
			printf("0x%08x\n",pCurCmd[i]);
		SPU_DEBUG_BREAK_ALWAYS;
	}
#endif
	//local staging word for the put pointer, placed at the same low 4 address bits as the control register
	uint32 *pControlReg = (uint32*)&((uint8*)pGcmSPUData->RSXPutPtrArea)[cControlRegEA & 15];
	IF(GCM_CMD_START_OFFSET != CELL_GCM_INIT_STATE_OFFSET,1)
		cell::Gcm::UnsafeInline::cellGcmSetCallCommand((CellGcmContextData*)&pGcmSPUData->contextData, (std::uint32_t)GCM_CMD_INJECTION_OFFSET + pGcmSPUData->rsxPushOffset);
	const bool cWrapped = HandlePPUCommandBufferChanges();	//write back command buffer changes, update local ppu context
	//convert new rsx put pointer: implements:	cellGcmAddressToOffset(pGcmSPUData->pGlobalPPUContext->current, pControlReg);
	const uint32 cMainCmdBufStart = (uint32)pGcmSPUData->pGlobalPPUContext->begin;
	const uint32 cAddressBase	= ((NSPU::SPageDirInfo*)(void*)G_SPU_PAGE_DIR_INFO)->gcmCmdAddressBase;
	*pControlReg = ((uint32)pGcmSPUData->pGlobalPPUContext->current - cMainCmdBufStart) + cAddressBase;
	//ps shader transfer is syncd by MFC using barrier on g_scDMAListTag
#if !defined(_NO_SPU_ASSERT) || defined(FORCE_ASSERT)
	//allow the next cellGcmCpyUCodeMain
	pGcmSPUData->psBufferTransferInFlight = 0;
#endif
	//transfer new rsx put pointer back
	MemcpyMainBarrier(cControlRegEA, pControlReg, 4, g_scDMAListTag);
	//implement flush functionality as in CCryDXPSGCMCMBMan::FlushCMDs()
	const uint32 cMainBufStart = cMainCmdBufStart - CELL_GCM_INIT_STATE_OFFSET;
	const uint32 Current =	(uint32)pGcmSPUData->pGlobalPPUContext->current - cMainBufStart;
	const uint32 CurrentSeg	=	Segment(Current);
	const uint32 cLastUsedSegment = pGcmSPUData->lastUsedSegment;
	if(CurrentSeg != cLastUsedSegment)	//current crosses segments?
	{
		//position one segment ahead, wrapping to the start offset at the end of the buffer
		const uint32 Next				=	(Current+GCM_CMD_SEGMENTSIZE>=GCM_CMD_SIZE)?GCM_CMD_START_OFFSET:Current+GCM_CMD_SEGMENTSIZE;
		const uint32 NextSeg		=	Segment(Next);
		//report value is not needed here
		uint32 zWriteCount;
		pGcmSPUData->SegmentHandle[cLastUsedSegment] = cellGcmSyncToRSX(pGcmSPUData->SegmentHandle[NextSeg], zWriteCount, true, false);//set most recent handle for last segment
		pGcmSPUData->lastUsedSegment = CurrentSeg;
	}
#ifdef USE_FIXED_WRAP
	IF(cWrapped, 0)
	{
		const uint32 cEA = ((NSPU::SPageDirInfo*)(void*)G_SPU_PAGE_DIR_INFO)->gcmGlobalPPUContext;
		CellGcmContextData *const pGlobPPUContext = pGcmSPUData->pGlobalPPUContext;
		//update gCellGcmCurrentContext->current on ppu
		pGlobPPUContext->current = (uint32*)((uint32)pGlobPPUContext->begin+GCM_CMD_START_OFFSET-CELL_GCM_INIT_STATE_OFFSET);
		MemcpyMainBarrier
		(
			cEA + offsetof(CellGcmLocalContextData, current), 
			(NSPU::TAddrLS)((uint8*)pGlobPPUContext + offsetof(CellGcmLocalContextData, current)), 
			4, g_scDMAListTag
		);
		//set put ptr on rsx
		*pControlReg = cAddressBase+GCM_CMD_START_OFFSET-CELL_GCM_INIT_STATE_OFFSET;
		MemcpyMainFenced(cControlRegEA, pControlReg, 4, g_scDMAListTag);
	}
#endif
	ResetLocalGcmContext();//reset local gcm context 
	//sync is done outside flush
//#if !defined(_NO_SPU_ASSERT) || defined(FORCE_ASSERT)
//	SyncMemory(g_scDMAListTag);//sync fenced transfer
//#endif
}

//Records the fixed 19 word (76 byte) flip command sequence into the local command buffer.
//Only the second word depends on the input, it carries cBufID.
//No space check is done here. Always returns CELL_OK.
int32_t cellGcmSetFlip(const uint8_t cBufID)
{
	CellGcmSPUData* const __restrict pData = GetGcmSPUData();
	uint32* pWrite = pData->contextData.current;
	//write the hexcodes required for flip
	*pWrite++ = 0x0004e944;
	*pWrite++ = cBufID;
	*pWrite++ = 0x00040060;
	*pWrite++ = 0x56616661;
	*pWrite++ = 0x00040064;
	*pWrite++ = 0x00000030;
	*pWrite++ = 0x0004006c;
	*pWrite++ = 0x00000000;
	*pWrite++ = 0x00040064;
	*pWrite++ = 0x00000030;
	*pWrite++ = 0x00040068;
	*pWrite++ = 0x00000001;
	*pWrite++ = 0x00000002;
	*pWrite++ = 0x00040064;
	*pWrite++ = 0x00000010;
	*pWrite++ = 0x0004006c;
	*pWrite++ = 0xffffffff;
	*pWrite++ = 0x0004e924;
	*pWrite++ = 0x8000001f;
	//pWrite now points behind the 19th word
	pData->contextData.current = pWrite;
	return CELL_OK;
}

#undef CELL_GCM_CURRENT
#define CELL_GCM_CURRENT pCurCmdBuffer
//Records the commands that load a vertex program into the local command buffer.
//	prog	- Cg binary program header; its program member is the byte offset to the CgBinaryVertexProgram
//	ucode	- vertex program microcode, 4 words per instruction
//The CELL_GCM_* method macros write through CELL_GCM_CURRENT, which is defined as the local
//pCurCmdBuffer directly above this function, so that local must keep its name.
//No space check is done in this function itself.
void cellGcmSetVertexProgram(const uint32_t* prog, const void * __restrict ucode)
{
	CellGcmSPUData* const __restrict pGcmSPUData = GetGcmSPUData();
	uint32 *__restrict pCurCmdBuffer = pGcmSPUData->contextData.current;
	const CgBinaryProgram *vs = (const CgBinaryProgram*) prog;
	CgBinaryVertexProgram *binaryVertexProgram = (CgBinaryVertexProgram*)((uint32_t*)prog + (vs->program >> 2));
	const uint32_t *rawData = (const uint32_t*)ucode;
	uint32_t instCount = binaryVertexProgram->instructionCount;
	uint32_t instIndex = binaryVertexProgram->instructionSlot;
	//full batches of 8 instructions (32 words) and the number of words left over
	uint32_t div = instCount / 8;
	uint32_t res = (instCount % 8) * 4;
	CELL_GCM_METHOD_SET_TRANSFORM_PROGRAM_LOAD_START(CELL_GCM_CURRENT, instIndex, instIndex);
	uint32_t i, j;
	for (i = 0; i < div; i++)
	{
		//one method header followed by 32 words of microcode
		CELL_GCM_CURRENT[0] = CELL_GCM_METHOD(CELL_GCM_NV4097_SET_TRANSFORM_PROGRAM, 32);
		CELL_GCM_MEMCPY(&CELL_GCM_CURRENT[1], &rawData[0], sizeof(uint32_t)*32);
		CELL_GCM_CURRENT += (1 + 32);
		rawData += 32;
	}
	if (res > 0)
	{
		//remaining words are copied one by one through CELL_GCM_ENDIAN_SWAP
		CELL_GCM_CURRENT[0] = CELL_GCM_METHOD(CELL_GCM_NV4097_SET_TRANSFORM_PROGRAM, res);
		for (j = 0; j < res; j++)
			CELL_GCM_CURRENT[j+1] = CELL_GCM_ENDIAN_SWAP(rawData[j]);
		CELL_GCM_CURRENT += (1 + res);
	}
	CELL_GCM_METHOD_SET_VERTEX_ATTRIB_INPUT_MASK(CELL_GCM_CURRENT, binaryVertexProgram->attributeInputMask);
	//second timeout argument is 32 for programs using up to 32 registers, 48 otherwise
	CELL_GCM_METHOD_SET_TRANSFORM_TIMEOUT(CELL_GCM_CURRENT, 0xFFFF, (binaryVertexProgram->registerCount <= 32)?32:48);
	//publish the advanced write position
	pGcmSPUData->contextData.current = pCurCmdBuffer;
}

//Records the commands that set up the render surface described by surface into the local command buffer.
//	surface			- surface description (color/depth locations, formats, pitches, offsets, size, position)
//	origin			- passed through to the shader window command
//	pixelCenter	- passed through to the shader window command
//The CELL_GCM_* method macros write through CELL_GCM_CURRENT, which is defined as the local
//pCurCmdBuffer above, so that local must keep its name.
//No space check is done in this function itself.
void cellGcmSetSurfaceWindow(const CellGcmSurface *surface, const uint32_t origin, const uint32_t pixelCenter)
{
	CellGcmSPUData* const __restrict pGcmSPUData = GetGcmSPUData();
	uint32 *__restrict pCurCmdBuffer = pGcmSPUData->contextData.current;
	CELL_GCM_METHOD_SET_CONTEXT_DMA_COLOR_A(CELL_GCM_CURRENT,	CELL_GCM_CONTEXT_DMA_MEMORY_FRAME_BUFFER+surface->colorLocation[0]);
	CELL_GCM_METHOD_SET_CONTEXT_DMA_COLOR_B(CELL_GCM_CURRENT,	CELL_GCM_CONTEXT_DMA_MEMORY_FRAME_BUFFER+surface->colorLocation[1]);
	CELL_GCM_METHOD_SET_CONTEXT_DMA_COLOR_C_D(CELL_GCM_CURRENT,	CELL_GCM_CONTEXT_DMA_MEMORY_FRAME_BUFFER+surface->colorLocation[2],	CELL_GCM_CONTEXT_DMA_MEMORY_FRAME_BUFFER+surface->colorLocation[3]);
	CELL_GCM_METHOD_SET_CONTEXT_DMA_Z(CELL_GCM_CURRENT,	CELL_GCM_CONTEXT_DMA_MEMORY_FRAME_BUFFER+surface->depthLocation);
	const uint32_t cWidth = (uint32_t)surface->width;
	const uint32_t cHeight = (uint32_t)surface->height;
	//31 minus the number of leading zero bits: index of the highest set bit of width/height
	uint32_t log2Width  = 31 - spu_extract(spu_cntlz(spu_promote(cWidth, 0)), 0);
	uint32_t log2Height = 31 - spu_extract(spu_cntlz(spu_promote(cHeight, 0)), 0);
	CELL_GCM_METHOD_SET_SURFACE_FORMAT_PITCH_A_B_OFFSET_A_B_Z(CELL_GCM_CURRENT, surface->colorFormat, surface->depthFormat, surface->antialias, surface->type, log2Width, log2Height,
		surface->colorPitch[0],
		surface->colorOffset[0],
		surface->depthOffset,
		surface->colorOffset[1],
		surface->colorPitch[1]);
	CELL_GCM_METHOD_SET_SURFACE_PITCH_Z(CELL_GCM_CURRENT, surface->depthPitch);
	CELL_GCM_METHOD_SET_SURFACE_PITCH_C_D_OFFSET_C_D(CELL_GCM_CURRENT, surface->colorPitch[2],surface->colorPitch[3],surface->colorOffset[2],surface->colorOffset[3]);
	CELL_GCM_METHOD_SET_SURFACE_COLOR_TARGET(CELL_GCM_CURRENT, CELL_GCM_COMMAND_CAST(surface->colorTarget));
	CELL_GCM_METHOD_SET_WINDOW_OFFSET(CELL_GCM_CURRENT, surface->x, surface->y);
	CELL_GCM_METHOD_SET_SURFACE_CLIP_HORIZONTAL_VERTICAL(CELL_GCM_CURRENT, surface->x, surface->width,surface->y, surface->height);
	//height is reduced by one when bit 12 (0x1000) of the height is set
	CELL_GCM_METHOD_SET_SHADER_WINDOW(CELL_GCM_CURRENT,surface->height - (((surface->height) & 0x1000) >> 12), origin, pixelCenter);
	//publish the advanced write position
	pGcmSPUData->contextData.current = pCurCmdBuffer;
}

//Records the format and offset commands of one vertex data array (4 words) into the local command buffer.
//	index											- vertex attribute index, selects the method registers (index * 4)
//	frequency, stride, size, type	- packed into the format word
//	location, offset					- packed into the offset word (location in bit 31)
//The 4 words are written with two vector stores: the write position is in general not 16 byte aligned,
//so the words are shifted into place and merged with the existing contents of the two quadwords touched.
//No space check is done in this function itself.
void cellGcmSetVertexDataArray(uint8_t index, uint16_t frequency, uint8_t stride, uint8_t size, uint8_t type, uint8_t location, uint32_t offset)
{
	CellGcmSPUData* const __restrict pGcmSPUData = GetGcmSPUData();
	uint32 *__restrict pCurCmdBuffer = pGcmSPUData->contextData.current;
	uint32_t *ptr = CELL_GCM_CURRENT;
	uint32_t *nptr = CELL_GCM_CURRENT + 4;
	//byte offset of the write position inside its quadword
	uint32_t ptroffset = (uint32_t)ptr & 0xf;
	//NOTE(review): relies on vector loads/stores ignoring the low 4 address bits - confirm for the target
	vec_uint4 *vptr0 = (vec_uint4*)((uintptr_t)ptr);
	vec_uint4 *vptr1 = (vec_uint4*)((uintptr_t)nptr);
	vec_uint4 dstVec0 = *vptr0;
	vec_uint4 dstVec1 = *vptr1;
	//advance the write position by the 4 words written below
	CELL_GCM_CURRENT = nptr; 
	vec_uint4 src0 = (vec_uint4){CELL_GCM_METHOD(CELL_GCM_NV4097_SET_VERTEX_DATA_ARRAY_FORMAT + (index) * 4, 1),
								 (((frequency) << 16) | ((stride) << 8) | ((size) << 4) | (type)),
								 CELL_GCM_METHOD(CELL_GCM_NV4097_SET_VERTEX_DATA_ARRAY_OFFSET + (index) * 4, 1),
								 (((location) << 31) | (offset))};
	vec_uint4 mask = (vec_uint4)spu_splats(0xffffffff);
	//mask0 selects the bytes from ptroffset to the end of the first quadword
	vec_uint4 mask0 = (vec_uint4)spu_rlmaskqwbyte(mask, -ptroffset);
	//val0: part of the data landing in the first quadword, val1: part landing in the second one
	vec_uint4 val0 = spu_rlmaskqwbyte(src0, -ptroffset);
	vec_uint4 val1 = spu_slqwbyte(src0, 16 - ptroffset);
	//merge with the previous contents so bytes outside the 4 words stay untouched
	*vptr0 = spu_sel(dstVec0, val0, mask0);
	*vptr1 = spu_sel(val1, dstVec1, mask0);
	pGcmSPUData->contextData.current = pCurCmdBuffer;
}

#undef CELL_GCM_CURRENT

#endif //__SPU__
#endif //PS3
