#include "StdAfx.h"
#include "../Layer0/CCryDXPS.hpp"
#include "CCryDXPSRDThread.hpp"

#if !defined(__SPU__)
	//spin lock word defined elsewhere; DXPSVBIssued acquires it via CrySpinLock(&g_FlipSpinLock, 0, 1)
	//  and releases it by writing 0 after the flip has been issued
	extern volatile int g_FlipSpinLock;

//DECLARE_SPU_JOB("MemsetSPULarge", TMemsetJob)

//Destructor: calls Cancel(), then WaitForThread(), then issues a READ_WRITE_BARRIER.
inline CDXPSRDThread::~CDXPSRDThread()
{
	//request cancellation first, then block until WaitForThread() returns
	Cancel();
	WaitForThread();

	//the barrier macro is used without a trailing semicolon here, exactly as before
	READ_WRITE_BARRIER
}

//accessor (defined elsewhere) for the frame profile data that cellGcmGetAndResetRSXWaitTicks writes into
extern NPPU::SFrameProfileRSXData& GetFrameStatsSPUThread();
//32 bit tick accumulators: the cellGcmAdd*Ticks functions below add to them,
//  cellGcmGetAndResetRSXWaitTicks resets all three to 0
static uint32 g_RSXWaitTime		= 0;
static uint32 g_RSXStallTime	= 0;
static uint32 g_DeviceStallTime	= 0;
//written by cellGcmAddRSXStallTicks / cellGcmAddDeviceStallTicks at the index returned by MapThreadIDToIndex
uint64 g_StallRecords[32] = {0};//one record per thread
//maps a thread id to an index used for g_StallRecords (defined elsewhere; its value range is not visible here)
extern uint32 MapThreadIDToIndex(uint32);

void cellGcmAddRSXWaitTicks(const uint64 cStart, const uint64 cEnd)
{
	uint64 diff = (cEnd - cStart);
	g_RSXWaitTime += (uint32)diff;
}

void cellGcmAddRSXStallTicks(const unsigned long long cStart, const unsigned long long cEnd)
{
	uint64 diff = (cEnd - cStart);
	if(g_bProfilerEnabled)
		g_StallRecords[MapThreadIDToIndex((uint32)GetCurrentThreadId())] = diff;
	g_RSXStallTime += (uint32)diff;
}

void cellGcmAddDeviceStallTicks(const unsigned long long cStart, const unsigned long long cEnd)
{
	uint64 diff = (cEnd - cStart);
	if(g_bProfilerEnabled)
		g_StallRecords[MapThreadIDToIndex((uint32)GetCurrentThreadId())] = diff;
	g_DeviceStallTime += (uint32)diff;
}

void cellGcmGetAndResetRSXWaitTicks()
{
	NPPU::SFrameProfileRSXData& rProfData = GetFrameStatsSPUThread();
	const float cInvTB		= 1000.f / 79800.f;//inverse ticks per usec
	rProfData.rsxWaitTime = (uint32)((float)g_RSXWaitTime * cInvTB);
	rProfData.flushTime		= (uint32)((float)g_RSXStallTime * cInvTB);
	g_RSXWaitTime		= 0;
	g_RSXStallTime	= 0;
	g_DeviceStallTime = 0;
//		memset(g_StallRecords, 0, sizeof(g_StallRecords));
}

//	#define DABR_PPU
//when DABR_PPU is defined, RunSPU passes g_DABRPPU to __spu_set_dabr_ppu on every idle-loop iteration
#if defined(DABR_PPU)
	uint32 g_DABRPPU = 0;//use debugger to set dma control check ad hoc
#endif

#if !defined(_RELEASE) // disabled DIP functionality in RELEASE
#include <sys/gpio.h>
//Returns the value of DIP switch 0: bit 0 of the value read via sys_gpio_get
//  for SYS_GPIO_DIP_SWITCH_DEVICE_ID.
//The result of sys_gpio_get is not checked; dipVal is pre-initialised to 0, so 0 is
//  returned if the call leaves it untouched.
static inline uint64 GetDIPValue0()
{
	uint64 dipVal = 0;
	sys_gpio_get(SYS_GPIO_DIP_SWITCH_DEVICE_ID, &dipVal);
	return dipVal&1;
}
#endif // !_RELEASE

static inline bool EnableVSync()
{
#if !defined(_RELEASE) // disabled DIP functionality in RELEASE
	//expect vsync to be false initially
	static const uint64 scInitialDipVal = GetDIPValue0();
	return scInitialDipVal == GetDIPValue0();
#else
	return true;
#endif
}

//GCM report data pointer, only compiled in with CRY_DXPS_PERFORMANCECOUNTING;
//  it is not referenced by the code visible in this part of the file
#if defined(CRY_DXPS_PERFORMANCECOUNTING)
CellGcmReportData*		g_pReport;
#endif

//Flip handler, registered through cellGcmSetFlipHandler in CDXPSRDThread::Init.
//The argument is ignored (original note: it is always one without any meaning, regarding doc sdk220).
static void DXPSFlipIssued(const uint32_t /*unused*/)
{
	//notify the sync layer about the flip
	tdLayer0::Sync().SwapRSX();

	//clear the locked flip target
	g_FlipVars.pFlipLockedTarget = 0;

	//closes the SNTM_FLIPPING marker that DXPSVBIssued starts
	SNSTOPMARKER(SNTM_FLIPPING);
}

//Handler that issues a pending flip. CDXPSRDThread::Init registers it for both
//  cellGcmSetVBlankHandler and cellGcmSetUserHandler.
//  Flag: handler argument; it is compared against CRenderer::CV_r_vsync below and, together with
//        EnableVSync(), decides whether this invocation is allowed to perform the flip.
//Returns early when no flip draw call id is recorded, when the vsync check fails, or when
//  SyncToRSX reports false for the recorded draw call id.
static void DXPSVBIssued(const uint32_t Flag)//NOTE(review): the former remark "param is always one without any meaning" does not apply here, Flag is used below
{
	using namespace CRY_DXPS_GCMNAMESPACE;

	//flip triggered?
	//  flipDrawCallID is set back to TDRES_CREATE(0) at the end of this function once a flip was issued
	if(g_FlipVars.flipDrawCallID==TDRES_CREATE(0))
		return;
	//output argument of SyncToRSX, its value is not used
	uint32 Dummy;
	//proceed only if EnableVSync() equals (CV_r_vsync==Flag) and SyncToRSX succeeds for the flip draw call id
	//  (presumably: the RSX has reached that draw call - confirm against SyncToRSX)
	if(EnableVSync()!=(CRenderer::CV_r_vsync==Flag) || !tdLayer0::Sync().SyncToRSX<false,false>(g_FlipVars.flipDrawCallID,Dummy))
		return;

	SNSTOPMARKER(SNTM_FLIPREQUEST);
	//this marker is stopped again in DXPSFlipIssued
	SNSTARTMARKER(SNTM_FLIPPING,"FlippingInVBIssued");
	
	//there is a pending cellGcmSetPrepareFlip
	//execute on a dummy context to make the rsx control register frame id set correctly (only way to get working from spu)
	if(g_FlipVars.flipModeUsed == 0)
	{
		//temporary stack command buffer, only used as target for cellGcmSetPrepareFlip
		#define DUMMY_CONTEXT_SIZE (128)
		uint32_t dummyArea[DUMMY_CONTEXT_SIZE] _ALIGN(16);
		CellGcmContextData dummyContext = {dummyArea, &dummyArea[DUMMY_CONTEXT_SIZE], dummyArea, NULL};
		const int cFlipBufID = g_FlipVars.flipBufID;
		//failure is only reported, the flip below is attempted regardless
		if(CELL_GCM_ERROR_FAILURE == cellGcmSetPrepareFlip(&dummyContext,cFlipBufID))
			printf("cellGcmSetPrepareFlip failed for ID=%d\n",cFlipBufID);
		#undef DUMMY_CONTEXT_SIZE
	}

	{
		//g_FlipSpinLock is held from here until it is written back to 0 at the end of this scope
		CrySpinLock(&g_FlipSpinLock, 0, 1);		
		//snapshot of the frame id, used for the flip and for the mismatch diagnostic below
		const int32_t cFrameID = g_FlipVars.flipFrameID;
	#if defined(CRY_MM_DEBUG_DEFRAG)
		//NOTE(review): this uses g_FlipLockedTarget while DXPSFlipIssued uses g_FlipVars.pFlipLockedTarget - confirm the name is still valid
		if(CRenderer::CV_r_PS3VMemDefragDebug>0)
			tdLayer0::Memory().DrawDebug(CRenderer::CV_r_PS3VMemDefragDebug,g_FlipLockedTarget);
	#endif
		
		if(CELL_GCM_ERROR_FAILURE == cellGcmSetFlipImmediate(cFrameID))
		{
			//report whether flipFrameID changed since the snapshot, then halt via snPause
			if(g_FlipVars.flipFrameID != cFrameID)
				printf("cellGcmSetFlipImmediate flip frame mismatch(%d != %d)\n",g_FlipVars.flipFrameID,cFrameID);
			printf("cellGcmSetFlipImmediate failed for ID=%d\n",cFrameID);
			snPause();
		}
		//mark the flip as handled (see the early-out at the top of this function)
		g_FlipVars.flipDrawCallID =	TDRES_CREATE(0);
		//barriers are issued before the lock word is written back to 0
		READ_WRITE_BARRIER;
		__sync();
		g_FlipSpinLock = 0;
	}
}

//Initialises the render device thread object:
//  - sets the three command buffer indices (current/put/get) to 0
//  - with CRY_DXPS_DEVICETHREAD in non-_RELEASE builds: sets m_Waiting and starts the thread
//  - installs DXPSFlipIssued as flip handler and DXPSVBIssued as both VBlank and user handler
//  - calls InitPSCache() on the worker
void CDXPSRDThread::Init()
{
	m_CMDBCurrent = 0;
	m_CMDBPut = 0;
	m_CMDBGet = 0;

#if defined(CRY_DXPS_DEVICETHREAD) && !defined(_RELEASE)
	//m_Waiting is set before the thread is started
	m_Waiting = 1;
	Start(0, "DXPSThread", THREAD_PRIORITY_NORMAL, SIMPLE_THREAD_STACK_SIZE_KB*1024);
#endif // CRY_DXPS_DEVICETHREAD && !_RELEASE

	//GCM callbacks; the same function serves the VBlank and the user handler
	cellGcmSetFlipHandler(&DXPSFlipIssued);
	cellGcmSetVBlankHandler(&DXPSVBIssued);
	cellGcmSetUserHandler(&DXPSVBIssued);

	m_Worker.InitPSCache();
}

#if defined(CRY_DXPS_DEVICETHREAD)
//Thread body of the render device thread (PPU side). The whole body is compiled out in _RELEASE builds.
//Loop: wait until a job is available at m_CMDBGet that this thread may execute, hand it to
//  m_Worker.WorkOn and advance m_CMDBGet. The loop ends on an EDXPSJ_EXIT job or when m_Suspend is 3;
//  m_Finished is set to 1 afterwards.
void CDXPSRDThread::Run()
{
#if !defined(_RELEASE)
	CryThreadSetName( -1, "DXPSRenderDevice" );
	CDXPSRDJobSPU* pJob;
	while(true)
	{
		//number of iterations spent waiting for the current job
		volatile uint32 LoopCount=0;
		pJob	=	reinterpret_cast<CDXPSRDJobSPU*>(&m_CMDBuffer[m_CMDBGet]);
		//wait while the queue is empty (get==put) or while the next job is an SPU job and spuEnabled is not -1
		while(m_CMDBGet==m_CMDBPut || (pJob->IsSPUJob() && (gPS3Env->spuEnabled != -1)))
		{
			LoopCount++;
			//the first 512 iterations only spin (else branch below), after that the thread sleeps or blocks
			if(512<LoopCount)
			{
#if defined(CRY_DXPS_DOWNLOADABLE_VMEM)
				//do only omit flushes if spu is not running,
				//	must not alter cmd buffer concurrently
				if(gPS3Env->spuEnabled != 1)
				{
					//do not alter cmd buffer when suspended (main thread sets f.i. tile info)
					//m_Suspend values as handled here: 1 = no flush, 2 = flush now and reset to 0, 3 = leave the wait loop
					uint32 curSuspend = m_Suspend;
					if(curSuspend == 3)
						break;//another exit edge
					if(curSuspend!=1)
					{
						if(curSuspend==2 || 0==(LoopCount&1023))	//~1ms
						{
							//issue a flush job through the worker
							CDXPSRDJDummySPU Job;
							m_Worker.Job(Job,EDXPSJ_FLUSH);
							if(curSuspend == 2)
								m_Suspend = 0;
						}
					}
					sys_timer_usleep(1);
				}
				else
#endif
				//without CRY_DXPS_DOWNLOADABLE_VMEM this block is executed unconditionally
				{
					//re-check the wait condition under m_LockNotify before blocking on the condition variable
					m_LockNotify.Lock();
					if(m_CMDBGet==m_CMDBPut || (pJob->IsSPUJob() && (gPS3Env->spuEnabled != -1)))
					{
						//m_Waiting is set before the wait is entered
						m_Waiting = 1;
						READ_WRITE_BARRIER
						m_Condition.Wait(m_LockNotify);
					}
					m_LockNotify.Unlock();
				}
			}
			else
			{
				//short spin delay
				__db16cycl__
				__db16cycl__
				__db16cycl__
				__db16cycl__
			}
			//m_CMDBGet is read again for the next evaluation of the wait condition
			pJob	=	reinterpret_cast<CDXPSRDJobSPU*>(&m_CMDBuffer[m_CMDBGet]);
		}
		m_Waiting = 0;
		//full type including flag bits; EDXPSJOB_MASK extracts the job id
		const uint32 jobType		= pJob->FullTypeVolatile();
		const uint32 maskedType = jobType&EDXPSJOB_MASK;
		//exit on an explicit exit job or on m_Suspend == 3 (also reached via the break above)
		if((EDXPSJob)maskedType == EDXPSJ_EXIT || m_Suspend == 3)
		{
			NextJob<false>((uint32&)m_CMDBGet,pJob->Size());
			break;//enable possible exit of thread
		}
		//execute the job, then advance the get index past it
		m_Worker.WorkOn(pJob,(EDXPSJob)maskedType);
		NextJob<false>((uint32&)m_CMDBGet,pJob->Size());
	}
  // Done - has exited 
  m_Finished = 1; 
#endif//_RELEASE
}
#endif//CRY_DXPS_DEVICETHREAD
#endif//__SPU__

//shader buffers, compiled for SPU builds and for non-_RELEASE builds;
//  RunSPU passes them to cellGcmInitLocalGcmContext together with LOCAL_PS_BUFFER_SIZE / LOCAL_VS_BUFFER_SIZE,
//  each array is LOCAL_SHADER_GUARD_BUFFER_SIZE bytes larger than the size passed there
#if !defined(_RELEASE) || defined(__SPU__)
	uint8 g_LocalPSBuf[LOCAL_PS_BUFFER_SIZE+LOCAL_SHADER_GUARD_BUFFER_SIZE] _ALIGN(128) SPU_LOCAL;//ps buffer, also used for constant copying in SetVertexShader
	uint8 g_LocalVSBuf[LOCAL_VS_BUFFER_SIZE+LOCAL_SHADER_GUARD_BUFFER_SIZE] _ALIGN(128) SPU_LOCAL;//vs buffer
#endif
#if defined(__SPU__)
	//Reads the put index, the get index and the RSX push offset with a single atomic cache line load.
	//  putEA, getEA, rsxEA: addresses of the three 32 bit values
	//  rPut, rGet, rRSXPushOff: receive the values read
	//Only the 128 byte line containing putEA is loaded; getEA and rsxEA contribute just their
	//  offset within a line, so all three values are assumed to share that line.
	ILINE void FetchPutGet(uint32 putEA, uint32 getEA, uint32 rsxEA, uint32& rPut, uint32& rGet, uint32& rRSXPushOff)
	{
		const uint32 cOffsetMask = 127;//offset within a 128 byte line
		uint8 lineCopy[128] _ALIGN(128);

		//fetch the whole line that holds the put index
		const uint32 cLineEA = putEA & ~cOffsetMask;
		__spu_load_atomic_cacheline(SPU_MAIN_PTR((void*)cLineEA), lineCopy, false);

		//pick the individual values out of the local copy
		const uint32 cPutOff = putEA & cOffsetMask;
		const uint32 cGetOff = getEA & cOffsetMask;
		const uint32 cRSXOff = rsxEA & cOffsetMask;
		rPut        = *(uint32*)&lineCopy[cPutOff];
		rGet        = *(uint32*)&lineCopy[cGetOff];
		rRSXPushOff = *(uint32*)&lineCopy[cRSXOff];
	}

	//number of bytes RunSPU transfers per job into g_JobCopy
	#define JOB_STACK_SIZE_MAX 256
	//two job slots of JOB_STACK_SIZE_MAX bytes each; RunSPU works on one slot while the next job is transferred into the other
	uint8 g_JobCopy[JOB_STACK_SIZE_MAX*2] _ALIGN(128) SPU_LOCAL;
	//worker instance used by RunSPU; filled by memcpy from m_Worker whenever RunSPU's ppuExecuted flag is set
	CDXPSRDWorker g_WorkerLS _ALIGN(128) SPU_LOCAL;
	//context pointer, context data and command buffer handed to cellGcmInitLocalGcmContext at the start of RunSPU
	CellGcmLocalContextData *__restrict thisContext SPU_DOMAIN_LOCAL SPU_LOCAL;
	CellGcmSPUData g_LocalContext SPU_LOCAL;
	uint8 g_LocalCMDBuf[LOCAL_SPU_CMD_BUF_SIZE] _ALIGN(128) SPU_LOCAL;

	//although this function has an exit path, that path should never be taken: the job unregisters itself from the
	//  job queue (see __spu_unregister_job below), so returning would unregister whichever job has since been
	//  assigned the same position in the queue and info block
	//unregistering is necessary, since staying registered would introduce a constant overhead in the PPU JobManager
	//  job queue and leave it with one slot less
	//SPU side job loop of the render device thread.
	//Per iteration of the outer loop:
	//  1. idle loop: fetch put/get/RSX push offset (FetchPutGet) until a job flagged EDXPSJT_SPU is available;
	//     handles the exit job, the forced exit (m_Suspend == 3) and periodic flushes while idle
	//  2. if a non-SPU job was seen since the last execution (ppuExecuted), resync the context and worker from the PPU
	//  3. advance the get index, start the transfer of the following job into the other g_JobCopy slot and
	//     execute the current job with g_WorkerLS
	//  4. for flushing job types write the get index back to main memory and sync the command buffer transfer
	//  5. after a swap job: write the pixel shader cache back and, depending on the compile time switches,
	//     publish frame stats / profiling stats
	//Timing note: rdtsc() values are always subtracted as (earlier - later) and curTicks accumulates
	//  0xFFFFFFFF - jobEndTime after __spu_reset_timer(), so the timer presumably counts down - TODO confirm.
	//Fix over the previous version: printFrameStats and frameTime are only declared with SUPP_SPU_FRAME_STATS,
	//  their uses outside of that guard are now guarded too so the function also compiles without it.
	SPU_ENTRY(DXPSThreadSPU)
	void CDXPSRDThread::RunSPU()
	{
		bool ppuExecuted = true;//force a first init of cmd buffer
		CDXPSRDJobSPU* __restrict pJob;
		uint32 curGet _ALIGN(16) = 0;//local get index, also the source of the 4 byte write-back to m_CMDBGet
		uint32 curPut = 0;
		uint32 jobType = 0;//full type of the job in the current g_JobCopy slot, 0 if none
		int swapped = 0;//set when the job just executed was EDXPSJ_SWAP
	#if defined(DO_SPU_PROFILING)
		int dumpStats = 0;
		int statsStarted = 0;
	#endif
	#if defined(SUPP_SPU_FRAME_STATS)
		uint32 frameTime = 0;
		NPPU::SFrameProfileRSXData frameProfileData;
		uint32 jobStartTime;
		int printFrameStats = 0;//printf version of stats
		uint32 curTicks  = 0;
	#endif

		uint32 jobEndTime = rdtsc();
		cellGcmInitLocalGcmContext(&thisContext, &g_LocalContext, g_LocalCMDBuf, LOCAL_SPU_CMD_BUF_SIZE, g_LocalPSBuf, LOCAL_PS_BUFFER_SIZE, g_LocalVSBuf, LOCAL_VS_BUFFER_SIZE);
		__spu_unregister_job();
		/*volatile */const CDXPSRDJobSPU* __restrict pJobLocal = (CDXPSRDJobSPU*)g_JobCopy;
		uint32 curJobIdx = 0;//index into g_JobCopy
		do
		{
			//idle loop: runs while the queue is empty or the job at hand is not flagged as SPU job
			WHILE(curPut == curGet || !(jobType & (uint32)EDXPSJT_SPU),0)//no volatile lookup
			{
				IF(swapped,0)
				{
					//clean once a frame
					__spu_cleanup_memory();
					swapped = 0;
	#if defined(SUPP_SPU_FRAME_STATS)
					//printFrameStats only exists with SUPP_SPU_FRAME_STATS
					printFrameStats = (gPS3Env->spuDumpProfStats==2)?1:0;
	#endif
				}
				FetchPutGet((uint32)&m_CMDBPut,(uint32)&m_CMDBGet,(uint32)&m_RSXPushOffset,curPut,curGet, g_LocalContext.rsxPushOffset);
	#if defined(DABR_PPU)
				__spu_set_dabr_ppu(g_DABRPPU);
	#endif
				IF(curPut != curGet,1)
				{
					//a job is queued: copy it into the current local slot and inspect its type
					pJob	=	reinterpret_cast<CDXPSRDJobSPU*>(&m_CMDBuffer[curGet]);
	 				__spu_dma_to_ls_no_cache_no_sync((void*)pJobLocal, pJob, JOB_STACK_SIZE_MAX);
 					__spu_sync_dma_no_cache_no_sync();
					jobType		= pJobLocal->FullTypeVolatile();
					//a job without the SPU flag is not executed here; remember it so the state is resynced later
					IF(!(jobType & (uint32)EDXPSJT_SPU), 0)
						ppuExecuted = true;
					//note: the comparison is done on the full (unmasked) type
					IF(jobType == (uint32)EDXPSJ_EXIT, 0)
					{
						//mark the queue as fully consumed and write the get index back before leaving
						curGet = curPut;
						__spu_dma_to_main_no_cache_no_sync(&m_CMDBGet,&curGet,4);//using same dma tag as above
						m_Suspend = 0;
#ifdef _RELEASE
						m_Finished = 1;
#endif
						__spu_sync_dma_no_cache_no_sync();
						return;//enable possible exit of thread, make sure it is flagged as flushing job
					}
				}
				else
				{
					//queue is empty: handle suspend requests and idle flushes
					uint32 curSuspend = m_Suspend;
					IF(curSuspend == 3, 0)//another force exit attempt
					{
						curGet = curPut;
						__spu_dma_to_main_no_cache_no_sync(&m_CMDBGet,&curGet,4);//using same dma tag as above
						m_Suspend = 0;
#ifdef _RELEASE
						m_Finished = 1;
#endif
						__spu_sync_dma_no_cache_no_sync();
						return;//enable possible exit of thread, make sure it is flagged as flushing job
					}
#if defined(CRY_DXPS_DOWNLOADABLE_VMEM)
					const unsigned int curRdtsc = rdtsc();
					//do only issue if spu is enabled, it has ran 1 job and once a ms
					IF(!ppuExecuted && (curSuspend==2 || (jobEndTime - curRdtsc > 80000)), 0)	//1ms
					{
						volatile uint32 *pSpuEnabledVar = (uint32*)&gPS3Env->spuEnabled;
						//no flush while suspended with value 1 or while spuEnabled is 0
						IF(curSuspend!=1 && *pSpuEnabledVar > 0,1)
						{
							CELL_GCM_FLUSH;
							cellGcmSyncTransferToMain();//sync command buffer transfer back (reused command tag)
							if(curSuspend == 2)
								m_Suspend = 0;
						}
						//restart the flush interval
						jobEndTime = curRdtsc;
					}
	#endif
				}
			}
	#if defined(SUPP_SPU_FRAME_STATS)
			jobStartTime = rdtsc();
	#endif
			const uint32 cMaskedType = jobType & EDXPSJOB_MASK;
			IF(ppuExecuted, 0)
			{
				//resync the GCM context, the worker and the last bound shaders from the PPU side
				cellGcmUpdateGlobalPPUContext();
				//copy worker from ppu
				memcpy(&g_WorkerLS, &m_Worker, sizeof(CDXPSRDWorker));
				g_WorkerLS.UpdateShader(m_pLastPixelShader, m_pLastVertexShader);
				ppuExecuted = false;
			}

			const uint32 cSize = pJobLocal->Size();
#ifndef CRY_DXPS_LSCONSTANTS
			//only JOB_STACK_SIZE_MAX bytes of a job are transferred, larger jobs are reported and halt via snPause
			IF(cSize > JOB_STACK_SIZE_MAX, 0){printf("Job(type=%d) exceeds 256 bytes (%d bytes)\n",cMaskedType,cSize);snPause();}
#endif
			//advance the get index and start fetching the following job into the other slot
			//  while the current one is being executed
			NextJob<false>(curGet, cSize);
			pJob	=	reinterpret_cast<CDXPSRDJobSPU*>(&m_CMDBuffer[curGet]);
			curJobIdx = (curJobIdx + 1) & 1;
			__spu_dma_to_ls_no_cache_no_sync(&g_JobCopy[curJobIdx*JOB_STACK_SIZE_MAX], pJob, JOB_STACK_SIZE_MAX);
			g_WorkerLS.WorkOn(SPU_LOCAL_PTR((CDXPSRDJobSPU*)pJobLocal), cMaskedType);
			//from here on pJobLocal refers to the slot holding the following job
			pJobLocal = (CDXPSRDJobSPU*)&g_JobCopy[curJobIdx*JOB_STACK_SIZE_MAX];
			IF(IsFlushingMaskJobType(cMaskedType),0)
			{
				SPU_FRAME_PROFILE_SECTION("CDXPSRDThread::RunSPU_Flush")
				__spu_dma_to_main_no_cache_no_sync(&m_CMDBGet,&curGet,4);//using same dma tag as above
	#if defined(SUPP_SPU_FRAME_STATS)
				const uint32 cFlushStart = rdtsc();
	#endif
				__spu_flush_cache();
				cellGcmSyncTransferToMain();//sync command buffer transfer back (reused command tag)
	#if defined(SUPP_SPU_FRAME_STATS)
				cellGcmAddPerfTicks2(cFlushStart - rdtsc());
	#endif
			}

			//if we have a swap command, write frame stats back and reset timer
			swapped = (cMaskedType == EDXPSJ_SWAP)?1:0;
			//wait for the transfer of the following job before its type is read
			__spu_sync_dma_no_cache_no_sync();
			//curPut is the value from the last FetchPutGet; 0 sends the loop back into the idle loop
			jobType	= (curPut != curGet)?pJobLocal->FullType() : 0;
			jobEndTime = rdtsc();
	#if defined(SUPP_SPU_FRAME_STATS)
			frameTime += jobStartTime - jobEndTime;
	#endif
			IF(swapped, 0)
			{
				//transfer current pixel shader cache back to ppu to enable proper ppu/spu switching
				__spu_dma_to_main_no_cache_no_sync(SPU_MAIN_PTR((void*)m_Worker.PSAddr()), SPU_LOCAL_PTR((void*)g_WorkerLS.PSAddr()), sizeof(CCryDXPSGCMPixelshaderCacheMan));
		#if defined(DO_SPU_PROFILING)
				dumpStats |= gPS3Env->spuDumpProfStats;//ensure it suffices if it is set once
		#endif
		#if defined(SUPP_SPU_FRAME_STATS)
//				IF(gPS3Env->profileEnabled,0)
				{
					//convert to usec
					const float cInvTB	= 1000.f / 79800.f;//inverse ticks per usec
					const uint32 cRSXWaitTime		= cellGcmGetAndResetRSXWaitTicks();
					const uint32 cRSXPerfTime0	= cellGcmGetAndResetPerfTicks0();
					const uint32 cRSXPerfTime1	= cellGcmGetAndResetPerfTicks1();
					const uint32 cRSXPerfTime2	= cellGcmGetAndResetPerfTicks2();
					const uint32 cRSXPerfTime3	= cellGcmGetAndResetPerfTicks3();
	//				frameTime -= cRSXWaitTime;
					__spu_transfer_frame_stats(frameTime);
					frameProfileData.frameTime				= (uint32)((float)frameTime     * cInvTB);
					frameProfileData.rsxWaitTime			= (uint32)((float)cRSXWaitTime  * cInvTB);
					frameProfileData.psTime						= (uint32)((float)cRSXPerfTime0 * cInvTB);
					frameProfileData.vsTime						= (uint32)((float)cRSXPerfTime1 * cInvTB);
					frameProfileData.flushTime				= (uint32)((float)cRSXPerfTime2 * cInvTB);
					frameProfileData.inputLayoutTime	= (uint32)((float)cRSXPerfTime3 * cInvTB);
					//copy back unconditional, will be finished til next swap
					__spu_dma_to_main_no_cache_no_sync((void*)&m_FrameStatsSPU, (void*)&frameProfileData, sizeof(NPPU::SFrameProfileRSXData));
					curTicks += 0xFFFFFFFFU - jobEndTime;
					//now handle printf version of stats
					if(printFrameStats && curTicks > 160000000/*ca 2 secs*/)
					{
						const float cConvFactor = 1.f/1000.f;
						float frameTimeSPU			= (float)frameProfileData.frameTime * cConvFactor;
						float frameTimeWaitRSX	= (float)frameProfileData.rsxWaitTime * cConvFactor;
						float frameTimePerfPS		= (float)frameProfileData.psTime * cConvFactor;
						float frameTimePerfVS		= (float)frameProfileData.vsTime * cConvFactor;
						float frameTimeFlush		= (float)frameProfileData.flushTime * cConvFactor;
						float frameTimeIL				= (float)frameProfileData.inputLayoutTime * cConvFactor;

						printf("rsx/ps/vs/fl/il: %2.1f / %2.1f / %2.1f / %2.1f / %2.1f\n",
							frameTimeWaitRSX, frameTimePerfPS, frameTimePerfVS, frameTimeFlush, frameTimeIL);
						curTicks = 0;
					}
					__spu_transfer_func_prof_stats();
				}
		#endif
		#if defined(DO_SPU_PROFILING)
				if(statsStarted)
				{
					__spu_dump_prof_stats();//dump and reset stats
					statsStarted = 0;
				}
				else
				if(dumpStats == 1)
					statsStarted = 1;//mark start of stats capturing (full frame)
				__spu_reset_prof_stats();//always reset for full frame
				dumpStats = 0;
		#endif
		#if defined(SUPP_SPU_FRAME_STATS)
				//frameTime only exists with SUPP_SPU_FRAME_STATS
				frameTime = 0;
		#endif
			}
			__spu_reset_timer();
			jobEndTime = rdtsc();//make sure it is updated as it is fetched inside get/put idle loop
		}
		WHILE(1,1);
	}
#endif //__SPU__
