/*
	relocatable and on demand linkable version of SPU side implementation of Run
	Since the cache is not written back before FlushCacheComplete, either call FlushCacheRange on the memory locations that matter
		or pass true for cEnableAtParentExit in RunSPUJob (the job is then spawned just after the cache of the parent job has been flushed)
	
	only bubble mode is supported
	no dependent jobs are supported
	no queue is supported
	no profiling is supported
*/ 

#if defined(PS3)
#if defined(__SPU__)

#if !defined(eCryModule)
	#define eCryModule eCryM_Launcher
#endif
#include <CryModuleDefs.h>
#include <platform.h>
#include <IJobManSPU.h>
#include "../Memory.h"
#include "../SPUUtilities.h"
#include "../Cache/Cache_spu.h"
#include "JobStructs.h"
#include "../PPU/SPUJobBase.h"


using NSPU::NDriver::SInfoBlock;
using NPPU::SQueueNodeSPU;
using NPPU::CCommonDMABase;
using NPPU::CSPUPacketBase;
using NSPU::MemcpyMainFenced;

namespace
{
	// Inserts the value 1 into word JOB_SPAWN_STATE_WORD of the vector stored at G_SPU_JOB_SPAWN_REG.
	// According to the original note this keeps the parent job from calling any callback
	// and from setting the external job state.
	ILINE void DisableParentJobState()
	{
		vec_uint4 *const __restrict pSpawnReg = (vec_uint4*)(void*)G_SPU_JOB_SPAWN_REG;
		const vec_uint4 cPrevious = *pSpawnReg;//keep all other words untouched
		*pSpawnReg = spu_insert(1, cPrevious, JOB_SPAWN_STATE_WORD);
	}

	// Acquires the spin lock stored at cSpinLockEA and does not return before it has been obtained.
	//	cSpinLockEA: effective address of the lock line, used for the load-with-reservation
	//	cLockAreaLS: local store address of the buffer the lock line is loaded into; its first int is the lock word
	// The lock word counts as free while it is 0 and is set to 1 through a conditional store to take it.
	// Lock line events are enabled for the duration of the call and disabled again before returning.
	ILINE void Lock(const uint32 cSpinLockEA, const uint32 cLockAreaLS)
	{
		SPUSyncAtomicDCache();//project helper, defined elsewhere; invoked before the lock line is touched
		const uint32 cEAAddr = cSpinLockEA;
		uint32 llEvent;
#if !defined(FAST_UNSAFE_LL_ENABLE)
		spu_write_event_mask(0);//discard previous (or phantom) events, as needed
		IF(spu_readchcnt(MFC_RD_EVENT_STATUS) != 0, false)
		{
			//an event is still pending: read and acknowledge it so it cannot be mistaken for a lost reservation below
			llEvent = spu_readch(MFC_RD_EVENT_STATUS);
			spu_writech(MFC_WR_EVENT_ACK, llEvent);
		}
#endif//FAST_UNSAFE_LL_ENABLE
		spu_write_event_mask(MFC_LLAR_LOST_EVENT);//enable MFC_LLAR_LOST_EVENT
		//set up the load-with-reservation once, the loop below only re-issues it
		mfc_getllar_prep((volatile void*)cLockAreaLS, cEAAddr);
		volatile int * const pLSVal = (volatile int*)(void*)cLockAreaLS;//lock word inside the loaded line
		do
		{
Looping://compiler workaround
			//load the lock line with reservation and wait for the command to complete
			mfc_getllar_again();
			mfc_read_atomic_status();
			//		spu_dsync();
			IF(*pLSVal != 0, false)
			{
				//lock is held by someone else:
				//wait for any write to the reserved cache line, snoop on a write to push
				llEvent = spu_readch(MFC_RD_EVENT_STATUS);
				spu_writech(MFC_WR_EVENT_ACK, MFC_LLAR_LOST_EVENT);
				goto Looping;//reload the line and check the lock word again
			}
			//lock word is 0: try to claim it with a conditional store
			*pLSVal = 1;
			mfc_putllc_again();
		}
		WHILE(mfc_read_atomic_status() != 0, false);//non zero status: conditional store did not succeed, retry

		spu_write_event_mask(0);//disable lock line events
#if !defined(FAST_UNSAFE_LL_ENABLE)
		IF(spu_readchcnt(MFC_RD_EVENT_STATUS) != 0, false)
		{
			//drain an event which was raised while the mask was still enabled
			llEvent = spu_readch(MFC_RD_EVENT_STATUS);
			spu_writech(MFC_WR_EVENT_ACK, llEvent);
		}
#endif//FAST_UNSAFE_LL_ENABLE
	}

	void GetSPUJobSlot
	(
		const uint32 cJobQueueEA,
		uint32& __restrict rJobSlot, 
		uint32& __restrict rNextPush, 
		NSPU::NDriver::SJobQueuePos& rJobQueuePush,
		SInfoBlock& rInfoBlockLS
	)
	{
		uint32 curPush = rJobQueuePush.curAddr;
		while(1)//continue til we found an empty job slot
		{
			rNextPush = curPush + NSPU::NDriver::scSizeOfSJobQueueEntry;
			rNextPush = (rNextPush == rJobQueuePush.topAddr)?rJobQueuePush.baseAddr : rNextPush;
			//now check if job in push job slot has been finished, if not, increment push address and mark as not to be pulled next time
			rJobSlot = (uint32)((curPush - rJobQueuePush.baseAddr) >> NSPU::NDriver::scSizeOfSJobQueueEntryShift);
			assert(rJobSlot < NPPU::scMaxWorkQueueJobs);
			//transfer rInfoBlockLS here (&m_SPUJobQueue.jobInfoBlocks[rJobSlot])
			const uint32 cInfoBlockEA = 
				cJobQueueEA + (rJobSlot << NSPU::NDriver::scSizeOfSJobQueueEntryShift) + 
				NPPU::scJobInfoBlocksPullAddressDiff + NPPU::scJobInfoPushPullAddressDiff;
			si_wrch(MFC_LSA,si_from_ptr(&rInfoBlockLS));
			si_wrch(MFC_EAL,si_from_uint(cInfoBlockEA));
			si_wrch(MFC_Size,si_from_uint(128));
			si_wrch(MFC_TagID,si_from_uint(0));
			si_wrch(MFC_Cmd,si_from_uint(MFC_GET_CMD));
			//sync transfer
			spu_writech(MFC_WrTagMask, 1<<0);
			spu_writech(MFC_WrTagUpdate, MFC_TAG_UPDATE_ALL); 
			spu_readch(MFC_RdTagStat);
			IF(!rInfoBlockLS.jobState.IsRunning(), true)
				return;
			rInfoBlockLS.SetFetchable(false);//not to be fetched this round (still running)
			curPush += NSPU::NDriver::scSizeOfSJobQueueEntry;
			curPush = (curPush == rJobQueuePush.topAddr)?rJobQueuePush.baseAddr : curPush;
		}
	}

	// Fills the parameter area of a job which has no additional packets:
	// copies cJobParamSize bytes of the job's parameter data to pParamAddr and stores the
	// "no packet" marker (SInfoBlock::scNoPacketVal) directly behind them.
	//	cJobParamSize: size of the parameter data in bytes, asserted to be a multiple of 16 and to leave room for the marker
	//	pParamAddr:    destination parameter area
	//	crJob:         job whose parameter data (GetJobParamData) gets copied
	void CreateDMAListSingle
	(
		const uint32 cJobParamSize,
		uint8* const __restrict pParamAddr,
		const NPPU::CSPUJobDel& crJob
	)
	{
		assert((cJobParamSize & 0xF) == 0);
		assert(cJobParamSize < NSPU::NDriver::SInfoBlock::scAvailParamSize - 4/*space for no packet ptr for simplicity*/);
		const CCommonDMABase* const __restrict pPacket = (const CCommonDMABase*)&crJob;
		const uint8* __restrict pSrc = (const uint8*)pPacket->GetJobParamData();
		uint8* pDst = pParamAddr;
		//plain byte copy, memcpy is avoided to save code size
		for(uint32 remaining = cJobParamSize; remaining != 0; --remaining)
			*pDst++ = *pSrc++;
		//pDst points right behind the copied parameters now: signal no more packets
		*(uint32*)pDst = NSPU::NDriver::SInfoBlock::scNoPacketVal;
	}

	void CreateDMAList
	(
		const uint32 cJobParamSize,
		NSPU::NDriver::SInfoBlock& __restrict rInfoBlock,
		const NPPU::CSPUJobDel& crJob,
		volatile const CCommonDMABase* __restrict * __restrict ppPackets,
		const uint32 cPacketCount
	)
	{
		//allocate memory for packets and set packet pointer accordingly 
		uint8* const __restrict pParamAddr = (uint8*)rInfoBlock.GetParamAddress();
		uint8 *__restrict pCurParamDataDest = (uint8*)pParamAddr;
		const CCommonDMABase* __restrict pCurPacketData = (const CCommonDMABase*)&crJob;
		//iterate all packets (main one plus added packets)
		int packet = 0;//packets to add, main packet comes first (<=)
		const uint32 cEndParamArea = (uint32)rInfoBlock.GetParamAddress() + NSPU::NDriver::SInfoBlock::scAvailParamSize;
		while(1)
		{
			assert((uint32)pCurParamDataDest + cJobParamSize <= cEndParamArea);
			//copy param data
			const void* const __restrict cpPacketSrc = pCurPacketData->GetJobParamData();

			uint8* __restrict pCurParamDataDest8	= (uint8*)pCurParamDataDest;
			uint8* __restrict pCurParamDataSrc8		= (uint8*)cpPacketSrc;
			for(int i=0; i<cJobParamSize; ++i)
				pCurParamDataDest8[i] = pCurParamDataSrc8[i];//save code size and do not use memcpy
			pCurParamDataDest += cJobParamSize;
			if(packet == cPacketCount)
			{
				if((uint32)pCurParamDataDest < cEndParamArea)//otherwise a whole packet would get wasted due to 16 byte alignment
					*(uint32*)pCurParamDataDest = NSPU::NDriver::SInfoBlock::scNoPacketVal;//signal no more packets
				break;
			}
			pCurPacketData = (CCommonDMABase* __restrict)ppPackets[packet];
			++packet;
		}//packet loop
	}

	// Adds cPacketCount to the byte counter with index cPacketIndex inside the line at cPacketSyncEA.
	//	cPacketSyncEA: effective address of the packet sync line
	//	cPacketCount:  value added to the counter (8 bit, "to be counted down" by others)
	//	cPacketIndex:  byte index of the counter inside the line; byte 0 is used as lock byte by this function
	// Works in two steps, both through load-with-reservation / conditional store on a local 128 byte copy:
	// first byte 0 is set from 0 to 1 to obtain the lock, then the counter update and the release of
	// byte 0 are written back together.
	ILINE void TransferPacketSyncIndexBack(const uint32 cPacketSyncEA, const uint8 cPacketCount, const uint32 cPacketIndex)
	{
		//this must be done synchronously since other SPUs use same cache line for synchronization
		uint8 spuPacketSyncLS[128] _ALIGN(128);//local copy of the sync line
		uint32 llEvent;
		int status;//result of the last conditional store, 0 means it succeeded
#if !defined(FAST_UNSAFE_LL_ENABLE)
		spu_write_event_mask(0);//discard previous (or phantom) events, as needed
		IF(spu_readchcnt(MFC_RD_EVENT_STATUS) != 0, false)
		{
			//an event is still pending: read and acknowledge it
			llEvent = spu_readch(MFC_RD_EVENT_STATUS);
			spu_writech(MFC_WR_EVENT_ACK, llEvent);
		}
#endif//FAST_UNSAFE_LL_ENABLE
		spu_write_event_mask(MFC_LLAR_LOST_EVENT);//enable MFC_LLAR_LOST_EVENT
		//set up the load-with-reservation once, the loops below only re-issue it
		mfc_getllar_prep(spuPacketSyncLS, cPacketSyncEA);
		//first obtain lock
		do
		{
			status = 1;//stay in the loop unless the conditional store below reports success
			mfc_getllar_again();
			mfc_read_atomic_status();

			IF(spuPacketSyncLS[0] == 0, true)//still unlocked
			{
				//try to claim the lock byte
				spuPacketSyncLS[0] = 1;
				mfc_putllc_again();
				status = mfc_read_atomic_status();
			}
			else
			{
				llEvent = spu_readch(MFC_RD_EVENT_STATUS);
				spu_writech(MFC_WR_EVENT_ACK, MFC_LLAR_LOST_EVENT);//wait for a lost line event on this cache line
				continue;//status is still 1, so the loop condition sends us back to reload the line
			}
		}
		WHILE(status != 0, false);
		//lock is held: update the counter and release the lock with one conditional store
		do
		{
			//reload the line so the update is applied to its current contents
			mfc_getllar_again();
			mfc_read_atomic_status();
			spuPacketSyncLS[0] = 0;//unlock
			spuPacketSyncLS[cPacketIndex] += cPacketCount;//to be counted down	
			mfc_putllc_again();
			status = mfc_read_atomic_status();
		}
		WHILE(status != 0, false);//retry until the conditional store went through
		spu_write_event_mask(0);//disable lock line events
#if !defined(FAST_UNSAFE_LL_ENABLE)
		IF(spu_readchcnt(MFC_RD_EVENT_STATUS) != 0, false)
		{
			//drain an event which was raised while the mask was still enabled
			llEvent = spu_readch(MFC_RD_EVENT_STATUS);
			spu_writech(MFC_WR_EVENT_ACK, llEvent);
		}
#endif//FAST_UNSAFE_LL_ENABLE
	}
}

// Spawns a job from within a running SPU job by writing a new info block (plus one info block per
// SPU packet) into the job queue located at the address stored in the bubble directory info.
//	crJob:               job delegator providing the parameter data and the attached packets / SPU packets
//	cOpMode:             operation mode, forwarded to the info block via SetOpMode
//	cMinStackSizeKB:     forwarded to SInfoBlock::Reset
//	cJobAddress:         main memory address of the job (NBubBin::SJob), must not be 0
//	cEnableAtParentExit: if true, the new push address is handed to SetEnableSPUJobAtParentExit, the push
//	                     pointer is not published and the lock word is not written back by this function;
//	                     if false, the push pointer is published and the lock is released here
//	cJobId:              id stored into the info block
// Returns NPPU::eAJR_Success.
// The callback index and the external job state address of the parent job are taken over by the
// spawned job, the parent job state is disabled at the end (DisableParentJobState).
const NPPU::EAddJobRes RunSPUJob
(
	const NPPU::CSPUJobDel& crJob,
	const uint32 cOpMode,
	const unsigned char cMinStackSizeKB,
	const uint32 cJobAddress,
	const bool cEnableAtParentExit,
	const unsigned short cJobId
)
{
	//dma areas
	uint8 jobDataArea[256] _ALIGN(128);//job might cross 128 byte boundary
	uint8 jobQueuePushArea[128] _ALIGN(128);//LS memory for push
	NSPU::NDriver::SJobQueuePos& rJobQueuePush = *(NSPU::NDriver::SJobQueuePos*)(void*)jobQueuePushArea;
	uint8 dmaLockArea[128] _ALIGN(128);//receives the lock line, see Lock
	uint8 infoBlockArea[128] _ALIGN(128);

	SInfoBlock& infoBlockLS			= *(SInfoBlock*)(void*)infoBlockArea;
	uint32 jobSlot;
	uint32 nextPush;
	const uint32 cJobQueueEA = (((NSPU::SBubbleDirInfo*)(void*)G_SPU_BUB_DIR_INFO)->ppuSyncEA);
	const uint32 cSpinLockEA = cJobQueueEA + sizeof(SQueueNodeSPU);

	NPPU::SJobData callBackData _ALIGN(16);//for callback
	const uint32 cCallBackBaseEA = NPPU::scJobInfoBlocksJobDataSyncAddressDiff + cJobQueueEA;

	//transfer job data here (started before taking the lock, synced further below)
	assert(cJobAddress != 0);
	si_wrch(MFC_LSA,si_from_ptr(jobDataArea));
	si_wrch(MFC_EAL,si_from_uint(cJobAddress & ~127));
	si_wrch(MFC_Size,si_from_uint(256));
	si_wrch(MFC_TagID,si_from_uint(0));
	si_wrch(MFC_Cmd,si_from_uint(MFC_GET_CMD));

	Lock(cSpinLockEA, (uint32)dmaLockArea);

	//transfer m_SPUJobQueue from PPU here
	si_wrch(MFC_LSA,si_from_ptr(jobQueuePushArea));
	si_wrch(MFC_EAL,si_from_uint(cJobQueueEA));
	si_wrch(MFC_Size,si_from_uint(128));
//	si_wrch(MFC_TagID,si_from_uint(0));
	si_wrch(MFC_Cmd,si_from_uint(MFC_GET_CMD));

	const SInfoBlock& __restrict g_sParentInfoBlock = *(SInfoBlock*)(void*)G_SPU_INFO_BLOCK;
	uint8& rCurSpuPacketSyncIndex = *((uint8*)dmaLockArea + 4);//located right behind spin lock

	//init spu packet info
	uint32 packetSyncIndex = SInfoBlock::scNoIndex;
	const uint32 cSpuPacketSyncEA = 
		cJobQueueEA + NPPU::scJobInfoBlocksSPUPacketSyncAddressDiff + NPPU::scJobInfoPushPullAddressDiff;
	const bool cParentIsSyncing = (g_sParentInfoBlock.spuPacketSyncIndex != SInfoBlock::scNoIndex);
	volatile const CCommonDMABase** __restrict ppPackets;
	volatile const CSPUPacketBase** __restrict ppSPUPackets;
	uint32 packetCount = 0, spuPacketCount = 0;

	crJob.GetAllPackets(packetCount, spuPacketCount, ppPackets, ppSPUPackets);//always valid pointer (static array)
	//if parent is syncing, assign the same index and add this instance + spuPacketCount
	const uint32 cSyncPacketCount = cParentIsSyncing?(1+spuPacketCount) : spuPacketCount;

	IF(spuPacketCount > 0, false)
	{
		IF(!cParentIsSyncing, true)
		{
			//take the current sync index and move the shared index on to the next one
			packetSyncIndex = rCurSpuPacketSyncIndex--;
			//since we have index 0 reserved for locking, we need to decrement for branch free behavior
			rCurSpuPacketSyncIndex = (rCurSpuPacketSyncIndex == 0)?SQueueNodeSPU::scSyncMaxIndex : rCurSpuPacketSyncIndex;
		}
		else
			packetSyncIndex = g_sParentInfoBlock.spuPacketSyncIndex;
	}
	else
		packetSyncIndex = cParentIsSyncing? g_sParentInfoBlock.spuPacketSyncIndex : SInfoBlock::scNoIndex;

	//sync transfer (job data and job queue push area)
	spu_writech(MFC_WrTagMask, 1<<0);
	spu_writech(MFC_WrTagUpdate, MFC_TAG_UPDATE_ALL); 
	spu_readch(MFC_RdTagStat);

	GetSPUJobSlot(cJobQueueEA, jobSlot, nextPush, rJobQueuePush, infoBlockLS);

	bool transferActive = true;
	const bool cHasCallback = (g_sParentInfoBlock.callbackIndex != SInfoBlock::scNoIndex);
	IF(cHasCallback, false)
	{
		//start fetching the callback data of the parent job, synced right before it is used
		const uint32 cExistJobSlot = (uint32)g_sParentInfoBlock.callbackIndex;
		const uint32 cJobDataEA = cCallBackBaseEA + cExistJobSlot*sizeof(NPPU::SJobData);
		si_wrch(MFC_LSA,si_from_ptr(&callBackData));
		si_wrch(MFC_EAL,si_from_uint(cJobDataEA));
		si_wrch(MFC_Size,si_from_uint(sizeof(NPPU::SJobData)));
//		si_wrch(MFC_TagID,si_from_uint(0));
		si_wrch(MFC_Cmd,si_from_uint(MFC_GET_CMD));
	}

	//job header inside the 256 byte window fetched from the 128 byte aligned address
	NBubBin::SJob *pJob = (NBubBin::SJob*)&jobDataArea[cJobAddress & 127];
	void* __restrict pJobProgramData	= (void*)cJobAddress;
	const uint32 cJobProgramSize			= ((uint32)pJob->totalJobSize << 2);//stored in multiple of 4 bytes
	assert(pJob->initialBubbles[0] >= 0);
	const uint16 cFirstBubIndex				= (uint16)pJob->initialBubbles[0];
	//we reuse the external job state
	const uint32 cExternalJobState = g_sParentInfoBlock.GetExtJobStateAddress();
	//setup for all spu packets
	const uint32 cOrigParamSize = crJob.GetParamDataSize();
	const uint8 cParamSize	= cOrigParamSize >> 4;//stored in multiple of 16 bytes
	const uint16 cJobSize		= cJobProgramSize >> 2;
	assert((cJobProgramSize & 3) == 0);

	infoBlockLS.Reset(NULL, true, cMinStackSizeKB);
	infoBlockLS.eaDMAJobAddress					= (uint32)pJobProgramData;
	infoBlockLS.jobSize									= cJobSize;
	infoBlockLS.paramSize								= cParamSize;
	infoBlockLS.SetOpMode(cOpMode);
	infoBlockLS.firstBubbleIndex				= cFirstBubIndex;
	infoBlockLS.jobId										= cJobId;
	infoBlockLS.spuPacketSyncIndex			= packetSyncIndex;

	//get DMA transfer entries from job base
	IF(packetCount == 0, true)
		CreateDMAListSingle(cOrigParamSize, infoBlockLS.GetParamAddress(), crJob);
	else
		CreateDMAList(cOrigParamSize, infoBlockLS, crJob, ppPackets, packetCount);

	IF(cExternalJobState, true)
		infoBlockLS.SetExtJobStateAddress(cExternalJobState);

	//register callback
	IF(cHasCallback, false)
	{
		//copy original data from parent job slot
		IF(transferActive, true)
		{
			//sync transfer
			spu_writech(MFC_WrTagMask, 1<<0);
			spu_writech(MFC_WrTagUpdate, MFC_TAG_UPDATE_ALL); 
			spu_readch(MFC_RdTagStat);
			transferActive = false;
		}
		//copy callback data
		const uint32 cCurJobDataEA = cCallBackBaseEA + jobSlot*sizeof(NPPU::SJobData);
		MemcpyMainFenced(cCurJobDataEA, &callBackData, sizeof(NPPU::SJobData), 0);
		infoBlockLS.callbackIndex	= jobSlot;
	}	
	assert(jobSlot < 256);

	rJobQueuePush.curAddr = nextPush;//entry usage is safe til we increment push address so that pull address can fetch last set entry
	const uint32 cParentJobSlot = jobSlot;//slot of the main job, jobSlot gets reused for the SPU packets below

	const uint32 *const pInfoBlockRemaining	= (uint32*)&infoBlockLS.jobId;
	//now set up the spu packet jobs, first one has been made available already
	//dependent jobs can only be attached to the main job

	//one info block per SPU packet, all of them must stay valid until the final sync
	uint8 *pAddInfoBlockMem = (uint8*)alloca(NSPU::NDriver::scSizeOfSJobQueueEntry * spuPacketCount);
	uint32 syncMask = (1<<0);
	for(int i=0; i<spuPacketCount; ++i)
	{
		SInfoBlock& __restrict rAddInfoBlockLS = *(SInfoBlock*)&pAddInfoBlockMem[i << NSPU::NDriver::scSizeOfSJobQueueEntryShift];
		const CSPUPacketBase& __restrict crSPUPacket = (const CSPUPacketBase&)*ppSPUPackets[i];
		GetSPUJobSlot(cJobQueueEA, jobSlot, nextPush, rJobQueuePush, rAddInfoBlockLS);
		rAddInfoBlockLS.eaDMAJobAddress				= (uint32)pJobProgramData;
		rAddInfoBlockLS.eaExtJobStateAddress	= infoBlockLS.eaExtJobStateAddress;
		//perform fast copy op (3 words starting at jobId are taken over from the main info block)
		uint32 *pAddInfoBlockRemaining	= (uint32*)&rAddInfoBlockLS.jobId;
		pAddInfoBlockRemaining[0]				= pInfoBlockRemaining[0];
		pAddInfoBlockRemaining[1]				= pInfoBlockRemaining[1];
		pAddInfoBlockRemaining[2]				= pInfoBlockRemaining[2];

		rAddInfoBlockLS.depJobIndex			= NSPU::NDriver::SInfoBlock::scNoIndex;
		//apply callback, since only one raises it, copy from main job
		IF(cHasCallback, false)
		{
			//copy callback data
			const uint32 cAddJobDataEA = NPPU::scJobInfoBlocksJobDataSyncAddressDiff + cJobQueueEA + jobSlot*sizeof(NPPU::SJobData);
			MemcpyMainFenced(cAddJobDataEA, &callBackData, sizeof(NPPU::SJobData), g_MemCpyTempTag);
			rAddInfoBlockLS.callbackIndex	= jobSlot;
		}
		volatile const CCommonDMABase** __restrict ppAddPackets;
		uint32 addPacketCount;
		crSPUPacket.GetPackets(addPacketCount, ppAddPackets);//always valid pointer (static array)
		const NPPU::CSPUJobDel& __restrict crJobDel = crSPUPacket.m_JobDelegator;
		//the parameters of an SPU packet come from its own job delegator and its own attached packets,
		//so the decision has to be based on addPacketCount and the data on crJobDel
		//(this used to test packetCount and copy from crJob, which handed the main job's parameters
		//to every SPU packet whenever the main job had no additional packets)
		if(addPacketCount == 0)
			CreateDMAListSingle(cOrigParamSize, rAddInfoBlockLS.GetParamAddress(), crJobDel);
		else
			CreateDMAList(cOrigParamSize, rAddInfoBlockLS, crJobDel, ppAddPackets, addPacketCount);

		//mark as fetchable
		rAddInfoBlockLS.SetFetchable(true);

		rJobQueuePush.curAddr = nextPush;
		rAddInfoBlockLS.jobState.running = 1;

		//transfer back
		const uint32 cAddInfoBlockEA = 
			cJobQueueEA + (jobSlot << NSPU::NDriver::scSizeOfSJobQueueEntryShift) + 
			NPPU::scJobInfoBlocksPullAddressDiff + NPPU::scJobInfoPushPullAddressDiff;
		MemcpyMainFenced(cAddInfoBlockEA, &rAddInfoBlockLS, NSPU::NDriver::scSizeOfSJobQueueEntry, g_MemCpyTempTag);
		syncMask = (1<<0) | (1 << g_MemCpyTempTag);//the final sync has to cover the temp tag as well
	}//SPU packet loop

	//transfer back
	const uint32 cInfoBlockEA = cJobQueueEA + (cParentJobSlot << NSPU::NDriver::scSizeOfSJobQueueEntryShift) + 
		NPPU::scJobInfoBlocksPullAddressDiff + NPPU::scJobInfoPushPullAddressDiff;
	MemcpyMainFenced(cInfoBlockEA, &infoBlockLS, 128, 0);
	*(volatile int*)(void*)dmaLockArea = 0;//set unlock value (local copy only, written back below)

	IF(cSyncPacketCount > 0, false)
		TransferPacketSyncIndexBack(cSpuPacketSyncEA, cSyncPacketCount, packetSyncIndex);

	//if job should be enable at parents exit due to flushing of cache, register push pointer and keep lock
	uint32 spinLockSize = 16;
	uint32 spinLockSizeOffset = 0;
	if(cEnableAtParentExit)
	{
		//only write back the 4 bytes behind the lock word (packet sync index), the lock word itself stays as is
		spinLockSize = 4;
		spinLockSizeOffset = 4;
		SetEnableSPUJobAtParentExit(rJobQueuePush.curAddr);
	}
	else
	{
		//important set push pointer last (so that all data are valid once the other SPUs fetch them)
		MemcpyMainFenced(cJobQueueEA, jobQueuePushArea, 128, 0);
	}
	//now unlock
	MemcpyMainFenced(cSpinLockEA+spinLockSizeOffset, (dmaLockArea+spinLockSizeOffset), spinLockSize, 0);

	DisableParentJobState();

	//sync transfer since stack goes out of scope
	spu_writech(MFC_WrTagMask, syncMask);
	spu_writech(MFC_WrTagUpdate, MFC_TAG_UPDATE_ALL); 
	spu_readch(MFC_RdTagStat);

	return NPPU::eAJR_Success;
}

#endif //__SPU__
#endif //PS3
