/*
	SPU-side implementation of the PPU memory callbacks (PPUAlloc_func, PPUCalloc_func, PPUFree_func)
*/ 

#if defined(PS3) && defined(__SPU__)

#if !defined(eCryModule)
	#define eCryModule eCryM_Launcher
#endif
#include <CryModuleDefs.h>
#include <platform.h>
#include <stdio.h>
#include "../Memory.h"
#include "../JobStructs.h"
#include "../SPUMemManagerBase.h"
#include "../SPUMemManager_spu.h"

extern void* Memset__VM(void* pDest, int cVal, const uint32 cSize);

//returns the bucket index for a certain size
inline const unsigned int GetBucketIndex(const int32 cVal)
{
	const int32 cClampedVal = CondSelMax(cVal, 32);
	return (32-NSPU::SBucketInfo::scBucketSizeMinLog2) - spu_extract(spu_cntlz(spu_promote(cClampedVal-1, 0)), 0);
}

//gives access to the SPU memory manager object located at G_SPU_MEM_MAN
inline NSPU::CSPUMemMan& GetMemMan()
{
	NSPU::CSPUMemMan* const pMemMan = (NSPU::CSPUMemMan*)G_SPU_MEM_MAN;
	return *pMemMan;
}


//cache lookup which tries to reuse the result of a previous lookup
//	cpSPUPtrExist: pointer obtained from the previous lookup
//	cpPPUPtrExist: address the previous lookup was made for
//	cpPtr:         address to look up now
//if cpPtr and cpPPUPtrExist share the same cache line, the line of cpSPUPtrExist is reused and only the
//offset within the line is applied, otherwise a regular __spu_cache_lookup is performed for cpPtr
inline void* cache_lookup_ex
(
	const void* const __restrict cpSPUPtrExist, 
	const void* const __restrict cpPPUPtrExist,
	const void* const __restrict cpPtr
)
{
	const unsigned int cRequestedEA   = (unsigned int)cpPtr;
	const unsigned int cRequestedLine = cRequestedEA & ~scSPUCacheLineSizeMask;
	const unsigned int cKnownLine     = (unsigned int)cpPPUPtrExist & ~scSPUCacheLineSizeMask;
	IF(cRequestedLine == cKnownLine, true)//expected case: same line as the existing lookup
	{
		const unsigned int cExistingLineBase = (unsigned int)cpSPUPtrExist & ~scSPUCacheLineSizeMask;
		const unsigned int cOffsetInLine     = cRequestedEA & scSPUCacheLineSizeMask;
		return (void*)(cExistingLineBase + cOffsetInLine);
	}
	return __spu_cache_lookup((unsigned int)cpPtr, 1, 128);
}

//alloc entry for PPU side allocations
//	rpPtr: receives the address of the allocated block, NULL if cSize is 0
//	       on the non bucket path the incoming value of rpPtr is sent to the PPU as request address and selects
//	       between eMR_ReAlloc and eMR_Alloc (presumably non zero requests a reallocation - confirm against callers)
//	cSize: requested size in bytes
//the request is served from the first bucket at or above GetBucketIndex(cSize) which still has available entries,
//if there is none, a memory request is transferred to the PPU and the result is read from SPU_RdSigNotify1
void PPUAlloc_func(void*& rpPtr, const size_t cSize)
{
	IF( !cSize, false ) // dont allocate buckets for size == 0
	{
		rpPtr = NULL;
		return;
	}

	using NSPU::SBucket;
	using NSPU::SBucketHeader;
	using NSPU::SBucketDir;
#if defined(DO_SPU_PROFILING)
	((NSPU::NDriver::SJobPerfStats*)(void*)G_SPU_CACHE_PROF_PERF_STAT_ADDR)->allocSize += cSize;
#endif
	//determine if the allocation can be handled by the bucket system
	unsigned int bucketIndex = GetBucketIndex(cSize);
	//find an available bucket, skipping buckets without available entries
	WHILE
	(
		bucketIndex < NSPU::SBucketInfo::scBucketCount && 
		GetMemMan().GetBucketHeader(bucketIndex).available == 0, false
	){++bucketIndex;}

#if defined(DO_SPU_PROFILING)
	g_ProfID = (vec_uint4)PROF_ID_PPU_ALLOC;
#endif

	if(bucketIndex < NSPU::SBucketInfo::scBucketCount)
	{
		//handled by bucket allocator
		SBucket& rBucket = GetMemMan().GetBucketHeader(bucketIndex);
		const unsigned int cBucketHeaderEA = (unsigned int)rBucket.pBucketHeader;
		SBucketHeader* pLocHeader = (SBucketHeader*)__spu_cache_lookup_miss(cBucketHeaderEA, 1, 128);//make cache lookup, always at most one line
		/* idea:
		- get the first index of the available buckets
		- change the index to the new first available one from the one pointed to from this one
		- turn the directory index at the retrieved index into the free list and set the last index to it
		- make the previous last directory index point to the new one
		*/
		//the block link list starts right behind the header, the directory follows after numTotal list bytes
		uint8* pBucketList = (uint8*)((uint8*)pLocHeader + SBucketHeader::scBucketHeaderSize);
		const unsigned int cDirAddress = cBucketHeaderEA + SBucketHeader::scBucketHeaderSize + rBucket.numTotal;

#if defined(DO_SPU_PROFILING)
		((NSPU::NDriver::SJobPerfStats*)(void*)G_SPU_CACHE_PROF_PERF_STAT_ADDR)->totalBucketAllocSize += rBucket.size;
#endif
		const uint8 cCurIndex = pLocHeader->listIndex;
		//directory entries are addressed with a stride of 4 bytes
		const unsigned int cPPUAddressDirEntry = cDirAddress + (cCurIndex << 2);
		SBucketDir *pDirEntry = (SBucketDir*)__spu_cache_lookup_miss(cPPUAddressDirEntry, 1, 128);		//get from PPU, can be multiple lines
		const unsigned int cRetrievedAddress = pDirEntry->address;
		rpPtr = (void*)cRetrievedAddress;//obtain address and assign as void pointer to the memory block
		pLocHeader->listIndex = pBucketList[cCurIndex];//update first available block index
		const uint8 cLastDirEndIndex = pLocHeader->dirIndexEnd;//save previous last index into free directory list
		pLocHeader->dirIndexEnd = cCurIndex;//set new last index to current one
		//if we had no index before, assign first index
		pLocHeader->dirIndex = CondSelEq(pLocHeader->dirIndex, BUCKET_NULL, cCurIndex, pLocHeader->dirIndex);
		//if we had an index before, link the previous last index to current one
		if(cLastDirEndIndex != BUCKET_NULL)
		{
			//likely to be in the same cache line
			SBucketDir *pDirListLinkedEntry = 
				(SBucketDir*)cache_lookup_ex
				(
					(void*)pDirEntry,
					(void*)cPPUAddressDirEntry,
					(void*)(cDirAddress + (cLastDirEndIndex << 2))
				);
			pDirListLinkedEntry->SetLinkIndex(cCurIndex);//link to current one, also resets old value
		}
		//the current entry is the new end of the free directory list
		pDirEntry->SetLinkIndex(BUCKET_NULL);

#if defined(DO_SPU_PROFILING)
		++((NSPU::NDriver::SJobPerfStats*)(void*)G_SPU_CACHE_PROF_PERF_STAT_ADDR)->allocsBucket;
#endif
		//write into allocation history table
		GetMemMan().AddHistoryEntry(cRetrievedAddress, bucketIndex);
		--rBucket.available;//written back by DMA
	}
	else
	{
		//no bucket available, issue a memory request to the PPU
		const unsigned int cEA = (unsigned int)rpPtr;
		NPPU::SPPUMemRequestData memRequest _ALIGN(128);
		memRequest.type = (NPPU::PPUMemRequestType)CondSelNEZ((cEA != 0), (int)NPPU::eMR_ReAlloc, (int)NPPU::eMR_Alloc);
		memRequest.address = cEA;
		uint32 sizePow2 = (uint32)cSize;
		//sizes above 128 are rounded up to a multiple of 128, smaller sizes to the next power of 2
		if(cSize > 128)
			sizePow2 = (sizePow2+127)&~127;//128 byte alignment biggest provided
		else
		{
			//largest power of 2 not above the size, doubled if the size is not already a power of 2
			sizePow2 = 1<<(31-spu_extract(spu_cntlz(spu_promote(sizePow2,0)),0));
			sizePow2 = (sizePow2==(uint32)cSize)?(uint32)cSize:(sizePow2<<1);
		}
		memRequest.size = sizePow2;
		memRequest.valid = 1;
		//copy data request to PPU and wait for the transfer to complete before the interrupt is raised
		MemcpyMain(*(unsigned int*)G_SPU_DEST_MEM_AREA, (NSPU::TAddrLS)(&memRequest), NPPU::SPPUMemRequestData::scPPUMemRequestDataTransferSize, g_scDMAPPUMemTag);
		SyncMemory(g_scDMAPPUMemTag);
		//signal memory request
		spu_writech(SPU_WrOutIntrMbox, (EVENT_MEMCHANGE_PORT << EVENT_PORT_SHIFT));
		//check if somehow the cache is to flush, will be done way earlier than interrupt thread is actually awakened
#if defined(DO_SPU_PROFILING)
		++((NSPU::NDriver::SJobPerfStats*)(void*)G_SPU_CACHE_PROF_PERF_STAT_ADDR)->allocsNoBucket;
#endif
		rpPtr = (void*)spu_readch(SPU_RdSigNotify1);//return value is address
	}
	IF((uint32)rpPtr,1)
	{
		//flush the cache range of the returned block
		//NOTE(review): the range starts at the 128 byte aligned address but its length is rounded from cSize alone,
		//a block which is not 128 byte aligned and crosses a line boundary would not be covered entirely - confirm
		//that returned blocks cannot straddle lines
		__spu_flush_cache_range(((uint32)rpPtr)&~127,(cSize+127)&~127);
	}
}

//alloc entry for PPU side allocations
void PPUCalloc_func(void*& rpPtr, const size_t cSize, const size_t cNum)
{
	uint32 size = cSize*cNum;
	IF(size==0, false ) // dont allocate buckets for size == 0
	{
		rpPtr = NULL;
		return;
	}
	PPUAlloc_func(rpPtr, size);
	Memset__VM(rpPtr, 0, size);
}

//checks if a memory release is contained in the history table
//if so, it gets reinserted into the allocation list
//no member function due to __spu_cache_lookup macro
//	cEA: address of the block to release
//	returns true if cEA was found in the history table and has been reinserted into its bucket, false otherwise
//NOTE(review): neither rBucket.available nor the history table itself is modified in this function,
//confirm that both are maintained elsewhere
inline const bool CheckReleaseWithHistoryEntry(const unsigned int cEA)
{
	using NSPU::SBucket;
	using NSPU::SBucketHeader;
	using NSPU::SBucketDir;
	//compare if we have a hit
	const vec_uint4 cHistoryTable							 = GetMemMan().GetHistoryAllocTable();
	const vec_uint4 cHistoryTableBucketIndices = GetMemMan().GetHistoryAllocBucketIndices();
	vec_uint4 cEA4					  = spu_splats(cEA);//replicate the address into all 4 elements
	const vec_uint4 cCmp		  = spu_cmpeq(cEA4, cHistoryTable);//compare against all 4 history entries at once
	const vec_uint4 cGather		= spu_gather(cCmp);//pack the per element compare results into the low bits of one word
	IF(spu_extract(spu_orx(cCmp), 0) == 0, false)
		return false;//we found no hits
	//the 4 gathered bits occupy the lowest bits of a 32 bit word, so the leading zero count minus 28 is the element index of the first hit
	const unsigned int cIndex = spu_extract(spu_cntlz(cGather), 0) - 28;
	//insert memory back into bucket
	const unsigned int cBucketIndex = spu_extract(cHistoryTableBucketIndices, cIndex);//get bucket index
	SBucket& rBucket = GetMemMan().GetBucketHeader(cBucketIndex);
	const unsigned int cBucketHeaderEA = (unsigned int)rBucket.pBucketHeader;
	SBucketHeader* pLocHeader = (SBucketHeader*)__spu_cache_lookup_miss(cBucketHeaderEA, 1, 128);//make cache lookup, always at most one line
	//get first directory entry, reset it to to cEA and make the first index point to next one
	//link last linked list index to it too
	//the directory follows the header and the numTotal bytes of the block link list
	const unsigned int cDirAddressPPUEA = cBucketHeaderEA + SBucketHeader::scBucketHeaderSize + rBucket.numTotal;
	const uint8 cOldFirstDirIndex = pLocHeader->dirIndex;
	SBucketDir *pDirEntry		= (SBucketDir*)__spu_cache_lookup_miss(cDirAddressPPUEA + (cOldFirstDirIndex << 2), 1, 128);		//get from PPU, can be multiple lines
	//the link index has to be read before the address is written (presumably both share the entry's storage - confirm in SBucketDir)
	const uint8 cNextIndex	= pDirEntry->GetLinkIndex();
	pDirEntry->address			= cEA;
	pLocHeader->dirIndex		= cNextIndex;
	//update pLocHeader->dirIndexEnd:  if(pLocHeader->dirIndexEnd == cOldFirstDirIndex) pLocHeader->dirIndexEnd = BUCKET_NULL;
	pLocHeader->dirIndexEnd = CondSelEq(pLocHeader->dirIndexEnd, cOldFirstDirIndex, BUCKET_NULL, pLocHeader->dirIndexEnd);
	//update link list
	uint8* pBucketList = (uint8*)((uint8*)pLocHeader + SBucketHeader::scBucketHeaderSize);
	uint8 dummy;//to assign branch free
	//if the block list is empty the write below goes to dummy, otherwise to the list entry of the current last block
	uint8 *pOldLastBlockEntry = CondSelEq(pLocHeader->listIndexEnd, BUCKET_NULL, &dummy, &pBucketList[pLocHeader->listIndexEnd]);
	*pOldLastBlockEntry = cOldFirstDirIndex;//link to current one
	pBucketList[cOldFirstDirIndex] = BUCKET_NULL;//finish linkage here
	pLocHeader->listIndexEnd = cOldFirstDirIndex;//this is the new latest element
	//update pLocHeader->listIndex: if(pLocHeader->listIndex == BUCKET_NULL) pLocHeader->listIndex = cOldFirstDirIndex;
	pLocHeader->listIndex = CondSelEq(pLocHeader->listIndex, BUCKET_NULL, cOldFirstDirIndex, pLocHeader->listIndex);
	return true;
}

//cleans up the allocated and freed memory, to be called if SPU runs a never returning job
//work is only done if the freed list contains entries or at least one bucket has no available entries left:
//128 bytes of bucket data are transferred to the PPU, the PPU cleanup is triggered through the interrupt mailbox
//and after its completion signal the 128 bytes are transferred back into local store
void CleanupMemory()
{
	//check if something is to do
	using NSPU::SBucketInfo;
	using NSPU::SBucket;
	const SBucketInfo& crBucketInfo = GetMemMan().GetBucketInfo();
	int somethingToDo = (crBucketInfo.freedCount > 0);
	if(!somethingToDo)
	{
		//check if there are enough bucket slots left for allocations
		for(int i=0; i<SBucketInfo::scBucketCount; ++i)
			somethingToDo |= (GetMemMan().GetBucketHeader(i).available == 0);
	}
	IF(somethingToDo != 0, 0)
	{
		const unsigned int cMemAreaEA			= *(unsigned int*)G_SPU_DEST_MEM_AREA;
		void* const cpBucketSPUAddr = (void*)GetMemMan().GetBucketSPUAddr();
		//transfer bucket headers back, transfer any garbage in front of it (16 bytes) to copy 128 bytes and get peak performance
		MemcpyMain(cMemAreaEA, cpBucketSPUAddr,	128, g_scDMAOutputTag);
		SyncMemory(g_scDMAOutputTag);
		//toggle interrupt for PPU handling of cleanup
		spu_writech(SPU_WrOutIntrMbox, (EVENT_MEM_CLEANUP << EVENT_PORT_SHIFT));
		spu_readch(SPU_RdSigNotify1);//wait til finished
		//transfer bucket header back here from PPU
		MemcpyLS(cpBucketSPUAddr, cMemAreaEA, 128,  g_scDMAPPUMemTag);
		//wait on the tag the transfer above was issued with, syncing g_scDMAOutputTag here would return
		//while the bucket header data may still be in flight
		SyncMemory(g_scDMAPPUMemTag);
	}
}

//free entry for PPU side allocations
void PPUFree_func(void *pPtr)
{
	using NSPU::SBucketInfo;
	const unsigned int cEA = (unsigned int)pPtr;
	IF(cEA != 0, 1)
	{
#if defined(DO_SPU_PROFILING)
		++((NSPU::NDriver::SJobPerfStats*)(void*)G_SPU_CACHE_PROF_PERF_STAT_ADDR)->freeCount;
#endif
		//check if allocation is registered in allocation history table, if so, all is handled there
		IF(CheckReleaseWithHistoryEntry(cEA), true)//expect it to be present
		{
#if defined(DO_SPU_PROFILING)
			++((NSPU::NDriver::SJobPerfStats*)(void*)G_SPU_CACHE_PROF_PERF_STAT_ADDR)->freeCountHistory;
#endif
			return;
		}
		SBucketInfo& rBucketInfo = GetMemMan().GetBucketInfo();
		IF(rBucketInfo.freedCount < SBucketInfo::scFreedMaxCount, true)
		{
			const unsigned int cFreeEA = (unsigned int)&rBucketInfo.pFreedList[rBucketInfo.freedCount];
			++rBucketInfo.freedCount;
			unsigned int *pFreeSlot = (unsigned int*)(__spu_cache_lookup_miss(cFreeEA, 1, 128));
			*pFreeSlot = cEA;
		}
		else
		{
			__spu_flush_cache();
			//fallback using interrupts, try to avoid
			NPPU::SPPUMemRequestData memRequest _ALIGN(128);
			memRequest.type = NPPU::eMR_Delete;
			memRequest.address = cEA;
			memRequest.valid = 1;
			//copy data request to PPU, do not wait for completion, interrupt will be toggled way later (PPU ensures by polling otherwise)
			MemcpyMain(*(unsigned int*)G_SPU_DEST_MEM_AREA, (NSPU::TAddrLS)(&memRequest), NPPU::SPPUMemRequestData::scPPUMemRequestDataTransferSize, g_scDMAPPUMemTag);
			//signal memory request
			spu_writech(SPU_WrOutIntrMbox, (EVENT_MEMCHANGE_PORT << EVENT_PORT_SHIFT));
			//wait for memcpy to complete, otherwise stack becomes invalid
			SyncMemory(g_scDMAPPUMemTag);

			//wait till delete on ppu has finished(to prevent overwriting with data from a following malloc)
			spu_readch(SPU_RdSigNotify1);

	#if defined(DO_SPU_PROFILING)
			++((NSPU::NDriver::SJobPerfStats*)(void*)G_SPU_CACHE_PROF_PERF_STAT_ADDR)->freeCountNoBucket;
	#endif
		}
	}

}


#endif
