/* 
	cache definition of SPU 4-way set associative software cache
	works as follows: 
		each cache line has 128 bytes
		- 4-way associative: each address maps to a set in which it has 4 possible cache destination locations;
		  these lines are stored in consecutive order to be able to support cached memory accesses that cross a cache line
		- a write mask is tracked which tells which bits of the cache line are mapped and possibly dirty
		- the cache is flushed after each job chain
		- currently the cache location and size are fixed; this might change in order to be more flexible

	some stuff is put into macros to ensure inlined code even for debug modes
*/

#ifndef __CACHE_H
#define __CACHE_H
#pragma once

#if defined(PS3)

#if defined(__SPU__)

#include "CacheDefs_spu.h"
#include "../SPU.h"

namespace NSPU
{
	namespace NCache
		{
//---------------------------------------------macros for DoCheckedLookupCache-----------------------------------
		//set index generation: each address goes into a set where it then has 4 possible locations
		//the set is selected by the address bits directly above the cache line offset:
		//	the EA is shifted right by scSPUCacheLineSizeShift and masked with (number of sets - 1)
		//	(the mask only yields a valid set index if the number of sets is a power of two)
		//	NOTE(review): the original comment states that bits 7..14 select the set, which presumably
		//	corresponds to 128 byte lines and 256 sets - confirm against the cache definitions
		//cEA is parenthesized so that arguments built from low precedence operators (e.g. a | b) are shifted as a whole
		#define GetCacheSetIndex(cEA) (((cEA)>>scSPUCacheLineSizeShift) & (g_SPUNumSets-1))
		//same as GetCacheSetIndex, but the number of sets is read as an int through the address G_SPU_NUM_SETS
		#define GetCacheSetIndexGlob(cEA) (((cEA)>>scSPUCacheLineSizeShift) & ((*(int*)G_SPU_NUM_SETS)-1))
		//cache index within corresponding set: 31 - (count leading zeros of cFound), taken from element 0
		//	-> bit position of the most significant set bit of the gathered comparison result
		#define GetCacheIndexNum(cFound) spu_extract(spu_sub((unsigned int)31, spu_cntlz(cFound)), 0)
		//compares the 4 elements (-> the 4 ways) of directory entry cSet with cEAAligned4 and gathers the
		//	right most bit of each comparison result, so the result contains one match bit per way
		#define SetCache4WayLookup(cSet, cEAAligned4) spu_gather(spu_cmpeq(g_pSPUCacheDir[(cSet)], (cEAAligned4)))

//---------------------------------------------cache directory access--------------------------------------------

		//retrieve mapped EA of cache line: extracts element cIndexInSet of directory entry cSet
		//arguments are parenthesized so that pointer expressions (e.g. base + offset) are indexed as a whole
		#define GetCacheLineEA(cpSPUCacheDir, cSet, cIndexInSet) spu_extract((cpSPUCacheDir)[(cSet)], (cIndexInSet))

		//set mapped EA of cache line: writes cEA into element cIndexInSet of directory entry cSet
		//note: pSPUCacheDir and cSet are expanded twice, so do not pass expressions with side effects
		#define SetCacheLineEA(pSPUCacheDir, cSet, cIndexInSet, cEA) ((pSPUCacheDir)[(cSet)] = spu_insert((cEA), (pSPUCacheDir)[(cSet)], (cIndexInSet)))

//---------------------------------------------LRU access-----------------------------------------------------

		//all LRU macros parenthesize their arguments and their whole expansion, so arbitrary expressions can be passed
		//note: the LRU control set argument is expanded twice, so do not pass expressions with side effects

		//writes element 0 of cLRUVal into element cIndexInSet of the LRU control set rSPUCacheLRUCtrlSet (modified in place)
		#define UpdateLRU(rSPUCacheLRUCtrlSet, cIndexInSet, cLRUVal) ((rSPUCacheLRUCtrlSet) = spu_insert(spu_extract((cLRUVal), 0), (rSPUCacheLRUCtrlSet), (cIndexInSet)))
		//writes 0 into element cIndexInSet of the LRU control set rSPUCacheLRUCtrlSet (modified in place)
		#define ResetLRU(rSPUCacheLRUCtrlSet, cIndexInSet) ((rSPUCacheLRUCtrlSet) = spu_insert((unsigned int)0, (rSPUCacheLRUCtrlSet), (cIndexInSet)))
		//writes 0 into element cIndexInSet of entry cSet of the LRU control array pSPUCacheLRUCtrl
		#define ResetLRUEntry(pSPUCacheLRUCtrl, cSet, cIndexInSet) ((pSPUCacheLRUCtrl)[(cSet)] = spu_insert((unsigned int)0, (pSPUCacheLRUCtrl)[(cSet)], (cIndexInSet)))

//---------------------------------------------LRU index retrieval-------------------------------------------

		//returns index 0..7
		__attribute__((always_inline))
		inline const unsigned int GetReplIndex2(const vec_uint4 cReplCtrlValues0, const vec_uint4 cReplCtrlValues1)
		{
			//third and fourth byte within the cIndexInSet-word is used
			//returns the index with the lowest LRU value (least recently used due to lowest incrementer value)
			const vec_uint4 cVal0							= cReplCtrlValues0;
			const vec_uint4 cVal1							= spu_rlqwbyte(cReplCtrlValues0, 4);
			const vec_uint4 cVal2							= spu_rlqwbyte(cReplCtrlValues0, 8);
			const vec_uint4 cVal3							= spu_rlqwbyte(cReplCtrlValues0, 12);
			const vec_uint4 cVal0_1						= cReplCtrlValues1;
			const vec_uint4 cVal1_1						= spu_rlqwbyte(cReplCtrlValues1, 4);
			const vec_uint4 cVal2_1						= spu_rlqwbyte(cReplCtrlValues1, 8);
			const vec_uint4 cVal3_1						= spu_rlqwbyte(cReplCtrlValues1, 12);
			const vec_uint4 cCmpVec01					= spu_cmpgt(cVal0, cVal1);
			const vec_uint4 cCmpVec23					= spu_cmpgt(cVal2, cVal3);
			const vec_uint4 cCmpVec01_1				= spu_cmpgt(cVal0_1, cVal1_1);
			const vec_uint4 cCmpVec23_1				= spu_cmpgt(cVal2_1, cVal3_1);
			const vec_uint4 cCmpSelRes01			= spu_sel(cVal0, cVal1, cCmpVec01);
			const vec_uint4 cCmpIndexRes01		= spu_sel(spu_promote((unsigned int)0,0), spu_promote((unsigned int)1,0), cCmpVec01);//spu_and(cCmpVec01, 1)
			const vec_uint4 cCmpSelRes01_1		= spu_sel(cVal0_1, cVal1_1, cCmpVec01_1);
			const vec_uint4 cCmpIndexRes01_1	= spu_sel(spu_promote((unsigned int)0,0), spu_promote((unsigned int)1,0), cCmpVec01_1);//spu_and(cCmpVec01_1, 1)
			const vec_uint4 cCmpSelRes23			= spu_sel(cVal2, cVal3, cCmpVec23);
			const vec_uint4 cCmpIndexRes23		= spu_sel(spu_promote((unsigned int)2,0), spu_promote((unsigned int)3,0), cCmpVec23);
			const vec_uint4 cCmpSelRes23_1		= spu_sel(cVal2_1, cVal3_1, cCmpVec23_1);
			const vec_uint4 cCmpIndexRes23_1	= spu_sel(spu_promote((unsigned int)2,0), spu_promote((unsigned int)3,0), cCmpVec23_1);
			const vec_uint4 cCmpVec0123				= spu_cmpgt(cCmpSelRes01, cCmpSelRes23);
			const vec_uint4 cCmpVec0123_1			= spu_cmpgt(cCmpSelRes01_1, cCmpSelRes23_1);
			const vec_uint4 cCmpIndexRes0123	= spu_sel(cCmpIndexRes01, cCmpIndexRes23, cCmpVec0123);
			vec_uint4 cCmpIndexRes0123_1		  = spu_sel(cCmpIndexRes01_1, cCmpIndexRes23_1, cCmpVec0123_1);
			//get largest of both 
			const vec_uint4 cCmpSelRes0123		= spu_sel(cCmpSelRes01, cCmpSelRes23, cCmpVec0123);
			cCmpIndexRes0123_1								= cCmpIndexRes0123_1 + spu_promote((unsigned int)4,0);//to create indices 3..7
			const vec_uint4 cCmpSelRes0123_1	= spu_sel(cCmpSelRes01_1, cCmpSelRes23_1, cCmpVec0123_1);
			const vec_uint4 cCmpVecBoth				= spu_cmpgt(cCmpSelRes0123, cCmpSelRes0123_1);
			const vec_uint4 cCmpIndexResBoth	= spu_sel(cCmpIndexRes0123, cCmpIndexRes0123_1, cCmpVecBoth);
			return spu_extract(cCmpIndexResBoth, 0);
		}

//---------------------------------------------cache line copy op-------------------------------------------

		//copies one cache line from cpSrc to pDest as 8 vec_uint4 elements
		//	both pointers are expected to be 128 byte aligned and must not overlap (__restrict)
		//	all 8 loads are issued before the first store
		__attribute__((always_inline))
		inline void CopyCacheLine(vec_uint4* const __restrict pDest, const vec_uint4* const __restrict cpSrc)
		{
			//load the complete line into temporaries
			const vec_uint4 cQuad0 = cpSrc[0];
			const vec_uint4 cQuad1 = cpSrc[1];
			const vec_uint4 cQuad2 = cpSrc[2];
			const vec_uint4 cQuad3 = cpSrc[3];
			const vec_uint4 cQuad4 = cpSrc[4];
			const vec_uint4 cQuad5 = cpSrc[5];
			const vec_uint4 cQuad6 = cpSrc[6];
			const vec_uint4 cQuad7 = cpSrc[7];

			//write the line to the destination
			pDest[0] = cQuad0;
			pDest[1] = cQuad1;
			pDest[2] = cQuad2;
			pDest[3] = cQuad3;
			pDest[4] = cQuad4;
			pDest[5] = cQuad5;
			pDest[6] = cQuad6;
			pDest[7] = cQuad7;
		}

		//copies one cache line from cpSrc to the two destinations pDest0 and pDest1 as 8 vec_uint4 elements each
		//	all pointers are expected to be 128 byte aligned and must not overlap each other (__restrict)
		//	the source is loaded once, each element is then stored to pDest0 first and to pDest1 second
		__attribute__((always_inline))
		inline void CopyCacheLine2Dest(vec_uint4* const __restrict pDest0, vec_uint4* const __restrict pDest1, const vec_uint4* const __restrict cpSrc)
		{
			//load the complete line into temporaries
			const vec_uint4 cQuad0 = cpSrc[0];
			const vec_uint4 cQuad1 = cpSrc[1];
			const vec_uint4 cQuad2 = cpSrc[2];
			const vec_uint4 cQuad3 = cpSrc[3];
			const vec_uint4 cQuad4 = cpSrc[4];
			const vec_uint4 cQuad5 = cpSrc[5];
			const vec_uint4 cQuad6 = cpSrc[6];
			const vec_uint4 cQuad7 = cpSrc[7];

			//write every element to both destinations
			pDest0[0] = cQuad0;
			pDest1[0] = cQuad0;
			pDest0[1] = cQuad1;
			pDest1[1] = cQuad1;
			pDest0[2] = cQuad2;
			pDest1[2] = cQuad2;
			pDest0[3] = cQuad3;
			pDest1[3] = cQuad3;
			pDest0[4] = cQuad4;
			pDest1[4] = cQuad4;
			pDest0[5] = cQuad5;
			pDest1[5] = cQuad5;
			pDest0[6] = cQuad6;
			pDest1[6] = cQuad6;
			pDest0[7] = cQuad7;
			pDest1[7] = cQuad7;
		}
	}
}

#endif//__SPU__
#endif //PS3
#endif //__CACHE_H
