/* 
	memcpy function wrappers
*/

#ifndef __MEMORY_H
#define __MEMORY_H
#pragma once

#if defined(PS3)

#if defined(__SPU__)

#include <cell/dma.h>
#include "SPU.h"
#include <IJobManSPU.h>
#if !defined(_SPU_JOB)
	#include "JobStructs.h"
#endif//_SPU_JOB

namespace NSPU
{
#if !defined(_SPU_JOB)
	//only declared when this header is not compiled as part of an SPU job (_SPU_JOB undefined)
	namespace NDriver
	{
		//info block defined elsewhere; its jobId member is printed by the DMA timeout
		//diagnostics in SyncMemory (MFC_SYNC_BY_POLLING + MEASURE_TIMEOUT builds)
		extern SInfoBlock g_sInfoBlock;
	}
#endif
	//clears a pending tag update request, intended to be run before any DMA sync:
	//writes 0 to MFC_WrTagUpdate, spins until that channel reports a non-zero count again,
	//then reads (and discards) MFC_RdTagStat
	//NOTE: expands to a brace block, not do{}while(0) - be careful when using it as the body of an unbraced if/else
	#define MFC_CLEAR_TAG_UPDATE {spu_writech(MFC_WrTagUpdate,0);	do {} while(spu_readchcnt(MFC_WrTagUpdate) == 0);	spu_readch(MFC_RdTagStat);}

	//definitions for "lock line reservation lost" events
	//MFC_LLAR_LOST_EVENT is the event bit (bit 10); the other two are presumably the SPU channel
	//numbers for reading the event status and acknowledging events - TODO confirm against the channel map
	#define MFC_LLAR_LOST_EVENT (1<<10)
	#define MFC_RD_EVENT_STATUS 0
	#define MFC_WR_EVENT_ACK    2
	//selects fast enabling/disabling of lock line events; phantom events can occur (should not matter)
	//keep in sync with SPUMultiThread.h
	#define FAST_UNSAFE_LL_ENABLE

	//synchronizes DMA transfers with a certain tag id: returns once the tag group cTagID reports completion
	//	cTagID: tag group to wait for, used as bit index of the tag mask (1<<cTagID)
	//two implementations are selected at compile time:
	//	- default: request an MFC_TAG_UPDATE_ALL status update and read MFC_RdTagStat
	//	- MFC_SYNC_BY_POLLING: poll spu_mfcstat(MFC_TAG_UPDATE_IMMEDIATE) until it equals the tag mask;
	//	  if MEASURE_TIMEOUT is defined too, more than 400000 polls print a diagnostic and hit SPU_DEBUG_HALT
	__attribute__((always_inline))
	inline void SyncMemory(const unsigned int cTagID)
	{
		//MFC_CLEAR_TAG_UPDATE (disabled)
#if !defined(MFC_SYNC_BY_POLLING)
		//select the tag group, request the "all tags in mask complete" update
		spu_writech(MFC_WrTagMask, (1<<cTagID));
		spu_writech(MFC_WrTagUpdate,MFC_TAG_UPDATE_ALL);
		//status value is discarded; presumably this channel read is what stalls until the
		//update condition is met - confirm against the MFC channel documentation
		spu_readch(MFC_RdTagStat);
#else
		//set tag mask and poll for completion
		const unsigned int cTagMask = (1<<cTagID);
		spu_writech(MFC_WrTagMask, cTagMask);
	#if defined(MEASURE_TIMEOUT)
		//poll counter, volatile so the compiler keeps the increments
		volatile unsigned long long counter = 0;
	#endif
		do 
		{
	#if defined(MEASURE_TIMEOUT)
			counter = counter + 1;
			if(counter > 400000)
			{
				//report the stuck tag (plus the job id when the driver info block is available), then halt
	#if defined(_SPU_JOB)
				printf("DMA-Timeout in SPUDriver for tag=%d\n", cTagID);
	#else
				printf("DMA-Timeout in SPUDriver for tag=%d  job ID=%d\n", cTagID, NSPU::NDriver::g_sInfoBlock.jobId);
	#endif//_SPU_JOB
				SPU_DEBUG_HALT;
			}
	#endif	//MEASURE_TIMEOUT
		} 
		//loop exit is the expected case, hence the branch hint: keep polling while the immediate status differs from the mask
		while(__builtin_expect(spu_mfcstat(MFC_TAG_UPDATE_IMMEDIATE) != cTagMask, false));
#endif //MFC_SYNC_BY_POLLING
	}

	//streams a block of MAIN memory into LS as a sequence of DMA get commands of at most 16 KB each
	//	dest:    LS destination address, must be on a quadword (16 byte) boundary
	//	cSource: main memory source address (written to MFC_EAL), must be on a quadword boundary
	//	cSize:   number of bytes, must be a multiple of 16
	//	cTagID:  DMA tag, written to MFC_TagID once and shared by all issued commands
	//	cFenced: true issues MFC_GETF_CMD, false issues MFC_GET_CMD
	//the commands are only enqueued here, no synchronization is performed
	//sizes of 16 KB or less are accepted as well (the cSize > 16 KB assert is disabled)
	__attribute__((always_inline))
	inline void MemcpyLargeLS(volatile TAddrLS dest, const unsigned int cSource, const unsigned int cSize, const unsigned int cTagID, const bool cFenced)
	{
		assert((cSize & 0xF) == 0);
		assert(((unsigned int)dest & ~0xF) == (unsigned int)dest);//must be on a quadword boundary
		assert((cSource & 0xF) == 0);//must be on a quadword boundary

		const int cChunkBytes = 16*1024;//upper limit of a single transfer
		const unsigned int cGetCmd = cFenced? MFC_GETF_CMD : MFC_GET_CMD;

		si_wrch(MFC_TagID,si_from_uint(cTagID));//tag stays set for all following commands

		unsigned int lsAddr = (unsigned int)dest;
		unsigned int eaAddr = cSource;
		int bytesLeft = (int)cSize;
		do
		{
			//full chunk, or whatever remains for the last command
			const int cBytesNow = (bytesLeft > cChunkBytes)? cChunkBytes : bytesLeft;
			si_wrch(MFC_LSA,si_from_uint(lsAddr));
			si_wrch(MFC_EAL,si_from_uint(eaAddr));
			si_wrch(MFC_Size,si_from_uint(cBytesNow));
			si_wrch(MFC_Cmd,si_from_uint(cGetCmd));//writing the command channel initiates the transfer
			//advance both addresses by one chunk
			lsAddr		+= cChunkBytes;
			eaAddr		+= cChunkBytes;
			bytesLeft	-= cChunkBytes;
		}
		while(bytesLeft > 0);
	}

	//copy large amounts of data from MAIN memory into LS, unfenced version
	//logs the arguments via DEBUG_PRINTF and forwards to the 5 argument overload with cFenced = false
	__attribute__((always_inline))	
	inline void MemcpyLargeLS(volatile TAddrLS dest, const unsigned int cSource, const unsigned int cSize, const unsigned int cTagID)
	{
		DEBUG_PRINTF("MemcpyLargeLS dest: 0x%x, source: 0x%x, size: %d, tagID: %d\n", dest, cSource, cSize, cTagID);
		//previous implementation: cellDmaLargeGet(dest, cSource, cSize, cTagID, 0, 0);
		MemcpyLargeLS(dest, cSource, cSize, cTagID, false);
	}

	//copy large amounts of data from MAIN memory into LS, fenced version
	//logs the arguments via DEBUG_PRINTF and forwards to MemcpyLargeLS with cFenced = true
	__attribute__((always_inline))	
	inline void MemcpyLargeLSFenced(volatile TAddrLS dest, const unsigned int cSource, const unsigned int cSize, const unsigned int cTagID)
	{
		DEBUG_PRINTF("MemcpyLargeLSFenced dest: 0x%x, source: 0x%x, size: %d, tagID: %d\n", dest, cSource, cSize, cTagID);
		//previous implementation: cellDmaLargeGetf(dest, cSource, cSize, cTagID, 0, 0);
		MemcpyLargeLS(dest, cSource, cSize, cTagID, true);
	}

	//copy data from MAIN memory into LS, no debug printf and no alignment asserts
	//	dest:    LS destination address
	//	cSource: main memory source address (written to MFC_EAL)
	//	cSize:   number of bytes to transfer
	//	cTagID:  DMA tag of the transfer
	//only enqueues the get command, the caller has to synchronize on cTagID before using the data
	__attribute__((always_inline))
	inline void MemcpyLSNoDebug(volatile TAddrLS dest, const unsigned int cSource, const unsigned int cSize, const unsigned int cTagID)
	{
		//implements: mfc_get(dest, cSource, cSize, cTagID, 0, 0) (without $ch17)
		//NOTE(review): $ch17 is presumably the upper effective address channel (MFC_EAH), which is never written here - confirm
		si_wrch(MFC_LSA,si_from_ptr(dest));
		si_wrch(MFC_EAL,si_from_uint(cSource));
		si_wrch(MFC_Size,si_from_uint(cSize));
		si_wrch(MFC_TagID,si_from_uint(cTagID));
		si_wrch(MFC_Cmd,si_from_uint(MFC_GET_CMD));//initiate transfer
	}

	//copy data from MAIN memory into LS
	//same channel sequence as MemcpyLSNoDebug, plus a DEBUG_PRINTF of the arguments and alignment asserts
	//	dest:    LS destination address, must be on a quadword boundary
	//	cSource: main memory source address (written to MFC_EAL), must be on a quadword boundary
	//	cSize:   number of bytes to transfer
	//	cTagID:  DMA tag of the transfer
	//only enqueues the get command, the caller has to synchronize on cTagID before using the data
	__attribute__((always_inline))
	inline void MemcpyLS(volatile TAddrLS dest, const unsigned int cSource, const unsigned int cSize, const unsigned int cTagID)
	{
		DEBUG_PRINTF("MemcpyLS dest: 0x%x, source: 0x%x, size: %d, tagID: %d\n", dest, cSource, cSize, cTagID);
		assert((cSource & 0xF) == 0);//must be on a quadword boundary
		assert(((unsigned int)dest & 0xF) == 0);//must be on a quadword boundary
		//implements: mfc_get(dest, cSource, cSize, cTagID, 0, 0);(without $ch17)
		si_wrch(MFC_LSA,si_from_ptr(dest));
		si_wrch(MFC_EAL,si_from_uint(cSource));
		si_wrch(MFC_Size,si_from_uint(cSize));
		si_wrch(MFC_TagID,si_from_uint(cTagID));
		si_wrch(MFC_Cmd,si_from_uint(MFC_GET_CMD));//initiate transfer
	}

	//copy data from MAIN memory into LS, fenced version (issues MFC_GETF_CMD instead of MFC_GET_CMD)
	//	dest:    LS destination address, must be on a quadword boundary
	//	cSource: main memory source address (written to MFC_EAL), must be on a quadword boundary
	//	cSize:   number of bytes to transfer
	//	cTagID:  DMA tag of the transfer
	//only enqueues the command, the caller has to synchronize on cTagID before using the data
	__attribute__((always_inline))
	inline void MemcpyLSFenced(volatile TAddrLS dest, const unsigned int cSource, const unsigned int cSize, const unsigned int cTagID)
	{
		DEBUG_PRINTF("MemcpyLSFenced dest: 0x%x, source: 0x%x, size: %d, tagID: %d\n", dest, cSource, cSize, cTagID);
		assert((cSource & 0xF) == 0);//must be on a quadword boundary
		assert(((unsigned int)dest & 0xF) == 0);//must be on a quadword boundary
		//implements: mfc_getf(dest, cSource, cSize, cTagID, 0, 0) (without $ch17)
		si_wrch(MFC_LSA,si_from_ptr(dest));
		si_wrch(MFC_EAL,si_from_uint(cSource));
		si_wrch(MFC_Size,si_from_uint(cSize));
		si_wrch(MFC_TagID,si_from_uint(cTagID));
		si_wrch(MFC_Cmd,si_from_uint(MFC_GETF_CMD));//initiate transfer
	}

/*	__attribute__((always_inline))
	inline void MemcpyLSList
	(
		volatile void* cpDest, 
		const unsigned int cEAHigh, 
		const unsigned int cEALow, 
		const unsigned int cSize,
		const unsigned int cTagID, 
		const unsigned int cMFCCommand
	)
	{
		DEBUG_PRINTF("MemcpyLSList LS source: 0x%x, elems: %d, tagID: %d\n", cEALow, (cSize >> 3), cTagID);
		assert(((cSize >> 3) << 3) == cSize);//must be a multiple of sizeof(CellDmaListElement) = 8
		assert(cEALow < 256 * 1024);//must be a valid LS address
		spu_mfcdma64(cpDest, cEAHigh, cEALow, cSize, cTagID, cMFCCommand);
	}

	//copy large amounts of data from LS into MAIN memory
	__attribute__((always_inline))
	inline void MemcpyLargeMain(const unsigned int cDest, const volatile TAddrLS cSource, const unsigned int cSize, const unsigned int cTagID)
	{
		DEBUG_PRINTF("MemcpyLargeMain dest: 0x%x, source: 0x%x, size: %d, tagID: %d\n", cDest, cSource, cSize, cTagID);
		assert(((unsigned int)cSource & ~0xF) == (unsigned int)cSource);//must be on a quadword boundary
		assert((cDest & 0xF) == 0);//must be on a quadword boundary
		cellDmaLargePut(cSource, cDest, cSize, cTagID, 0, 0);
	}
*/
	// wrapper for copying data from LS into MAIN memory
	//	cDest:   main memory destination address (written to MFC_EAL), must be on a quadword boundary
	//	cSource: LS source address, must be on a quadword boundary
	//	cSize:   number of bytes to transfer
	//	cTagID:  DMA tag of the transfer
	// only enqueues the put command; the LS source must stay untouched until the caller has synchronized on cTagID
	__attribute__((always_inline))
	inline void MemcpyMain(const unsigned int cDest, const volatile TAddrLS cSource, const unsigned int cSize, const unsigned int cTagID)
	{
		DEBUG_PRINTF("MemcpyMain dest: 0x%x, source: 0x%x, size: %d, tagID: %d\n", cDest, cSource, cSize, cTagID);
		assert(((unsigned int)cSource & 0xF) == 0);//must be on a quadword boundary
		assert((cDest & 0xF) == 0);//must be on a quadword boundary
		//implements: mfc_put((volatile void*)(uintptr_t)cSource, cDest, cSize, cTagID, 0, 0) (without $ch17)
		si_wrch(MFC_LSA,si_from_ptr(cSource));
		si_wrch(MFC_EAL,si_from_uint(cDest));
		si_wrch(MFC_Size,si_from_uint(cSize));
		si_wrch(MFC_TagID,si_from_uint(cTagID));
		si_wrch(MFC_Cmd,si_from_uint(MFC_PUT_CMD));//start asynchronous transfer back
	}

	// wrapper for copying data from LS into MAIN memory, fenced version (issues MFC_PUTF_CMD)
	//	cDest:   main memory destination address (written to MFC_EAL)
	//	cSource: LS source address
	//	cSize:   number of bytes to transfer
	//	cTagID:  DMA tag of the transfer
	// unlike MemcpyMain and MemcpyMainBarrier this does NOT assert quadword alignment, it only requires
	// source and destination to share the same offset within a quadword
	// only enqueues the put command; the LS source must stay untouched until the caller has synchronized on cTagID
	__attribute__((always_inline))
	inline void MemcpyMainFenced(const unsigned int cDest, const volatile TAddrLS cSource, const unsigned int cSize, const unsigned int cTagID)
	{
		DEBUG_PRINTF("MemcpyMainFenced dest: 0x%x, source: 0x%x, size: %d, tagID: %d\n", cDest, cSource, cSize, cTagID);
		assert(((unsigned int)cSource & 0xF) == (cDest & 0xF));//low 4 address bits of source and destination must match
		//implements: mfc_putf((volatile void*)(uintptr_t)cSource, cDest, cSize, cTagID, 0, 0) (without $ch17)
		si_wrch(MFC_LSA,si_from_ptr(cSource));
		si_wrch(MFC_EAL,si_from_uint(cDest));
		si_wrch(MFC_Size,si_from_uint(cSize));
		si_wrch(MFC_TagID,si_from_uint(cTagID));
		si_wrch(MFC_Cmd,si_from_uint(MFC_PUTF_CMD));//start asynchronous transfer back
	}

	// wrapper for copying data from LS into MAIN memory, barrier version (issues MFC_PUTB_CMD)
	//	cDest:   main memory destination address (written to MFC_EAL), must be on a quadword boundary
	//	cSource: LS source address, must be on a quadword boundary
	//	cSize:   number of bytes to transfer
	//	cTagID:  DMA tag of the transfer
	// only enqueues the put command; the LS source must stay untouched until the caller has synchronized on cTagID
	__attribute__((always_inline))
	inline void MemcpyMainBarrier(const unsigned int cDest, const volatile TAddrLS cSource, const unsigned int cSize, const unsigned int cTagID)
	{
		DEBUG_PRINTF("MemcpyMainBarrier dest: 0x%x, source: 0x%x, size: %d, tagID: %d\n", cDest, cSource, cSize, cTagID);
		assert(((unsigned int)cSource & 0xF) == 0);//must be on a quadword boundary
		assert((cDest & 0xF) == 0);//must be on a quadword boundary
		//implements: mfc_putb((volatile void*)(uintptr_t)cSource, cDest, cSize, cTagID, 0, 0) (without $ch17)
		si_wrch(MFC_LSA,si_from_ptr(cSource));
		si_wrch(MFC_EAL,si_from_uint(cDest));
		si_wrch(MFC_Size,si_from_uint(cSize));
		si_wrch(MFC_TagID,si_from_uint(cTagID));
		si_wrch(MFC_Cmd,si_from_uint(MFC_PUTB_CMD));//start asynchronous transfer back
	}

	// wrapper for copying a small amount of data (<16 byte) from LS into MAIN memory, barrier version (issues MFC_PUTB_CMD)
	//	cDest:   main memory destination address (written to MFC_EAL)
	//	cSource: LS source address, must have the same low 4 address bits as cDest
	//	cSize:   number of bytes to transfer, must be 1, 2, 4 or 8
	//	cTagID:  DMA tag of the transfer
	// only enqueues the put command; the LS source must stay untouched until the caller has synchronized on cTagID
	__attribute__((always_inline))
	inline void MemcpySmallMainBarrier(const unsigned int cDest, const volatile TAddrLS cSource, const unsigned int cSize, const unsigned int cTagID)
	{
		DEBUG_PRINTF("MemcpySmallMainBarrier dest: 0x%x, source: 0x%x, size: %d, tagID: %d\n", cDest, cSource, cSize, cTagID);
		assert(((unsigned int)cDest & 0xF) == ((unsigned int)cSource & 0xF));//same offset within the quadword on both sides
		assert(cSize == 1 || cSize == 2 || cSize == 4 || cSize == 8);//only these sub-quadword sizes are accepted
		//implements: mfc_putb((volatile void*)(uintptr_t)cSource, cDest, cSize, cTagID, 0, 0) (without $ch17)
		si_wrch(MFC_LSA,si_from_ptr(cSource));
		si_wrch(MFC_EAL,si_from_uint(cDest));
		si_wrch(MFC_Size,si_from_uint(cSize));
		si_wrch(MFC_TagID,si_from_uint(cTagID));
		si_wrch(MFC_Cmd,si_from_uint(MFC_PUTB_CMD));//start asynchronous transfer back
	}
/*
	// wrapper for copy small amount of data (<16 byte) from LS into MAIN memory
	static void MemcpySmallMain(const unsigned int cDest, const volatile TAddrLS cSource, const unsigned int cSize, const unsigned int cTagID)
	{
		DEBUG_PRINTF("MemcpySmallMain dest: 0x%x, source: 0x%x, size: %d, tagID: %d\n", cDest, cSource, cSize, cTagID);
		assert(((unsigned int)cDest & 0xF) == ((unsigned int)cSource & 0xF));
		switch(cSize)
		{
		case 0:
		case 1:
		case 2:
		case 4:
			mfc_put((volatile void*)(uintptr_t)cSource, cDest, cSize, cTagID, 0, 0);
			break;
		case 6:
			assert(((unsigned int)cDest & 0x1) == 0);
			mfc_put((volatile void*)(uintptr_t)cSource, cDest, 2, cTagID, 0, 0);
			mfc_put((volatile void*)(uintptr_t)((unsigned int)cSource+2), cDest+2, 2, cTagID, 0, 0);
			mfc_put((volatile void*)(uintptr_t)((unsigned int)cSource+4), cDest+4, 2, cTagID, 0, 0);
			break;
		case 8:
			assert(((unsigned int)cDest & 0x3) == 0);
			if((cDest & 0x7) != 0)
			{
				mfc_put((volatile void*)(uintptr_t)cSource, cDest, 4, cTagID, 0, 0);
				mfc_put((volatile void*)(uintptr_t)((unsigned int)cSource+4), cDest+4, 4, cTagID, 0, 0);
			}
			else
				mfc_put((volatile void*)(uintptr_t)cSource, cDest, 8, cTagID, 0, 0);
			break;
		case 12:
			assert(((unsigned int)cDest & 0x3) == 0);
			mfc_put((volatile void*)(uintptr_t)cSource, cDest, 4, cTagID, 0, 0);
			mfc_put((volatile void*)(uintptr_t)((unsigned int)cSource+4), cDest+4, 2, cTagID, 0, 0);
			mfc_put((volatile void*)(uintptr_t)((unsigned int)cSource+8), cDest+8, 2, cTagID, 0, 0);
			break;
		default:
			assert(cSize == 0 || cSize == 1 || cSize == 2 || cSize == 4 || cSize == 6 || cSize == 8 || cSize == 12);
		}
	}
*/
}//NSPU

//prepares the mfc_getllar command without issuing it
//	pLS:     LS address, written to MFC_LSA
//	cEAAddr: main memory address, its lower 32 bit (mfc_ea2l) are written to MFC_EAL
//no command channel write happens here; a later mfc_getllar_again() / mfc_putllc_again() issues the
//actual command on top of the channels set up by this call
__attribute__((always_inline))
inline void mfc_getllar_prep(volatile void* const pLS, const unsigned int cEAAddr)
{
	si_wrch(MFC_LSA,si_from_ptr(pLS));
	si_wrch(MFC_EAL,si_from_uint(mfc_ea2l(cEAAddr)));
	//size (128) and tag id (0) channel writes are intentionally disabled:
//	si_wrch(MFC_Size,si_from_uint(128));
//	si_wrch(MFC_TagID,si_from_uint(0));
}

//atomic functions which follow a call to mfc_getllar or mfc_putllc using the same parameters (keep channels)
//they just issue the command, reusing all channel values set by the previous calls

//re-issues the getllar command: only the command channel is written
//NOTE(review): 208 (0xD0) is presumably the getllar opcode (MFC_GETLLAR_CMD) - confirm against spu_mfcio.h
__attribute__((always_inline))
inline void mfc_getllar_again()
{
	si_wrch(MFC_Cmd, si_from_uint(208));
}

//issues the putllc command on top of the channel values set by a previous call: only the command channel is written
//NOTE(review): 180 (0xB4) is presumably the putllc opcode (MFC_PUTLLC_CMD) - confirm against spu_mfcio.h
__attribute__((always_inline))
inline void mfc_putllc_again()
{
	si_wrch(MFC_Cmd, si_from_uint(180));
}

#endif //__SPU__
#endif //PS3
#endif //__MEMORY_H
