////////////////////////////////////////////////////////////////////////////
//
//  CryEngine Source File.
//  Copyright (C), Crytek
// -------------------------------------------------------------------------
//  File name:   XenonThreadSampler.cpp
//  Version:     v1.00
//  Created:     14/02/2008 by Timur
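//  Description: Records Xbox 360 thread switches through XBDM notifications
//               and turns them into per-hardware-thread span lists.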
// -------------------------------------------------------------------------
//  History:
//
////////////////////////////////////////////////////////////////////////////

#include <StdAfx.h>
#include <ISystem.h>
#include <Cry_Color.h>

#ifdef XENON
#include <Xbdm.h>

#include "XenonThreadSampler.h"

// Sampler instance that ThreadSwitchCallback records into.
// Assigned in the CXenonThreadSampler constructor.
static CXenonThreadSampler *g_pCurrentSampler = 0;

// Minimum time, in seconds, between two fresh snapshots in MakeSnapshot;
// calls made sooner get the previously captured context switch counters.
#define SAMPLE_INTERVAL_SECONDS  (1)

//////////////////////////////////////////////////////////////////////////
// Resets all sampling state, publishes this instance as the target of the
// thread switch callback, reads the performance counter frequency and starts
// thread switch monitoring.
CXenonThreadSampler::CXenonThreadSampler()
{
	g_pCurrentSampler = this;
	m_monitor_session = 0;

	ZeroStruct(m_samples);

	// CreateSpanListForThread reads these two; give them defined values so it
	// is safe to call before the first MakeSnapshot has filled them in.
	ZeroStruct(nSamplesPerThread);
	m_referenceTime = 0;

	ZeroStruct(m_rawSamplesBuffer1);
	ZeroStruct(m_rawSamplesBuffer2);
	m_pSamplingBufferPtr = m_rawSamplesBuffer1;

	m_lastSnapshotTime = 0;
	
	m_nContextSwitches[0] = 0; m_nContextSwitches[1] = 0; m_nContextSwitches[2] = 0;
	m_nContextSwitches[3] = 0; m_nContextSwitches[4] = 0; m_nContextSwitches[5] = 0;
	ZeroStruct(m_nLastContextSwitches);
	m_lastContextSwitch = 0;

	m_nFirstReferenceIndex = 0;

	// Pre-set to zero so that m_ticksPerSecond is 0 (not garbage) if the
	// query fails; CreateSpanListForThread bails out on a zero frequency.
	LARGE_INTEGER TicksPerSecond;
	TicksPerSecond.QuadPart = 0;
	QueryPerformanceFrequency( &TicksPerSecond );
	m_ticksPerSecond = TicksPerSecond.QuadPart;

	// Best effort: the result is ignored, so a failed start just means no
	// samples get recorded.
	MonitorThreadSwitch(true);
}

//////////////////////////////////////////////////////////////////////////
// Stops thread switch monitoring and withdraws this instance from the global
// pointer used by the thread switch callback.
CXenonThreadSampler::~CXenonThreadSampler()
{
	MonitorThreadSwitch(false);

	// Do not leave a dangling pointer behind. Only clear it if it still refers
	// to this object (another sampler may have replaced it in the meantime).
	if (g_pCurrentSampler == this)
		g_pCurrentSampler = 0;
}

//////////////////////////////////////////////////////////////////////////
// Notification handler registered for DM_THREADSWITCH in MonitorThreadSwitch.
// Counts the switch for the hardware thread it runs on and appends one sample
// (time, hardware thread, old/new thread id) to the sampler's current raw buffer.
//   dwNotification - notification code (unused).
//   dwParam        - address of a DMN_THREADSWITCH describing the switch.
// Always returns 0.
static DWORD __stdcall ThreadSwitchCallback( ULONG dwNotification,DWORD dwParam )
{
	// Timestamp and hardware thread are captured before anything else.
	int64 time = CryGetTicks();
	int HwThread = GetCurrentProcessorNumber();

	// Read the global once and work on the local copy; skip the notification
	// if there is no sampler or no payload to record.
	CXenonThreadSampler *pSampler = g_pCurrentSampler;
	DMN_THREADSWITCH* ThreadswitchData = (DMN_THREADSWITCH*)dwParam;
	if (pSampler == 0 || ThreadswitchData == 0)
		return 0;

	pSampler->m_nContextSwitches[HwThread]++;

	CXenonThreadSampler::ThreadSwitchSample *pSampleBuffer = (CXenonThreadSampler::ThreadSwitchSample*)pSampler->m_pSamplingBufferPtr;

	// The interlocked counter hands out a unique slot per callback; the modulo
	// makes the buffer wrap around once MAX_SAMPLES entries were written.
	int index = (CryInterlockedIncrement(&pSampler->m_lastContextSwitch) - 1) % CXenonThreadSampler::MAX_SAMPLES;
	pSampleBuffer[index].time = time;
	pSampleBuffer[index].hwThread = HwThread;
	pSampleBuffer[index].fromThreadId = ThreadswitchData->OldThreadId;
	pSampleBuffer[index].toThreadId = ThreadswitchData->NewThreadId;

	return 0;
}

//////////////////////////////////////////////////////////////////////////
bool CXenonThreadSampler::MonitorThreadSwitch( bool bEnable )
{
	if (bEnable && m_monitor_session == 0)
	{
		bool success = true;

		// open the session...
		HRESULT hr = DmOpenNotificationSession(0, (PDMN_SESSION*)&m_monitor_session);
		if (hr != XBDM_NOERR)
		{
			return false;
		}
		// register notification handler...
		hr = DmNotify( (PDMN_SESSION)m_monitor_session, DM_THREADSWITCH, ThreadSwitchCallback);
		if (hr != XBDM_NOERR)
		{
			return false;
		}
	}
	else if (m_monitor_session != 0)
	{
		DmCloseNotificationSession( (PDMN_SESSION)m_monitor_session );
		m_monitor_session = 0;
	}
	return true;

}

//////////////////////////////////////////////////////////////////////////
// Publishes the per-hardware-thread context switch counters into snapshotInfo
// and, at most once per SAMPLE_INTERVAL_SECONDS, swaps the raw sample buffers
// and redistributes the captured samples into m_samples in time order.
//   snapshotInfo - receives nProcessorCount and the context switch counters.
// Always returns true.
bool CXenonThreadSampler::MakeSnapshot( SnapshotInfo &snapshotInfo )
{
	int i;
	const int scale = 2;
	// Length of the time window (in ticks) that ends at the newest sample;
	// only samples inside that window are kept.
	const int64 period = m_ticksPerSecond*scale/2;

	const int64 currentTime = CryGetTicks();

	snapshotInfo.nProcessorCount = MAX_HW_THREADS;

	// NOTE(review): sizeof(snapshotInfo.pContextSwitches) is the right
	// destination size only if pContextSwitches is an array member - confirm.

	// Take a fresh snapshot at most once per SAMPLE_INTERVAL_SECONDS; in between
	// report the counters captured by the previous snapshot.
	if ((currentTime-m_lastSnapshotTime) < SAMPLE_INTERVAL_SECONDS*m_ticksPerSecond)
	{
		memcpy_s( snapshotInfo.pContextSwitches,sizeof(snapshotInfo.pContextSwitches),(void*)m_nLastContextSwitches,sizeof(m_nLastContextSwitches) );
		return true;
	}
	m_lastSnapshotTime = currentTime;

	memcpy_s( snapshotInfo.pContextSwitches,sizeof(snapshotInfo.pContextSwitches),(void*)m_nContextSwitches,sizeof(m_nContextSwitches) );
	memcpy_s( m_nLastContextSwitches ,sizeof(m_nLastContextSwitches ),(void*)m_nContextSwitches,sizeof(m_nContextSwitches) );

	//Reset context switches
	m_nContextSwitches[0] = 0; m_nContextSwitches[1] = 0; m_nContextSwitches[2] = 0;
	m_nContextSwitches[3] = 0; m_nContextSwitches[4] = 0; m_nContextSwitches[5] = 0;

	// Number of valid entries in the buffer being retired; the callback wraps
	// around at MAX_SAMPLES, so the buffer never holds more than that.
	int nLastContextSwitch = m_lastContextSwitch;
	if (nLastContextSwitch > MAX_SAMPLES)
		nLastContextSwitch = MAX_SAMPLES;

	// Swap current buffer: new samples go to the other buffer while this
	// function processes the one that was being filled.
	ThreadSwitchSample *rawSamples = 0;
	if (m_pSamplingBufferPtr == m_rawSamplesBuffer1)
	{
		rawSamples = m_rawSamplesBuffer1;
		m_pSamplingBufferPtr = m_rawSamplesBuffer2;
	}
	else
	{
		rawSamples = m_rawSamplesBuffer2;
		m_pSamplingBufferPtr = m_rawSamplesBuffer1;
	}
	m_lastContextSwitch = 0;

	// Find min/max time. The entry with the smallest non-zero time is where
	// the time-ordered sequence starts.
	int minTimeIndex = 0;
	int64 minTime = rawSamples[0].time;
	int64 maxTime = rawSamples[0].time;
	for (i = 0; i < nLastContextSwitch; i++)
	{
		if (rawSamples[i].time < minTime && rawSamples[i].time != 0)
		{
			minTime = rawSamples[i].time;
			minTimeIndex = i;
		}
		if (rawSamples[i].time > maxTime)
		{
			maxTime = rawSamples[i].time;
		}
	}

	m_referenceTime = maxTime - period;

	ZeroStruct(nSamplesPerThread);

	// Accumulate samples per HW thread. Visiting order is
	// minTimeIndex .. end, then 0 .. minTimeIndex-1, i.e. the buffer is walked
	// as a ring starting at the oldest entry.
	for (int n = 0; n < nLastContextSwitch; n++)
	{
		i = minTimeIndex + n;
		if (i >= nLastContextSwitch)
			i -= nLastContextSwitch;

		// Keep only samples that fall inside the window ending at maxTime.
		if (rawSamples[i].time > m_referenceTime)
		{
			const int hwThread = rawSamples[i].hwThread;
			m_samples[hwThread][ nSamplesPerThread[hwThread]++ ] = rawSamples[i];
		}
	}

	return true;
}

// Colors reported by CreateSpanListForThread, indexed by hardware thread
// number (0..5). The table is only ever read, hence const.
static const uint32 processor_colors[6] = 
{
	RGBA8(255,0,0,0xFF),
	RGBA8(255,150,0,0xFF),
	RGBA8(0,255,0,0xFF),
	RGBA8(200,255,0,0xFF),
	RGBA8(0,0,255,0xFF),
	RGBA8(0,150,255,0xFF),
};

//////////////////////////////////////////////////////////////////////////
// Builds the list of pixel spans during which a thread was running, from the
// samples gathered by the last MakeSnapshot.
//   processId   - hardware thread to inspect when threadId is -1; values
//                 outside the known hardware threads fall back to 0.
//   threadId    - thread to look for, or -1 to look for thread id 0 on the
//                 hardware thread given by processId.
//   spans       - spans are appended here; a span touching the last existing
//                 span extends that span instead of adding a new one.
//   width       - width of the output range; span ends are limited to width-1.
//   scale       - the displayed time range is m_ticksPerSecond*scale/2 ticks.
//   totalTime   - receives the summed run time in 1/1000 of the time range.
//   ProcessorId - receives the hardware thread that was inspected.
//   color       - receives the color associated with that hardware thread.
// If the tick frequency or scale is zero the function returns without
// touching spans or any of the output parameters.
void CXenonThreadSampler::CreateSpanListForThread( uint32 processId,uint32 threadId,std::vector<Span>& spans,uint32 width,uint32 scale,uint32* totalTime,int *ProcessorId,uint32 *color )
{
	// Both values end up as divisors below.
	if (!m_ticksPerSecond || scale == 0)
		return;

	const uint32 numHwThreads = sizeof(processor_colors)/sizeof(processor_colors[0]);

	int hwThread = 0;
	if (threadId != (uint32)-1)
	{
		// Zero the structure first so that a failed query yields hardware
		// thread 0 instead of whatever happened to be on the stack.
		DM_THREADINFOEX threadInfo;
		ZeroStruct(threadInfo);
		threadInfo.Size = sizeof(threadInfo);
		DmGetThreadInfoEx( threadId,&threadInfo );
		hwThread = threadInfo.CurrentProcessor % numHwThreads;
	}
	else
	{
		threadId = 0;
		// processId is unsigned, so a range check (not ">= 0") is what keeps
		// values such as (uint32)-1 from being used as an array index.
		if (processId < numHwThreads)
			hwThread = processId;
	}

	*ProcessorId = hwThread;
	*color = processor_colors[hwThread];

	// Displayed time range in ticks.
	const int64 period = m_ticksPerSecond*scale/2;
	const int nAccumulatedSamplesPerHwThread = nSamplesPerThread[hwThread];

	int64 count = 0;

	for (int i = 0; i < nAccumulatedSamplesPerHwThread-1; i++ )
	{
		// The thread runs between a switch to it and the directly following
		// switch away from it; skip every other pair of samples.
		if (m_samples[hwThread][i].toThreadId != threadId || m_samples[hwThread][i+1].fromThreadId != threadId)
			continue;

		int64 start = m_samples[hwThread][i].time - m_referenceTime;
		int64 end = m_samples[hwThread][i+1].time - m_referenceTime;

		if (end < start)
			continue;

		if (start > period)
			break;
		if (end > period)
			end = period;

		count += end - start;

		// Convert ticks to positions in [0,width).
		start = start*width*2/(m_ticksPerSecond*scale);
		end = end*width*2/(m_ticksPerSecond*scale);

		if (start>=width)
			break;

		// Make sure very short runs are still visible.
		if (end == start)
			end++;

		if (end > width-1)
			end = width-1;

		//try to merge with previous span
		if (!spans.empty() && spans.back().end >= (uint16)start)
		{
			spans.back().end = (uint16)end;
		}
		else
		{
			Span span;
			span.start = (uint16)start;
			span.end = (uint16)end;
			spans.push_back(span);
		}
	}

	// Rounded share of the time range, in 1/1000.
	*totalTime = (uint32)((count*1000+period/2)/period);
}

// Ordering predicate for (processor, thread id) pairs: a pair comes first when
// its processor number is smaller. The thread id does not take part.
inline bool SortThreadsFunction( std::pair<int,int> thread1,std::pair<int,int> thread2 )
{
	const int firstProcessor = thread1.first;
	const int secondProcessor = thread2.first;
	return secondProcessor > firstProcessor;
}

//////////////////////////////////////////////////////////////////////////
void CXenonThreadSampler::EnumerateThreads( int nProcessId )
{
	DWORD numThreads;
	DWORD pThreads[256];
	numThreads = sizeof(pThreads);
	if (XBDM_NOERR == DmGetThreadList( pThreads,&numThreads ))
	{
		// Sort threads by core
		std::vector< std::pair<int,int> > all_threads;
		all_threads.resize(numThreads);

		for (uint32 i = 0; i < numThreads; i++)
		{
			DM_THREADINFOEX threadInfo;
			threadInfo.Size = sizeof(threadInfo);
			DmGetThreadInfoEx( pThreads[i],&threadInfo );
			all_threads[i].first = threadInfo.CurrentProcessor;
			all_threads[i].second = pThreads[i];
		}
		std::stable_sort( all_threads.begin(), all_threads.end(), SortThreadsFunction );

		threads.resize(numThreads);
		for (uint32 i = 0; i < numThreads; i++)
			threads[i] = all_threads[i].second;
	}
}

#endif //XENON