#ifndef __PERFORMANCE_CLOCK_H
#define __PERFORMANCE_CLOCK_H

#include <stdint.h>

#if __PPU__
/**
 * E Here is the PPU implementation of the performance clock.
 *
 * Usage:
 * 1. Initialize one or more clocks with:
 *    CLOCK(CLOCK_NAME, "Clock Description");
 *  e.g. CLOCK(FULL_FRAME, "Full frame time");
 * 2. Measure time with:
 *    CLOCK_START(CLOCK_NAME);
 *     ... (your code here)
 *    CLOCK_END(CLOCK_NAME);
 *  e.g. 
 *    CLOCK_START(FULL_FRAME);
 *     Update, draw, etc;
 *     swapBuffers();
 *    CLOCK_END(FULL_FRAME);
 * 3. Report (and optionally reset back to zero) clock occasionally.
 *    CLOCK_REPORT_AND_RESET(CLOCK_NAME, fReportScale);
 *    e.g.
 *     if (iFrame++ % 100 == 0)
 *        CLOCK_REPORT_AND_RESET(FULL_FRAME, 0.01);
 *    The 0.01 multiplies the accumulated total by 1/100.  Since we are
 *    reporting every 100 frames, this gives the average per-frame time.
 *    (The min and max figures are per-measurement and are not scaled.)
 *    If you don't want to reset the clock use:
 *        CLOCK_REPORT(FULL_FRAME, 1.0f / iFrame);
 *    You can also use CLOCK_READ(CLOCK_NAME, fReportScale); to retrieve
 *    the current scaled total of the clock, in milliseconds, as a float.
 *
 * Note 1: You can use CLOCK_CONSOLE_REPORT/CLOCK_CONSOLE_REPORT_AND_RESET
 *    to output results to a different logical console window.  This is good
 *    if you want to separate timing from all other printf's.
 * Note 2: If you want to share a clock across source files, use:
 *    EXTERN_CLOCK(CLOCK_NAME);
 *    to specify that a clock has been declared elsewhere.
 *
 */

#include "clock.h"

#include <sys/adhoc.h>
#include <stdio.h>
#include <stdarg.h>

inline void _CLOCK_CONSOLE_WRITE(console_t cid, const char *pcFormat, ...);

// E Accumulated timing state for one named performance clock (PPU).
typedef struct performanceClockData {
	uint64_t ulTotal;    // E Sum of all measured intervals since the last reset
	uint64_t ulCurrent;  // E Value stored by the most recent CLOCK_START
	uint64_t ulMax;      // E Longest single interval since the last reset
	uint64_t ulMin;      // E Shortest single interval since the last reset
	const char *pcName;  // E Description printed by the report macros
} performanceClockData;

/**
 * E Note:
 * I appreciate that these macros are a little hard to read.  The reason is
 * that multi-line macros (that end in \) can often become a problem when files
 * are moved back and forth between a windows environment and a linux
 * environment, with CR-LF inconsistencies causing problems.
 */

// E Defines the storage for a clock: a zeroed performanceClockData object
// E called perfClock<handle>Data whose description string is `name`.
#define CLOCK(handle, name) performanceClockData perfClock##handle##Data = { 0, 0, 0, 0, name }

// E Declares (without defining) a clock whose CLOCK(...) definition lives in
// E another source file, so that it can be timed and reported from this one.
// E Note that the expansion already ends with a semicolon.
#define EXTERN_CLOCK(handle) extern performanceClockData perfClock##handle##Data;
// E Begins a measurement: stores the value returned by clockInitEx() in the
// E clock's ulCurrent field.  The expansion already ends with a semicolon,
// E so CLOCK_START(X); produces an additional empty statement; take care
// E when using it as the unbraced body of an if/else.
#define CLOCK_START(handle) perfClock##handle##Data.ulCurrent = clockInitEx();

// E Ends a measurement.  The interval that clockCyclesEx() returns for the
// E stored start value is added to the total and folded into min/max.
// E A zero ulMax marks a clock with no recorded interval yet, so the first
// E interval after a reset always becomes the minimum.
#define CLOCK_END(handle) do { const uint64_t ulElapsed = clockCyclesEx(perfClock##handle##Data.ulCurrent); perfClock##handle##Data.ulTotal += ulElapsed; if (perfClock##handle##Data.ulMax == 0 || ulElapsed < perfClock##handle##Data.ulMin) { perfClock##handle##Data.ulMin = ulElapsed; } if (ulElapsed > perfClock##handle##Data.ulMax) { perfClock##handle##Data.ulMax = ulElapsed; } } while (0)

// E Clears the accumulated statistics (total, max and min) of a clock.
// E The stored start value and the description are left untouched.
#define CLOCK_RESET(handle) do { perfClock##handle##Data.ulMin = 0; perfClock##handle##Data.ulMax = 0; perfClock##handle##Data.ulTotal = 0; } while (0)

// E Prints the clock with printf, but doesn't reset it.
// E Output: description, scaled total in ms, shortest and longest single
// E interval in ms (these two are NOT scaled), and the rate per second that
// E corresponds to the scaled total.
// E Scale multiplies the accumulated total so as to get an average time.
// E For example, if you call CLOCK_REPORT every 100 frames, you can set scale
// E to 0.01f so that the numbers end up being per-frame numbers instead of
// E aggregate numbers over 100 frames.  This allows you to easily change the
// E sampling rate and still get numbers that are consistent with each other.
// E The scale argument is parenthesized in the expansion so that expressions
// E such as (a + b) scale correctly; it is evaluated twice, so it must not
// E have side effects.  The clock frequency is read once per expansion site
// E and cached in a static.
#define CLOCK_REPORT(handle, scale) do {static float fFrequency=clockGetFrequency(); printf("%25s: %5.2fms, min=%5.2fms, max=%5.2fms (%.3f/sec)\n", perfClock##handle##Data.pcName, (float) (perfClock##handle##Data.ulTotal) * (scale) * 1000.0f / fFrequency, (float) (perfClock##handle##Data.ulMin) * 1000.0f / fFrequency, (float) (perfClock##handle##Data.ulMax) * 1000.0f / fFrequency, (float) fFrequency / (float) ((perfClock##handle##Data.ulTotal) * (scale))); } while (0)

// E Convenience wrapper: prints the clock via CLOCK_REPORT and then clears
// E its statistics via CLOCK_RESET, in that order.
#define CLOCK_REPORT_AND_RESET(handle, scale) do { CLOCK_REPORT(handle, scale); CLOCK_RESET(handle); } while (0)

// E Evaluates to the clock's accumulated total, multiplied by scale and
// E converted to milliseconds.  The clock itself is not modified.
#define CLOCK_READ(handle, scale) _perfClockRead(&perfClock##handle##Data, scale)

// E Declared separately so that the always_inline attribute can be attached.
inline float _perfClockRead(performanceClockData *pData, float fScale)
	 __attribute__ ((always_inline));

/**
 * E Helper behind CLOCK_READ.
 *
 * pData  - clock to read; only its ulTotal field is used.
 * fScale - multiplier applied to the total (e.g. 1/frames for an average).
 *
 * Returns ulTotal * fScale * 1000 / clockGetFrequency().
 */
inline float _perfClockRead(performanceClockData *pData, float fScale) {
	// E The conversion factor is computed once and reused by later calls.
	static float fTicksToMs = 1000.0f / clockGetFrequency();
	const float fScaledTicks = (float) pData->ulTotal * fScale;

	return fScaledTicks * fTicksToMs;
}

// E Reports the clock to a specified logical console (without resetting it).
// E The text is the same as for CLOCK_REPORT: description, scaled total in
// E ms, unscaled min and max interval in ms, and the rate per second.
// E The scale argument is parenthesized in the expansion so that expressions
// E such as (a + b) scale correctly; it is evaluated twice, so it must not
// E have side effects.
#define CLOCK_CONSOLE_REPORT(console, handle, scale) do {static float fFrequency=clockGetFrequency(); _CLOCK_CONSOLE_WRITE(console, "%25s: %5.2fms, min=%5.2fms, max=%5.2fms (%.3f/sec)\n", perfClock##handle##Data.pcName, (float) (perfClock##handle##Data.ulTotal) * (scale) * 1000.0f / fFrequency, (float) (perfClock##handle##Data.ulMin) * 1000.0f / fFrequency, (float) (perfClock##handle##Data.ulMax) * 1000.0f / fFrequency, (float) fFrequency / (float) ((perfClock##handle##Data.ulTotal) * (scale)));} while (0)

// E Convenience wrapper: writes the clock to the given logical console via
// E CLOCK_CONSOLE_REPORT and then clears its statistics via CLOCK_RESET.
#define CLOCK_CONSOLE_REPORT_AND_RESET(console, handle, scale) do { CLOCK_CONSOLE_REPORT(console, handle, scale); CLOCK_RESET(handle); } while (0)

/**
 * E printf-style output to a logical console.
 *
 * cid      - console handle passed through to logical_console_putc().
 * pcFormat - printf-style format string, followed by its arguments.
 *
 * The text is formatted into a static 4096-byte buffer that is reused by
 * every call (vsnprintf truncates anything longer), and is then written to
 * the console one character at a time.
 */
inline void _CLOCK_CONSOLE_WRITE(console_t cid, const char *pcFormat, ...) {
	static char acText[4096];
	const char *pcNext = acText;
	va_list vaArgs;

	va_start(vaArgs, pcFormat);
	vsnprintf(acText, sizeof(acText), pcFormat, vaArgs);
	va_end(vaArgs);

	// E Emit everything up to (but not including) the terminating NUL.
	while (*pcNext != '\0') {
		logical_console_putc(cid, *pcNext);
		pcNext++;
	}
}

// E On the PPU this expands to nothing; a timer reset is only needed on the
// E SPU.  It is defined here so that code which calls CLOCK_RESET_TIMER()
// E compiles unchanged for the PPU.
#define CLOCK_RESET_TIMER()







#elif __SPU__
/**
 * E Here is the SPU implementation of the performance clock.
 *
 * The usage is almost exactly the same as on the PPU, however, you must
 * remember to call CLOCK_RESET_TIMER() occasionally, but not while any
 * clocks are being timed (i.e. not between a CLOCK_START and CLOCK_END call).
 * The SPU clock uses the decrementer, and CLOCK_RESET_TIMER() sets the
 * decrementer to a large value.  If you do not call it, the decrementer will
 * wrap around and your timing will be incorrect.
 *
 * See PPU comments above for details, but in summary:
 *
 * 0. CLOCK_RESET_TIMER(); (must do this initially)
 * 1. CLOCK(CLOCK_NAME, "Clock description");
 * 2. CLOCK_START(CLOCK_NAME);
 *    ... code to measure
 *    CLOCK_END(CLOCK_NAME);
 * 3. CLOCK_REPORT_AND_RESET(CLOCK_NAME, fScale);
 * 4. if (new frame started or a long time has elapsed) CLOCK_RESET_TIMER();
 *
 * Note 1:
 *   CLOCK_REPORT/CLOCK_REPORT_AND_RESET assume that you have spu_printf
 *    working
 * Note 2:
 *   There is no CLOCK_CONSOLE_REPORT/CLOCK_CONSOLE_REPORT_AND_RESET support
 *   on the SPU
 *
 */

// E Accumulated timing state for one named performance clock (SPU).
typedef struct spuPerformanceClockData {
	uint64_t ulTotal;    // E Sum of all measured decrementer deltas since the last reset
	uint64_t ulCurrent;  // E Decrementer value captured by the most recent CLOCK_START
	const char *pcName;  // E Description printed by CLOCK_REPORT
} spuPerformanceClockData;

// E Timebase frequency, in ticks per second, that the SPU report assumes when
// E converting decrementer ticks to time.  This is an assumption: call
// E sys_time_get_timebase_frequency() on the PPU to confirm the real value.
#define SPU_ASSUMED_TIMEBASE_FREQUENCY (80 * 1000000)

#include <cell/mfc_io.h>
#include <spu_printf.h>


/**
 * E Note:
 * I appreciate that these macros are a little hard to read.  The reason is
 * that multi-line macros (that end in \) can often become a problem when files
 * are moved back and forth between a windows environment and a linux
 * environment, with CR-LF inconsistencies causing problems.
 */

// E Defines the storage for a clock: a zeroed spuPerformanceClockData object
// E called perfClock<handle>Data whose description string is `name`.
#define CLOCK(handle, name) spuPerformanceClockData perfClock##handle##Data = { 0, 0, name }

// E Declares (without defining) a clock whose CLOCK(...) definition lives in
// E another source file, so that it can be timed and reported from this one.
// E Note that the expansion already ends with a semicolon.
#define EXTERN_CLOCK(handle) extern spuPerformanceClockData perfClock##handle##Data;
// E Begins a measurement by capturing the current decrementer value in the
// E clock's ulCurrent field.
// E Debugging variant that also emits an assembler comment marking the spot:
//#define CLOCK_START(handle) do {asm volatile ("# starting clock " #handle ); perfClock##handle##Data.ulCurrent=spu_read_decrementer(); } while (0)
#define CLOCK_START(handle) perfClock##handle##Data.ulCurrent = spu_read_decrementer()

// E Ends timing this clock: adds the number of decrementer ticks elapsed
// E since CLOCK_START to the running total.
// E The elapsed count is (start value - current value).  The difference is
// E truncated to 32 bits, the width of the decrementer, so that a wrap of the
// E decrementer between CLOCK_START and CLOCK_END still gives the right tick
// E count.  Without the truncation the 64-bit subtraction would underflow
// E and add a huge bogus value to the total.
// E Debugging variant that also emits an assembler comment marking the spot:
//#define CLOCK_END(handle) do {perfClock##handle##Data.ulTotal+=(uint32_t) (perfClock##handle##Data.ulCurrent-spu_read_decrementer()); asm volatile ("# ending clock " #handle );  } while (0)
#define CLOCK_END(handle) perfClock##handle##Data.ulTotal+=(uint32_t) (perfClock##handle##Data.ulCurrent-spu_read_decrementer())

// E Clears the accumulated total of a clock.  The stored start value and the
// E description are left untouched.
#define CLOCK_RESET(handle) perfClock##handle##Data.ulTotal = 0

// E Reloads the SPU decrementer with its largest value.  Call this now and
// E then, but never between a CLOCK_START and its CLOCK_END, so that the
// E counter does not run out while a clock is being timed.
#define CLOCK_RESET_TIMER() spu_write_decrementer(0xFFFFFFFFU)

// E Prints the clock with spu_printf, but doesn't reset it.
// E Output: description, scaled total in ms, and the rate per second that
// E corresponds to the scaled total.  Times are derived from
// E SPU_ASSUMED_TIMEBASE_FREQUENCY.
// E Scale multiplies the accumulated total so as to get an average time.
// E For example, if you call CLOCK_REPORT every 100 frames, you can set scale
// E to 0.01f so that the numbers end up being per-frame numbers instead of
// E aggregate numbers over 100 frames.  This allows you to easily change the
// E sampling rate and still get numbers that are consistent with each other.
// E The scale argument is parenthesized in the expansion so that expressions
// E such as (a + b) scale correctly; it is evaluated twice, so it must not
// E have side effects.
#define CLOCK_REPORT(handle, scale) do {static float fFrequency=SPU_ASSUMED_TIMEBASE_FREQUENCY; spu_printf("%25s: %5.2fms (%.3f/sec)\n", perfClock##handle##Data.pcName, (float) (perfClock##handle##Data.ulTotal) * (scale) * 1000.0f / fFrequency, (float) fFrequency / (float) ((perfClock##handle##Data.ulTotal) * (scale))); } while (0)

// E Returns the value of the current clock without resetting it.
#define CLOCK_READ(handle, scale) _perfClockRead(&perfClock##handle##Data, scale)

inline float _perfClockRead(performanceClockData *pData, float fScale)
	 __attribute__ ((always_inline)) {
	static float fFrequencyScale=1000.0f / clockGetFrequency();
	return (float) pData->ulTotal * fScale * fFrequencyScale;
}

// E Convenience wrapper: prints the clock via CLOCK_REPORT and then clears
// E its total via CLOCK_RESET, in that order.
#define CLOCK_REPORT_AND_RESET(handle, scale) do { CLOCK_REPORT(handle, scale); CLOCK_RESET(handle); } while (0)

// E The SPU has no logical console output.  These macros exist so that code
// E written for the PPU still compiles: the console argument is ignored and
// E the report goes through CLOCK_REPORT / CLOCK_REPORT_AND_RESET instead.
#define CLOCK_CONSOLE_REPORT(console, handle, scale) CLOCK_REPORT(handle, scale)
#define CLOCK_CONSOLE_REPORT_AND_RESET(console, handle, scale) CLOCK_REPORT_AND_RESET(handle, scale)

#else

// E Neither PPU nor SPU over here...
// E This could be adapted to run on a different platform or a different O/S.
// E Until then every clock macro expands to nothing, so instrumented code
// E still compiles.  CLOCK_READ is the only macro that is used as a value,
// E so it evaluates to 0.0f here.

#define CLOCK(handle, name)
#define EXTERN_CLOCK(handle)
#define CLOCK_START(handle)
#define CLOCK_END(handle)
#define CLOCK_RESET(handle)
#define CLOCK_RESET_TIMER()
#define CLOCK_REPORT(handle, scale)
#define CLOCK_REPORT_AND_RESET(handle, scale)
#define CLOCK_READ(handle, scale) (0.0f)
#define CLOCK_CONSOLE_REPORT(console, handle, scale)
#define CLOCK_CONSOLE_REPORT_AND_RESET(console, handle, scale)

#endif

#endif
