// Utility for analyzing a memory allocation trace created by the MTrace
// memory profiler.

#include "MTraceDump.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <malloc.h>
#include <assert.h>
#include <errno.h>
#include <getopt.h>
#include <unistd.h>

#include <map>
#include <vector>
#include <string>
#include <algorithm>

// Returns the allocator function name that belongs to a memory operation
// code, or "<INVALID>" when the code is none of the known FN_* values.
const char *MemOpFnName(MemOpFn fn)
{
	const char *name = "<INVALID>";

	switch (fn)
	{
	case FN_malloc:   name = "malloc";   break;
	case FN_calloc:   name = "calloc";   break;
	case FN_realloc:  name = "realloc";  break;
	case FN_free:     name = "free";     break;
	case FN_valloc:   name = "valloc";   break;
	case FN_memalign: name = "memalign"; break;
	}
	return name;
}

uint32_t MemOp::FreedSize(const Profile &profile) const
{
	if (m_AllocId != ~0U)
	{
		const MemOp &allocOp = profile.GetOp(m_AllocId);
		return allocOp.Size();
	}
	else
		return 0;
}

bool MemOp::IsDescendantOf(const Profile &profile, unsigned nodeId) const
{
	const unsigned parent = m_Parent;

	if (parent == ~0U) return false;
	if (nodeId == parent) return true;
	return profile.GetNode(parent).IsDescendantOf(profile, nodeId);
}

bool MemOp::IsDescendantOf(
		const Profile &profile,
		const std::vector<bool> &nodeIds
		) const
{
	const unsigned parent = m_Parent;

	if (parent == ~0U) return false;
	if (parent < nodeIds.size() && nodeIds[parent]) return true;
	return profile.GetNode(parent).IsDescendantOf(profile, nodeIds);
}

// Prints this operation as a single line in call syntax, e.g.
// "malloc(16) = 0x0804a008", indented by two spaces per 'depth' level.
// For realloc() and free() the size of the released block is appended when
// it is known (non-zero).
void MemOp::Dump(
		const Profile &profile,
		FILE *out,
		unsigned depth
		) const
{
	for (unsigned level = depth; level > 0; --level)
		fputs("  ", out);

	const unsigned arg0 = static_cast<unsigned>(m_Args[0]);
	const unsigned arg1 = static_cast<unsigned>(m_Args[1]);
	const unsigned arg2 = static_cast<unsigned>(m_Args[2]);

	switch (m_Fn)
	{
	case FN_malloc:
		fprintf(out, "malloc(%u) = 0x%08x", arg0, arg1);
		break;
	case FN_calloc:
		fprintf(out, "calloc(%u, %u) = 0x%08x", arg0, arg1, arg2);
		break;
	case FN_realloc:
		fprintf(out, "realloc(0x%08x, %u) = 0x%08x", arg0, arg1, arg2);
		break;
	case FN_free:
		fprintf(out, "free(0x%08x)", arg0);
		break;
	case FN_valloc:
		fprintf(out, "valloc(%u) = 0x%08x", arg0, arg1);
		break;
	case FN_memalign:
		fprintf(out, "memalign(%u, %u) = 0x%08x", arg0, arg1, arg2);
		break;
	}

	// Only operations that release memory carry a freed size.
	const bool releasesMemory = (m_Fn == FN_free || m_Fn == FN_realloc);
	const uint32_t freedSize = releasesMemory ? FreedSize(profile) : 0;
	if (freedSize != 0)
		fprintf(out, " [%u bytes]", static_cast<unsigned>(freedSize));

	fputc('\n', out);
}

// Prints the operation itself followed by its call stack: frame #0 is the
// operation's own code address, the remaining frames come from the chain of
// parent nodes.  The operation line and frame #0 are prefixed with 'indent'
// spaces; 'indent' is also handed on to the parent node's stack dump.
void MemOp::DumpStack(
		const Profile &profile,
		FILE *out,
		unsigned indent
		) const
{
	unsigned column;

	for (column = 0; column != indent; ++column)
		fputc(' ', out);
	Dump(profile, out, 0);

	for (column = 0; column != indent; ++column)
		fputc(' ', out);
	profile.DumpFrame(out, m_Addr, 0);

	if (m_Parent == ~0U)
		return;
	profile.GetNode(m_Parent).DumpStack(profile, out, 1, indent);
}

void MemOp::GetStack(
		const Profile &profile,
		std::vector<std::string> &stack
		) const
{
	const uint32_t addr = m_Addr;
	const CAddr &cAddr = profile.GetCAddr(addr);
	const char *desc = cAddr.Desc();

	if (desc == NULL)
	{
		char buffer[128];
		snprintf(buffer, sizeof buffer, "0x%08x", static_cast<unsigned>(addr));
		buffer[sizeof buffer - 1] = 0;
		stack.push_back(buffer);
	}
	else
		stack.push_back(desc);
	if (m_Parent != ~0U)
		profile.GetNode(m_Parent).GetStack(profile, stack);
}

bool Node::IsDescendantOf(const Profile &profile, unsigned nodeId) const
{
	const unsigned parent = m_Parent;

	if (parent == ~0U) return false;
	if (nodeId == parent) return true;
	return profile.GetNode(parent).IsDescendantOf(profile, nodeId);
}

bool Node::IsDescendantOf(
		const Profile &profile,
		const std::vector<bool> &nodeIds
		) const
{
	const unsigned parent = m_Parent;

	if (parent == ~0U) return false;
	if (parent < nodeIds.size() && nodeIds[parent]) return true;
	return profile.GetNode(parent).IsDescendantOf(profile, nodeIds);
}

void Node::Dump(
		const Profile &profile,
		FILE *out,
		unsigned depth,
		Threshold threshold
		) const
{
	for (unsigned i = 0; i < depth; ++i) fputs("  ", out);
	DumpSelf(profile, out);
	fputc('\n', out);

	if (depth >= threshold.m_MaxDepth) return;
	unsigned opCount = m_AllocCount + m_ReallocCount + m_FreeCount;
	if (opCount < threshold.m_MinOpCount
			&& m_Allocated < threshold.m_MinAllocated)
	{
		if (threshold.m_DepthBelow == 0) return;
		threshold.m_MaxDepth = std::min(
				threshold.m_MaxDepth,
				depth + threshold.m_DepthBelow);
	}

	unsigned nChildren = m_nChildren;
	unsigned *children = m_Children;
	for (unsigned i = 0; i < nChildren; ++i)
	{
		unsigned childId = children[i];
		const Node &child = profile.GetNode(childId);
		child.Dump(profile, out, depth + 1, threshold);
	}

	DumpOps(profile, out, depth);
}

void Node::Dump(
		const Profile &profile,
		FILE *out,
		unsigned depth,
		const ProfileType &profileType,
		uint64_t threshold
		) const
{
	const uint64_t value = profileType.NodeValue(profile.GetNodeId(*this));

	for (unsigned i = 0; i < depth; ++i) fputs("  ", out);
	fprintf(out, "[%u] %s ", depth, ByteCountToString(value).c_str());
	DumpSelf(profile, out);
	fputc('\n', out);

	if (value < threshold) return;

	unsigned nChildren = m_nChildren;
	unsigned *children = m_Children;
	bool skip = true;
	for (unsigned i = 0; i < nChildren; ++i)
	{
		unsigned childId = children[i];
		uint64_t childValue = profileType.NodeValue(childId);
		if (childValue >= threshold) { skip = false; break; }
	}
	if (!skip)
	{
		for (unsigned i = 0; i < nChildren; ++i)
		{
			unsigned childId = children[i];
			uint64_t childValue = profileType.NodeValue(childId);
			if (childValue < threshold / 10) continue;
			const Node &child = profile.GetNode(childId);
			child.Dump(profile, out, depth + 1, profileType, threshold);
		}
	}

	DumpOps(profile, out, depth);
}

// Prints the node's code address followed by its counters
// (M: allocations, R: re-allocations, F: frees, A: allocated bytes/blocks,
// T: total allocated).  With 'numeric' set the address is printed raw
// together with the node's index; otherwise it is printed via
// Profile::DumpAddr().
void Node::DumpSelf(const Profile &profile, FILE *out, bool numeric) const
{
	if (!numeric)
		profile.DumpAddr(out, m_Addr);
	else
	{
		// The node index is the distance from the node with ID 0.
		const unsigned nodeId =
			static_cast<unsigned>(this - &profile.GetNode(0));
		fprintf(out, "<0x%08x node #%u>", static_cast<unsigned>(m_Addr), nodeId);
	}

	fprintf(out,
			" (M:%u, R:%u, F:%u, A:%u/%u, T:%u)",
			m_AllocCount, m_ReallocCount, m_FreeCount,
			m_Allocated, m_AllocatedBlocks, m_TotalAllocated);
}

void Node::DumpStack(
		const Profile &profile,
		FILE *out,
		unsigned frame,
		unsigned indent
		) const
{
	for (unsigned i = 0; i < indent; ++i) fputc(' ', out);
	profile.DumpFrame(out, m_Addr, frame);
	if (m_Parent != ~0U)
		profile.GetNode(m_Parent).DumpStack(profile, out, frame + 1, indent);
}

void Node::GetStack(
		const Profile &profile,
		std::vector<std::string> &stack
		) const
{
	const uint32_t addr = m_Addr;
	const CAddr &cAddr = profile.GetCAddr(addr);
	const char *desc = cAddr.Desc();

	if (desc == NULL)
	{
		char buffer[128];
		snprintf(buffer, sizeof buffer, "0x%08x", static_cast<unsigned>(addr));
		buffer[sizeof buffer - 1] = 0;
		stack.push_back(buffer);
	}
	else
		stack.push_back(desc);
	if (m_Parent != ~0U)
		profile.GetNode(m_Parent).GetStack(profile, stack);
}

void Node::DumpOps(const Profile &profile, FILE *out, unsigned depth) const
{
	static const unsigned maxDumpOps = 5;
	unsigned nOps = static_cast<unsigned>(m_Ops.size());
	const unsigned *ops = &m_Ops.front();
	unsigned nDisplayOps = std::min(nOps, maxDumpOps);

	for (unsigned i = 0; i < nDisplayOps; ++i)
	{
		unsigned opId = ops[i];
		const MemOp &op = profile.GetOp(opId);
		op.Dump(profile, out, depth + 1);
	}
	if (nDisplayOps < nOps)
	{
		for (unsigned i = 0; i < depth; ++i) fputs("  ", out);
		fprintf(out, "  ... %u more operation%s\n",
				nOps - nDisplayOps,
				(nOps - nDisplayOps) == 1 ? "" : "s");
	}

	if (nOps > maxDumpOps)
	{
		unsigned nMalloc = 0, nCalloc = 0, nRealloc = 0;
		unsigned nFree = 0, nValloc = 0, nMemalign = 0;
		unsigned totalMalloc = 0, totalCalloc = 0, totalRealloc = 0;
		unsigned totalFree = 0, totalValloc = 0, totalMemalign = 0;
		for (unsigned i = 0; i < nOps; ++i)
		{
			unsigned opId = ops[i];
			const MemOp &op = profile.GetOp(opId);
			switch (op.Fn())
			{
			case FN_calloc:
				++nCalloc;
				totalCalloc += op.Size();
				break;
			case FN_malloc:
				++nMalloc;
				totalMalloc += op.Size();
				break;
			case FN_realloc:
				++nRealloc;
				totalRealloc += op.Size();
				break;
			case FN_valloc:
				++nValloc;
				totalValloc += op.Size();
				break;
			case FN_memalign:
				++nMemalign;
				totalMemalign += op.Size();
				break;
			case FN_free:
				++nFree;
				totalFree += op.FreedSize(profile);
			}
		}
		if (nCalloc + nMalloc + nRealloc + nValloc + nMemalign + nFree > 0)
		{
			for (unsigned i = 0; i <= depth; ++i) fputs("  ", out);
			bool comma = false;
			if (nCalloc > 0)
			{
				fprintf(out, "calloc: %u/%u",
						static_cast<unsigned>(totalCalloc),
						static_cast<unsigned>(nCalloc));
				comma = true;
			}
			if (nMalloc > 0)
			{
				fprintf(out, "%smalloc: %u/%u",
						comma ? ", " : "",
						static_cast<unsigned>(totalMalloc),
						static_cast<unsigned>(nMalloc));
				comma = true;
			}
			if (nRealloc > 0)
			{
				fprintf(out, "%srealloc: %u/%u",
						comma ? ", " : "",
						static_cast<unsigned>(totalRealloc),
						static_cast<unsigned>(nRealloc));
				comma = true;
			}
			if (nValloc > 0)
			{
				fprintf(out, "%svalloc: %u/%u",
						comma ? ", " : "",
						static_cast<unsigned>(totalValloc),
						static_cast<unsigned>(nValloc));
				comma = true;
			}
			if (nMemalign > 0)
			{
				fprintf(out, "%smemalign: %u/%u",
						comma ? ", " : "",
						static_cast<unsigned>(totalMemalign),
						static_cast<unsigned>(nMemalign));
				comma = true;
			}
			if (nFree > 0)
			{
				if (totalFree > 0)
					fprintf(out, "%sfree: %u/%u",
							comma ? ", " : "",
							static_cast<unsigned>(totalFree),
							static_cast<unsigned>(nFree));
				else
					fprintf(out, "%sfree: -/%u",
							comma ? ", " : "",
							static_cast<unsigned>(nFree));
			}
			fputc('\n', out);
		}
	}
}

unsigned Thread::AllocCount(const Profile &profile) const
{
	const size_t nRoots = m_Roots.size();
	unsigned allocCount = 0;

	for (size_t i = 0; i < nRoots; ++i)
	{
		const unsigned rootId = m_Roots[i];
		allocCount += profile.GetNode(rootId).AllocCount();
	}
	return allocCount;
}

unsigned Thread::ReallocCount(const Profile &profile) const
{
	const size_t nRoots = m_Roots.size();
	unsigned reallocCount = 0;

	for (size_t i = 0; i < nRoots; ++i)
	{
		const unsigned rootId = m_Roots[i];
		reallocCount += profile.GetNode(rootId).ReallocCount();
	}
	return reallocCount;
}

unsigned Thread::FreeCount(const Profile &profile) const
{
	const size_t nRoots = m_Roots.size();
	unsigned freeCount = 0;

	for (size_t i = 0; i < nRoots; ++i)
	{
		const unsigned rootId = m_Roots[i];
		freeCount += profile.GetNode(rootId).FreeCount();
	}
	return freeCount;
}

unsigned Thread::Allocated(const Profile &profile) const
{
	const size_t nRoots = m_Roots.size();
	unsigned allocated = 0;

	for (size_t i = 0; i < nRoots; ++i)
	{
		const unsigned rootId = m_Roots[i];
		allocated += profile.GetNode(rootId).Allocated();
	}
	return allocated;
}

unsigned Thread::AllocatedBlocks(const Profile &profile) const
{
	const size_t nRoots = m_Roots.size();
	unsigned allocatedBlocks = 0;

	for (size_t i = 0; i < nRoots; ++i)
	{
		const unsigned rootId = m_Roots[i];
		allocatedBlocks += profile.GetNode(rootId).AllocatedBlocks();
	}
	return allocatedBlocks;
}

unsigned Thread::TotalAllocated(const Profile &profile) const
{
	const size_t nRoots = m_Roots.size();
	unsigned totalAllocated = 0;

	for (size_t i = 0; i < nRoots; ++i)
	{
		const unsigned rootId = m_Roots[i];
		totalAllocated += profile.GetNode(rootId).TotalAllocated();
	}
	return totalAllocated;
}

// Prints the call trees of all root nodes of this thread, enclosed in a
// <Thread id='...'> ... </Thread> pair.  'threshold' is handed to every
// root node's Dump().
void Thread::Dump(
		const Profile &profile,
		FILE *out,
		const Node::Threshold &threshold
		) const
{
	fprintf(out, "<Thread id='0x%08x'>\n", m_ThreadId);

	for (size_t i = 0; i != m_Roots.size(); ++i)
		profile.GetNode(m_Roots[i]).Dump(profile, out, 0, threshold);

	fputs("</Thread>\n", out);
}

void Thread::DumpSummary(
		const Profile &profile,
		FILE *out
		) const
{
	unsigned nRoots = static_cast<unsigned>(m_Roots.size());

	fprintf(out,
			"thread ID %u (0x%08x), %u root node%s:\n",
			static_cast<unsigned>(m_ThreadId),
			static_cast<unsigned>(m_ThreadId),
			nRoots, nRoots == 1 ? "" : "s");
	for (unsigned i = 0; i < nRoots; ++i)
	{
		fputs("  ", out);
		profile.GetNode(m_Roots[i]).DumpSelf(profile, out);
		fputc('\n', out);
	}
	Profile::DumpSummaryCounters(
			out,
			AllocCount(profile),
			ReallocCount(profile),
			FreeCount(profile),
			Allocated(profile),
			AllocatedBlocks(profile),
			TotalAllocated(profile));
}

// Prints the complete profile: a <Profile> header line with the number of
// operations, threads and nodes, the dump of every thread, and the closing
// </Profile> line.
void Profile::Dump(FILE *out, const Node::Threshold &threshold) const
{
	const unsigned long opCount = static_cast<unsigned long>(m_Ops.size());
	const unsigned long threadCount =
		static_cast<unsigned long>(m_Threads.size());
	const unsigned long nodeCount = static_cast<unsigned long>(m_Nodes.size());

	fprintf(out, "<Profile nOps='%lu' nThreads='%lu' nNodes='%lu'>\n",
			opCount, threadCount, nodeCount);

	for (size_t i = 0; i != m_Threads.size(); ++i)
	{
		const Thread &thread = m_Threads[i];
		thread.Dump(*this, out, threshold);
	}

	fputs("</Profile>\n", out);
}

void Profile::DumpSummary(FILE *out) const
{
	const size_t nThreads = m_Threads.size();
	unsigned allocCount = 0, reallocCount = 0, freeCount = 0;
	unsigned allocated = 0, allocatedBlocks = 0, totalAllocated = 0;

	fputs("Profile summary:\n", out);
	fprintf(out,
			"  counted %u memory operations (total)\n"
			"  stack tree: %u unique code addresses, %u inner nodes\n",
			static_cast<unsigned>(m_Ops.size()),
			static_cast<unsigned>(m_AddrMap.size()),
			static_cast<unsigned>(m_Nodes.size()));
	for (size_t i = 0; i < nThreads; ++i)
	{
		const Thread &thread = m_Threads[i];
		thread.DumpSummary(*this, out);
		allocCount += thread.AllocCount(*this);
		reallocCount += thread.ReallocCount(*this);
		freeCount += thread.FreeCount(*this);
		allocated += thread.Allocated(*this);
		allocatedBlocks += thread.AllocatedBlocks(*this);
		totalAllocated += thread.TotalAllocated(*this);
	}
	fputs("All threads:\n", out);
	DumpSummaryCounters(
			out,
			allocCount,
			reallocCount,
			freeCount,
			allocated,
			allocatedBlocks,
			totalAllocated);
}

// Prints a set of summary counters as five indented lines: number of
// allocations, re-allocations and frees, the allocated byte count together
// with the number of memory blocks, and the total allocated byte count.
// Byte counts are formatted with ByteCountToString().
void Profile::DumpSummaryCounters(
		FILE *out,
		unsigned allocCount,
		unsigned reallocCount,
		unsigned freeCount,
		unsigned allocated,
		unsigned allocatedBlocks,
		unsigned totalAllocated
		)
{
	const std::string allocatedText = ByteCountToString(allocated);
	const std::string totalAllocatedText = ByteCountToString(totalAllocated);

	fprintf(out,
			"  # allocations: %u\n"
			"  # re-allocations: %u\n"
			"  # frees: %u\n"
			"  %s bytes allocated (in %u memory blocks)\n"
			"  %s bytes allocated total\n",
			allocCount,
			reallocCount,
			freeCount,
			allocatedText.c_str(), allocatedBlocks,
			totalAllocatedText.c_str());
}

namespace
{
	// Shortens a symbol description of the form
	// "<symbol>+<offset> at <path>" in place:
	//
	//  - every "/./" is collapsed into "/",
	//  - the path after " at " is reduced to its last two components
	//    ("dir/file"),
	//  - a "+<digits>" offset directly in front of " at " is removed.
	//
	// Lines without " at " are left untouched.
	void SimplifySourceLocation(char *line)
	{
		if (strstr(line, " at ") == NULL) return;

		// Collapse "/./" into "/" by dropping the leading "/.".
		for (char *dotSlash; (dotSlash = strstr(line, "/./")) != NULL;)
			memmove(dotSlash, dotSlash + 2, strlen(dotSlash + 2) + 1);

		// Locate " at " only now: collapsing a "/./" in front of it shifts
		// its position, so a pointer taken earlier would be stale.
		char *const at = strstr(line, " at ");
		if (at == NULL) return;

		// Walk back over two '/' separators to keep the last two path
		// components only.
		char *const pathBegin = at + 4;
		char *tail = pathBegin + strlen(pathBegin);
		for (int i = 0; i < 2; ++i)
		{
			if (tail > pathBegin) --tail;
			while (tail > pathBegin && *tail != '/') --tail;
		}
		if (*tail == '/') ++tail;
		memmove(pathBegin, tail, strlen(tail) + 1);

		// Drop a "+<digits>" offset that directly precedes " at ".  The cast
		// keeps isdigit() defined for characters with the high bit set.
		char *offset = at;
		while (offset > line
				&& isdigit(static_cast<unsigned char>(offset[-1])))
			--offset;
		if (offset > line && offset[-1] == '+')
			memmove(offset - 1, at, strlen(at) + 1);
	}
}

// Prints a code address as "<0x%08x: description>", or as "<0x%08x>" when
// the address is not in the address map or has no description.  The
// description is passed through SimplifySourceLocation() before printing.
void Profile::DumpAddr(FILE *out, uint32_t addr) const
{
	std::set<CAddr>::const_iterator it = m_AddrMap.find(addr);
	const char *desc = (it != m_AddrMap.end()) ? it->Desc() : NULL;

	if (desc == NULL)
	{
		fprintf(out, "<0x%08x>", static_cast<unsigned>(addr));
		return;
	}

	// SimplifySourceLocation() edits in place, so work on a private copy.
	// A vector is used instead of a variable-length array: VLAs are not
	// standard C++ and would put arbitrarily long descriptions on the stack.
	std::vector<char> descBuffer(desc, desc + strlen(desc) + 1);
	SimplifySourceLocation(&descBuffer[0]);
	fprintf(out, "<0x%08x: %s>", static_cast<unsigned>(addr), &descBuffer[0]);
}

// Prints a single stack frame line: "#<frame> " followed by the address as
// printed by DumpAddr() and a newline.
void Profile::DumpFrame(FILE *out, uint32_t addr, unsigned frame) const
{
	fprintf(out, "#%u ", frame);

	DumpAddr(out, addr);

	fputc('\n', out);
}

void ProfileType::Collect(
		uint32_t addr,
		std::vector<unsigned> &referringNodes,
		std::vector<unsigned> &referringOps
		) const
{
	const CAddr &cAddr = m_Profile.GetCAddr(addr);
	referringNodes = cAddr.Nodes();
	size_t nReferringNodes = referringNodes.size();
	referringOps = cAddr.Ops();
	size_t nReferringOps = referringOps.size();
	std::vector<bool> referringNodeMap;
	unsigned maxReferringNodeId = 0;

	for (size_t i = 0; i < nReferringNodes; ++i)
		maxReferringNodeId = std::max(maxReferringNodeId, referringNodes[i]);
	referringNodeMap.assign(maxReferringNodeId + 1, false);
	for (size_t i = 0; i < nReferringNodes; ++i)
		referringNodeMap[referringNodes[i]] = true;

	// Eliminate recursive calls.
	if (nReferringOps > 0)
	{
		for (size_t i = nReferringOps - 1;; --i)
		{
			const MemOp &op = m_Profile.GetOp(referringOps[i]);
			if (op.IsDescendantOf(m_Profile, referringNodeMap))
			{
				referringOps.erase(referringOps.begin() + i);
				--nReferringOps;
			}
			if (i == 0) break;
		}
	}
	if (nReferringNodes > 0)
	{
		for (size_t i = nReferringNodes - 1;; --i)
		{
			const Node &node = m_Profile.GetNode(referringNodes[i]);
			if (node.IsDescendantOf(m_Profile, referringNodeMap))
			{
				referringNodeMap[referringNodes[i]] = false;
				referringNodes.erase(referringNodes.begin() + i);
				--nReferringNodes;
			}
			if (i == 0) break;
		}
	}
}

// Profile value of a code address: the sum of NodeValue() over the nodes
// and of OpValue() over the operations that Collect() returns for 'addr'.
uint64_t ProfileType::Value(uint32_t addr) const
{
	std::vector<unsigned> nodeIds;
	std::vector<unsigned> opIds;
	Collect(addr, nodeIds, opIds);

	uint64_t total = 0;
	for (
			std::vector<unsigned>::const_iterator it = nodeIds.begin();
			it != nodeIds.end();
			++it)
	{
		total += NodeValue(*it);
	}
	for (
			std::vector<unsigned>::const_iterator it = opIds.begin();
			it != opIds.end();
			++it)
	{
		total += OpValue(*it);
	}
	return total;
}

// Name of this profile type, as used in the profile output.
const char *ProfileAllocated::Name() const
{
	return "allocated memory";
}

// Short description of the value measured by this profile type.
const char *ProfileAllocated::Desc() const
{
	return "allocated";
}

uint64_t ProfileAllocated::NodeValue(unsigned nodeId) const
{
	const Node &node = m_Profile.GetNode(nodeId);

	return node.Allocated();
}

// Value of an operation in this profile: the size of an allocation or
// re-allocation that has no freeing operation linked to it
// (FreeId() == ~0U), 0 for everything else.
uint64_t ProfileAllocated::OpValue(unsigned opId) const
{
	const MemOp &op = m_Profile.GetOp(opId);

	const bool allocates = op.IsAlloc() || op.IsRealloc();
	if (!allocates)
		return 0;
	if (op.FreeId() != ~0U)
		return 0;
	return op.Size();
}

// Limit value of this profile type: 10 MiB.
uint64_t ProfileAllocated::Limit() const
{
	return 10 * 1024 * 1024;
}

// Name of this profile type, as used in the profile output.
const char *ProfileOpCount::Name() const
{
	return "memory operation count";
}

// Short description of the value measured by this profile type.
const char *ProfileOpCount::Desc() const
{
	return "op count";
}

uint64_t ProfileOpCount::NodeValue(unsigned nodeId) const
{
	const Node &node = m_Profile.GetNode(nodeId);

	return node.AllocCount() + node.ReallocCount() + node.FreeCount();
}

// Every operation counts as one, whatever its ID.
uint64_t ProfileOpCount::OpValue(unsigned) const
{
	return 1;
}

// Limit value of this profile type: 100 * 1024 operations.
uint64_t ProfileOpCount::Limit() const
{
	return 100 * 1024;
}

// Looks up the code address entry for 'addr'.  Unknown addresses yield a
// reference to a shared, default-constructed CAddr, so the result is always
// valid.
const CAddr &Profile::GetCAddr(uint32_t addr) const
{
	static const CAddr nullAddr;

	const std::set<CAddr>::const_iterator found = m_AddrMap.find(addr);
	return (found != m_AddrMap.end()) ? *found : nullAddr;
}

namespace
{
	// Reads a big-endian 32 bit unsigned integer from 'fp'.
	//
	// Returns 0 when fewer than four bytes could be read (end of file or
	// read error), so a short read never decodes uninitialized memory.
	// Callers tell a real 0 from a failed read via feof()/ferror() on 'fp'.
	inline uint32_t ReadUInt32(FILE *fp)
	{
		uint8_t buffer[4] = { 0, 0, 0, 0 };

		if (fread(buffer, 1, sizeof buffer, fp) != sizeof buffer)
			return 0;
		return (static_cast<uint32_t>(buffer[0]) << 24)
			| (static_cast<uint32_t>(buffer[1]) << 16)
			| (static_cast<uint32_t>(buffer[2]) << 8)
			| static_cast<uint32_t>(buffer[3]);
	}
}

int Profile::ReadOps(const char *filename)
{
	FILE *fp = fopen(filename, "rb");
	static const unsigned maxStack = 4096;

//XXX DEBUG
static const unsigned maxOps = 5000;

	if (fp == NULL)
	{
		fprintf(stderr, "error opening MTrace file '%s': %s\n",
				filename, strerror(errno));
		return -1;
	}

	assert(m_Ops.empty());

	bool error = false;
	printf("reading MTrace file '%s' ...\n", filename);
	uint8_t header_buffer[12] = { };
	fread(header_buffer, 1, sizeof header_buffer, fp);
	if (feof(fp) || memcmp(header_buffer, "MTRACE:", 8) != 0)
	{
		fprintf(stderr, "'%s' is not an MTrace file\n", filename);
		exit(EXIT_FAILURE);
	}
	uint32_t header_version = ((uint32_t)header_buffer[8] << 24)
			| ((uint32_t)header_buffer[9] << 16)
			| ((uint32_t)header_buffer[10] << 8)
			| ((uint32_t)header_buffer[11]);
	if (header_version != 1)
	{
		fprintf(stderr,
				"unsupported MTrace file format version %u for MTrace file '%s'\n",
				static_cast<unsigned>(header_version), filename);
		exit(EXIT_FAILURE);
	}

	unsigned opCount = 0;
	unsigned opCount_calloc = 0, opCount_malloc = 0, opCount_free = 0;
	unsigned opCount_realloc = 0, opCount_valloc = 0, opCount_memalign = 0;
	while (!feof(fp))
	{
		MemOpFn fn = static_cast<MemOpFn>(ReadUInt32(fp));
		uint32_t threadId = ReadUInt32(fp);
		if (feof(fp)) break;
		int nArgs = NumArgs(fn);
		if (nArgs == -1)
		{
			fprintf(stderr, "bad FN code 0x%08x in MTrace file '%s'\n",
					static_cast<unsigned>(fn), filename);
			error = true;
			break;
		}
		uint32_t args[3] = { 0, 0, 0 };
		for (int i = 0; i < nArgs; ++i) args[i] = ReadUInt32(fp);
		uint32_t stack[maxStack];
		unsigned stackDepth = 0;
		while (true)
		{
			if (stackDepth == maxStack)
			{
				fprintf(stderr, "stack too deep in MTrace file '%s'\n", filename);
				error = true;
				break;
			}
			uint32_t ip = ReadUInt32(fp);
			if (ip == ~0U || feof(fp)) break;
			stack[stackDepth] = ip;
			++stackDepth;
		}
		if (error || feof(fp)) break;
		if (stackDepth == 0)
		{
			fprintf(stderr, "missing operation stack in MTrace file '%s'\n",
					filename);
			error = true;
			break;
		}
		unsigned opId = static_cast<unsigned>(m_Ops.size());
		m_Ops.push_back(MemOp(fn, stack[0], args));
		ProcessOp(opId, threadId, stackDepth, stack);
		++opCount;
		switch (fn)
		{
		case FN_calloc: ++opCount_calloc; break;
		case FN_malloc: ++opCount_malloc; break;
		case FN_free: ++opCount_free; break;
		case FN_realloc: ++opCount_realloc; break;
		case FN_valloc: ++opCount_valloc; break;
		case FN_memalign: ++opCount_memalign; break;
		}
		if (opCount % 100000 == 0)
			printf("%u operations read (C:%u M:%u F:%u R:%u V:%u MA:%u)\n",
					opCount, opCount_calloc, opCount_malloc, opCount_free,
					opCount_realloc, opCount_valloc, opCount_memalign);
//XXX DEBUG
//if (opCount == maxOps) break;
	}
	fclose(fp);
	if (error)
		return -1;

	printf("done reading MTrace file '%s'.\n", filename);
	printf(
			"%u operations:\n"
			"  malloc: %u, calloc: %u, free: %u, realloc: %u\n"
			"  valloc: %u, memalign: %u\n",
			opCount, opCount_malloc, opCount_calloc, opCount_free,
			opCount_realloc, opCount_valloc, opCount_memalign);

	return 0;
}

void Profile::ProcessOp(
		unsigned opId,
		uint32_t threadId,
		uint32_t stackDepth,
		const uint32_t *stack
		)
{
	std::vector<Thread> &threads = m_Threads;
	std::vector<Node> &nodes = m_Nodes;
	std::vector<MemOp> &ops = m_Ops;
	size_t nThreads = threads.size();
	Thread *thread = NULL;

	// Locate the thread associated with the memory operation.
	for (size_t i = 0; i < nThreads; ++i)
	{
		if (threads[i].ThreadId() == threadId)
		{
			thread = &threads[i];
			break;
		}
	}
	if (thread == NULL)
	{
		threads.push_back(Thread(threadId));
		thread = &threads.back();
	}

	// Find the root node based on the bottom frame of the stack.  If no
	// matching root node is found, a new root node is created.
	const std::vector<unsigned> &threadRoots = thread->Roots();
	size_t nThreadRoots = threadRoots.size();
	int stackIndex = static_cast<int>(stackDepth) - 1;
	assert(stackIndex >= 0);
	const uint32_t rootAddr = stack[stackIndex];
	unsigned nodeId = ~0U;
	for (size_t i = 0; i < nThreadRoots; ++i)
	{
		unsigned rootId = threadRoots[i];
		if (nodes[rootId].Addr() == rootAddr) { nodeId = rootId; break; }
	}
	if (nodeId == ~0U)
	{
		// Create a new root node.
		nodeId = static_cast<unsigned>(nodes.size());
		nodes.push_back(Node(rootAddr, ~0U));
		thread->AddRoot(nodeId);
	}

	// Process the stack and build a tree branch for the stack.
	--stackIndex;
	while (stackIndex > 0)
	{
		const uint32_t addr = stack[stackIndex];
		unsigned nChildren = 0;
		const unsigned *children = nodes[nodeId].Children(nChildren);
		unsigned childNodeId = ~0U;
		for (unsigned i = 0; i < nChildren; ++i)
		{
			const unsigned childId = children[i];
			const Node &child = nodes[childId];
			if (child.Addr() == addr) { childNodeId = childId; break; }
		}
		if (childNodeId == ~0U)
		{
			childNodeId = static_cast<unsigned>(nodes.size());
			nodes.push_back(Node(addr, nodeId));
			nodes[nodeId].AddChild(childNodeId);
		}
		nodeId = childNodeId;
		--stackIndex;
	}

	// The final node is associated with the operation.
	nodes[nodeId].AddOp(opId);
	ops[opId].SetParent(nodeId);
}

// Replays all recorded operations in order and updates the per-node
// statistics of the call tree.
//
// A map of the currently allocated blocks (pointer -> ID of the allocating
// operation) is maintained while replaying.  Releasing operations
// (realloc/free) are matched against that map and linked to the operation
// that allocated the block; allocating operations (alloc/realloc) are
// entered into the map.  Inconsistencies found on the way are reported on
// stderr.
void Profile::UpdateTree()
{
	// The map is ordered by descending address, so lower_bound(p) yields
	// the block with the highest start address that is not above p.
	typedef std::map<uint32_t, unsigned, std::greater<uint32_t> > MemMapT;
	MemMapT memMap;
	std::vector<MemOp> &ops = m_Ops;
	const unsigned nOps = static_cast<unsigned>(ops.size());

	for (unsigned opId = 0; opId < nOps; ++opId)
	{
		const MemOp &op = ops[opId];
		if (op.IsRealloc() || op.IsFree())
		{
			// Release part: argument 0 is the pointer being released.
			uint32_t freePointer = op.Arg(0);
			MemMapT::iterator it = memMap.find(freePointer);
			if (it == memMap.end())
			{
				// Check if the pointer is within an allocated block.
				it = memMap.lower_bound(freePointer);
				if (it != memMap.end())
				{
					unsigned opBelowId = it->second;
					const MemOp &opBelow = ops[opBelowId];
					assert(!opBelow.IsFree());
					uint32_t opBelowPointer = opBelow.Pointer();
					uint32_t opBelowSize = opBelow.Size();
					assert(opBelowPointer < freePointer);
					if (opBelowPointer + opBelowSize > freePointer)
					{
						fprintf(stderr,
								"MEMORY ERROR: "
								"address 0x%08x in %s() points into allocated block "
								"0x%08x-0x%08x\n",
								static_cast<unsigned>(freePointer),
								MemOpFnName(op.Fn()),
								static_cast<unsigned>(opBelowPointer),
								static_cast<unsigned>(opBelowPointer + opBelowSize));
						fprintf(stderr,
								"stack trace of allocated block 0x%08x:\n",
								static_cast<unsigned>(opBelowPointer));
						opBelow.DumpStack(*this, stderr);
						fprintf(stderr,
								"stack trace of %s() referring to 0x%08x:\n",
								MemOpFnName(op.Fn()),
								static_cast<unsigned>(freePointer));
						op.DumpStack(*this, stderr);
						fputc('\n', stderr);
						// The operation is ignored entirely: it is neither
						// counted nor, for a realloc, entered into the map.
						continue;
					}
				}
				if (freePointer != 0)
				{
					fprintf(stderr,
							"warning: unknown address 0x%08x in %s()\n",
							static_cast<unsigned>(freePointer),
							MemOpFnName(op.Fn()));
					op.DumpStack(*this, stderr);
					fputc('\n', stderr);
				}
				else
				{
					// Free on a NULL pointer is valid.
					// fputs("MEMORY ERROR: free(NULL)\n", stderr);
				}
			}
			else
			{
				// The pointer is the start of a known block: link the
				// releasing and the allocating operation to each other and
				// account for the release along the allocating operation's
				// call chain.
				unsigned opFreedId = it->second;
				const MemOp &opFreed = ops[opFreedId];
				assert(!opFreed.IsFree());
				assert(opFreed.Pointer() == freePointer);
				memMap.erase(it);
				ops[opId].SetAllocId(opFreedId);
				ops[opFreedId].SetFreeId(opId);
				UpdateFree(opFreedId);
			}
			// A realloc is counted in the allocation part below instead.
			if (op.IsFree()) UpdateOpCount(opId);
		}
		if (op.IsAlloc() || op.IsRealloc())
		{
			// Allocation part: register the returned pointer.
			uint32_t allocPointer = op.Pointer();
			MemMapT::const_iterator it = memMap.find(allocPointer);
			if (it != memMap.end())
			{
				fprintf(stderr,
						"MEMORY ERROR: "
						"allocation returns an already allocated address 0x%08x "
						"in %s()\n",
						static_cast<unsigned>(allocPointer),
						MemOpFnName(op.Fn()));
				continue;
			}
			memMap.insert(std::make_pair(allocPointer, opId));
			UpdateAlloc(opId);
			UpdateOpCount(opId);
		}
	}
}

void Profile::UpdateAlloc(uint32_t opId)
{
	std::vector<Node> &nodes = m_Nodes;
	const MemOp &op = m_Ops[opId];
	unsigned nodeId = op.Parent();
	uint32_t size = op.Size();

	while (nodeId != ~0U)
	{
		Node &node = nodes[nodeId];
		node.Alloc(size);
		nodeId = node.Parent();
	}
}

void Profile::UpdateFree(uint32_t opId)
{
	std::vector<Node> &nodes = m_Nodes;
	const MemOp &op = m_Ops[opId];
	unsigned nodeId = op.Parent();
	uint32_t size = op.Size();

	while (nodeId != ~0U)
	{
		Node &node = nodes[nodeId];
		node.Free(size);
		nodeId = node.Parent();
	}
}

void Profile::UpdateOpCount(uint32_t opId)
{
	std::vector<Node> &nodes = m_Nodes;
	const MemOp &op = m_Ops[opId];
	unsigned nodeId = op.Parent();

	if (op.IsAlloc())
	{
		while (nodeId != ~0U)
		{
			Node &node = nodes[nodeId];
			node.CountAlloc();
			nodeId = node.Parent();
		}
	}
	else if (op.IsRealloc())
	{
		while (nodeId != ~0U)
		{
			Node &node = nodes[nodeId];
			node.CountRealloc();
			nodeId = node.Parent();
		}
	}
	else
	{
		assert(op.IsFree());
		while (nodeId != ~0U)
		{
			Node &node = nodes[nodeId];
			node.CountFree();
			nodeId = node.Parent();
		}
	}
}

void Profile::UpdateCAddrMap()
{
	const std::vector<Node> &nodes = m_Nodes;
	const unsigned nNodes = static_cast<unsigned>(nodes.size());
	const std::vector<MemOp> &ops = m_Ops;
	const unsigned nOps = static_cast<unsigned>(ops.size());
	std::set<CAddr> &addrMap = m_AddrMap;
	char lineBuffer[0x10000];

	// Collect all code addresses.
	for (unsigned nodeId = 0; nodeId < nNodes; ++nodeId)
	{
		const Node &node = nodes[nodeId];
		const uint32_t addr = node.Addr();
		std::set<CAddr>::iterator it = addrMap.lower_bound(addr);
		if (*it == addr) { it->AddNode(nodeId); continue; }
		addrMap.insert(it, addr)->AddNode(nodeId);
	}
	for (unsigned opId = 0; opId < nOps; ++opId)
	{
		const MemOp &op = ops[opId];
		const uint32_t addr = op.Addr();
		std::set<CAddr>::iterator it = addrMap.lower_bound(addr);
		if (*it == addr) { it->AddOp(opId); continue; }
		addrMap.insert(it, addr)->AddOp(opId);
	}
	printf("found %u code addresses\n", static_cast<unsigned>(addrMap.size()));

	if (!m_AddrMapFile.empty())
	{
		// Load the addresses from an address mapping file.
		FILE *in = fopen(m_AddrMapFile.c_str(), "r");
		if (in != NULL)
		{
			printf("loading address mapping from '%s'\n", m_AddrMapFile.c_str());
			while (!feof(in))
			{
				lineBuffer[0] = 0;
				fgets(lineBuffer, sizeof lineBuffer, in);
				lineBuffer[sizeof lineBuffer - 1] = 0;
				char *p = lineBuffer;
				uint32_t addr = static_cast<uint32_t>(strtol(lineBuffer, &p, 0));
				if (p == lineBuffer) continue;
				while (isspace(*p)) ++p;
				char *q = p + strlen(p);
				while (q > p && isspace(q[-1])) --q;
				*q = 0;
				std::set<CAddr>::const_iterator it = addrMap.find(addr);
				if (it == addrMap.end())
				{
					// We might encounter unmapped adresses when processing only a part of
					// the MTrace dump.  This is not an error.
					continue;
				}
				it->SetDesc(p);
			}
			fclose(in);
		}
	}

	bool haveUnmappedAddresses = false;
	for (
			std::set<CAddr>::const_iterator
				it = addrMap.begin(), itEnd = addrMap.end();
			it != itEnd;
			++it)
	{
		if (it->Desc() == NULL)
		{
			haveUnmappedAddresses = true;
			break;
		}
	}

	if (haveUnmappedAddresses && !m_ElfFile.empty())
	{
		// Run GDB to translate the code addresses.
		puts("translating addresses...");
		char tmpFile[] = "/tmp/mtrace_XXXXXX";
		int tmpFd = mkstemp(tmpFile);
		if (tmpFd == -1)
		{
			fprintf(stderr, "mkstemp(\"%s\") failed: %s\n",
					tmpFile, strerror(errno));
			exit(EXIT_FAILURE);
		}
		FILE *tmp = fdopen(tmpFd, "w");
		if (tmp == NULL)
		{
			fprintf(stderr, "fdopen(%i, \"w\") failed: %s\n",
					tmpFd, strerror(errno));
			exit(EXIT_FAILURE);
		}
		fprintf(tmp, "file %s\n", m_ElfFile.c_str());
		fputs("set print symbol-filename on\n", tmp);
		fputs("set print demangle on\n", tmp);
		fputs("set print asm-demangle on\n", tmp);
		unsigned totalTranslate = 0;
		for (
				std::set<CAddr>::const_iterator
					it = addrMap.begin(), itEnd = addrMap.end();
				it != itEnd;
				++it)
		{
			if (it->Desc() == NULL)
			{
				fprintf(tmp, "print/a 0x%x\n", it->Addr());
				++totalTranslate;
			}
		}
		fclose(tmp);
		char gdbCommand[1024];
		snprintf(gdbCommand, sizeof gdbCommand,
				"gdb -batch -command=%s", tmpFile);
		gdbCommand[sizeof gdbCommand - 1] = 0;
		FILE *gdbIn = popen(gdbCommand, "r");
		if (gdbIn == NULL)
		{
			fprintf(stderr,
					"popen(\"%s\", \"r\") failed: %s\n",
					gdbCommand, strerror(errno));
			remove(tmpFile);
			exit(EXIT_FAILURE);
		}
		bool error = false;
		unsigned translateCount = 0;
		printf("loading ELF '%s' into GDB...\n", m_ElfFile.c_str());
		while (!feof(gdbIn))
		{
			lineBuffer[0] = 0;
			fgets(lineBuffer, sizeof lineBuffer, gdbIn);
			lineBuffer[sizeof lineBuffer - 1] = 0;
			if (lineBuffer[0] != '$') continue;
			char *p = strstr(lineBuffer, "0x");
			if (p == NULL) { error = true; break; }
			char *pEnd = p;
			uint32_t addr = static_cast<uint32_t>(strtol(p, &pEnd, 0));
			if (p == pEnd) { error = true; break; }
			std::set<CAddr>::const_iterator it = addrMap.find(addr);
			if (it == addrMap.end())
			{
				fprintf(stderr,
						"GDB translated address 0x%08x not mapped\n",
						static_cast<unsigned>(addr));
				error = true;
				break;
			}
			p = pEnd;
			while (*p && isspace(*p)) ++p;
			if (*p)
			{
				if (*p != '<') { error = true; break; }
				char *q = ++p;
				q += strlen(q) - 1;
				while (q > p && isspace(*q)) --q;
				if (*q != '>') { error = true; break; }
				*q = 0;
				it->SetDesc(p);
			}
			else
				it->SetDesc("?");
			++translateCount;
			if (translateCount % 100 == 0)
				printf("%u/%u addresses translated\n",
						translateCount, totalTranslate);
		}
		if (error)
		{
			fprintf(stderr,
					"unexpected GDB output line: %s",
					lineBuffer);
			exit(EXIT_FAILURE);
		}
		remove(tmpFile);

		if (!m_AddrMapFile.empty())
		{
			// Update the address mapping file.
			printf("updating address mapping file '%s'\n", m_AddrMapFile.c_str());
			FILE *out = fopen(m_AddrMapFile.c_str(), "w");
			if (out == NULL)
			{
				fprintf(stderr,
						"error opening address mapping file '%s': %s\n",
						m_AddrMapFile.c_str(), strerror(errno));
			}
			else
			{
				for (
						std::set<CAddr>::const_iterator
							it = addrMap.begin(), itEnd = addrMap.end();
						it != itEnd;
						++it)
				{
					const char *desc = it->Desc();
					if (desc != NULL)
					{
						uint32_t addr = it->Addr();
						fprintf(out, "0x%08x %s\n", static_cast<unsigned>(addr), desc);
					}
				}
				fclose(out);
			}
		}
	}
}

namespace
{
	// Tells whether 'pattern' occurs in the name part of the symbol
	// description 'desc', i.e. whether its first occurrence lies in front of
	// the first '(' and the first '<' (if any) of the description.
	//
	// A NULL description (an address without description) never matches.
	bool TraceMatch(const char *desc, const char *pattern)
	{
		if (desc == NULL)
			return false;

		const char *match = std::strstr(desc, pattern);
		if (match == NULL)
			return false;

		// A hit behind the first '(' lies in the parameter list.
		const char *paren = std::strchr(desc, '(');
		if (paren != NULL && paren < match)
			return false;

		// A hit behind the first '<' is rejected in the same way.
		const char *angle = std::strchr(desc, '<');
		if (angle != NULL && angle < match)
			return false;

		return true;
	}
}

namespace
{
	// Returns true if 'desc' matches at least one pattern of 'traceList'
	// according to TraceMatch().  A NULL description or a NULL pattern list
	// never matches.
	bool DescMatchesAnyTrace(
			const char *desc,
			const std::vector<std::string> *traceList)
	{
		if (desc == NULL || traceList == NULL)
			return false;
		const size_t nTraceList = traceList->size();
		for (size_t j = 0; j < nTraceList; ++j)
		{
			if (TraceMatch(desc, (*traceList)[j].c_str()))
				return true;
		}
		return false;
	}

	// Returns true if any entry of 'stack' matches at least one pattern of
	// 'traceList' according to TraceMatch().
	bool StackMatchesAnyTrace(
			const std::vector<std::string> &stack,
			const std::vector<std::string> *traceList)
	{
		const size_t nStack = stack.size();
		for (size_t k = 0; k < nStack; ++k)
		{
			if (DescMatchesAnyTrace(stack[k].c_str(), traceList))
				return true;
		}
		return false;
	}
}

// Writes a flat, code-address based profile of kind 'type' to 'out'.
//
// Parameters:
//   out       - stream receiving the report.
//   type      - supplies the profile name, the value of every address, node
//               and memory operation, and the value limit.
//   traceList - optional list of trace patterns (may be NULL).  An entry
//               equal to "*" is a wildcard.
//
// Every address is evaluated with type.Value() and the addresses are sorted;
// progress messages for this step are printed to stdout.  An address whose
// value is below type.Limit() is only reported if its description matches
// one of the trace patterns.
//
// For each reported address the referring nodes and the referring memory
// operations are listed.  Without trace patterns, or with a wildcard
// pattern, at most 5 entries of each kind are shown (all nodes if the
// address description itself matches a pattern).  With trace patterns and
// no wildcard, only entries whose call stack matches a pattern are shown:
// all such nodes, and up to 1000 such operations among the first 10000.
void Profile::DumpProfile(
		FILE *out,
		const ProfileType &type,
		const std::vector<std::string> *traceList
		) const
{
	std::vector<ProfileType::AddrValuePair> addrVec(
			m_AddrMap.begin(), m_AddrMap.end());
	const std::vector<Node> &nodes = m_Nodes;
	const std::vector<MemOp> &ops = m_Ops;
	const size_t nAddrVec = addrVec.size();

	// Evaluate the profile value of every address, then sort.
	printf("evaluating profile '%s' ...\n", type.Name());
	for (size_t i = 0; i < nAddrVec; ++i)
	{
		addrVec[i].m_Value = type.Value(addrVec[i].m_Addr);
		if (i % 1000 == 999)
			printf(
					"%u/%u addresses evaluated\n",
					static_cast<unsigned>(i + 1),
					static_cast<unsigned>(nAddrVec));
	}
	std::sort(addrVec.begin(), addrVec.end());
	puts("done.");

	// No pattern list at all behaves like a wildcard: call stacks are not
	// filtered.
	bool haveWildcard = (traceList == NULL);
	if (traceList != NULL)
	{
		const size_t nTraceList = traceList->size();
		for (size_t j = 0; j < nTraceList; ++j)
		{
			if ((*traceList)[j] == "*")
			{
				haveWildcard = true;
				break;
			}
		}
	}

	// Do a flat code-address based profile.
	HLine(out, '=');
	fprintf(out, "Profile %s\n", type.Name());
	const uint64_t valueLimit = type.Limit();
	for (size_t i = 0; i < nAddrVec; ++i)
	{
		const uint32_t addr = addrVec[i].m_Addr;
		const uint64_t value = addrVec[i].m_Value;
		const CAddr &cAddr = GetCAddr(addr);
		const char *cAddrDesc = cAddr.Desc();

		// Addresses below the limit are only interesting if explicitly
		// traced.  The description may be NULL, which never matches.
		if (value < valueLimit && !DescMatchesAnyTrace(cAddrDesc, traceList))
			continue;

		fprintf(out, "%s ", ByteCountToString(value).c_str());
		DumpAddr(out, addr);
		fputc('\n', out);

		// Referring nodes.  A copy is taken because the list gets sorted.
		std::vector<unsigned> referringNodes = cAddr.Nodes();
		const size_t nReferringNodes = referringNodes.size();
		if (nReferringNodes > 0)
		{
			fprintf(out,
					"  %u referring node%s\n\n",
					static_cast<unsigned>(nReferringNodes),
					nReferringNodes == 1 ? "" : "s");
			std::sort(
					referringNodes.begin(),
					referringNodes.end(),
					CompareNodesByProfileValue(type));

			// Lift the display limit if the address itself is traced, or
			// if the nodes are filtered by call stack below.
			size_t nDisplayReferringNodes =
				std::min(nReferringNodes, static_cast<size_t>(5));
			if (!haveWildcard || DescMatchesAnyTrace(cAddrDesc, traceList))
				nDisplayReferringNodes = nReferringNodes;

			size_t nNodesDisplayed = 0;
			for (size_t n = 0;
					n < nReferringNodes
						&& nNodesDisplayed < nDisplayReferringNodes;
					++n)
			{
				const unsigned nodeId = referringNodes[n];
				const Node &node = nodes[nodeId];
				if (!haveWildcard)
				{
					// Skip nodes whose call stack matches no pattern.
					std::vector<std::string> nodeStack;
					node.GetStack(*this, nodeStack);
					if (!StackMatchesAnyTrace(nodeStack, traceList))
						continue;
				}
				++nNodesDisplayed;
				fprintf(out,
						"  %s referred from ",
						ByteCountToString(type.NodeValue(nodeId)).c_str());
				node.DumpSelf(*this, out, true);
				fputs(":\n", out);
				node.DumpStack(*this, out, 0, 2);
				fputs("\n  call tree:\n", out);
				node.Dump(*this, out, 1, type,
						std::min(valueLimit, value / nReferringNodes));
				fputc('\n', out);
			}
			if (nNodesDisplayed > 0 && nNodesDisplayed < nReferringNodes)
				fprintf(out,
						"  ... %u more referring node%s\n\n",
						static_cast<unsigned>(nReferringNodes - nNodesDisplayed),
						nReferringNodes - nNodesDisplayed == 1 ? "" : "s");
		}

		// Referring memory operations.  A copy is taken because the list
		// gets sorted.
		std::vector<unsigned> referringOps = cAddr.Ops();
		const size_t nReferringOps = referringOps.size();
		if (nReferringOps > 0)
		{
			fprintf(out,
					"  %u referring memory operation%s\n\n",
					static_cast<unsigned>(nReferringOps),
					nReferringOps == 1 ? "" : "s");
			std::sort(
					referringOps.begin(),
					referringOps.end(),
					CompareOpsByProfileValue(type));

			size_t nDisplayReferringOps =
				std::min(nReferringOps, static_cast<size_t>(5));
			if (!haveWildcard)
				nDisplayReferringOps =
					std::min(nReferringOps, static_cast<size_t>(1000));

			// At most the first 10000 operations are examined.
			const size_t nExaminedOps =
				std::min(nReferringOps, static_cast<size_t>(10000));
			size_t nOpsDisplayed = 0;
			for (size_t n = 0;
					n < nExaminedOps && nOpsDisplayed < nDisplayReferringOps;
					++n)
			{
				const unsigned opId = referringOps[n];
				const MemOp &op = ops[opId];
				if (!haveWildcard)
				{
					// Skip operations whose call stack matches no pattern.
					std::vector<std::string> opStack;
					op.GetStack(*this, opStack);
					if (!StackMatchesAnyTrace(opStack, traceList))
						continue;
				}
				++nOpsDisplayed;
				fprintf(out,
						"  %s referred from ",
						ByteCountToString(type.OpValue(opId)).c_str());
				op.Dump(*this, out);
				op.DumpStack(*this, out, 2);
				fputc('\n', out);
			}
			if (nOpsDisplayed > 0 && nOpsDisplayed < nReferringOps)
				fprintf(out,
						"  ... %u more referring operation%s\n",
						static_cast<unsigned>(nReferringOps - nOpsDisplayed),
						nReferringOps - nOpsDisplayed == 1 ? "" : "s");
		}
		HLine(out, '-');
	}
	fputc('\n', out);
}

// Formats 'count' as a short human-readable quantity.
//
// A value larger than 1/1.2 of a unit of 1024^3, 1024^2 or 1024 is printed
// as a multiple of the largest such unit with two decimals and the suffix
// 'G', 'M' or 'K'.  Smaller values are printed as a plain decimal number.
std::string ByteCountToString(const uint64_t count)
{
  static const uint64_t kiloUnit = 1024;
  static const uint64_t megaUnit = 1024 * 1024;
  static const uint64_t gigaUnit = 1024 * 1024 * 1024;

  // Pick the largest unit the value reaches (if any).
  const char *scaledFormat = NULL;
  uint64_t unit = 0;
  if (count > gigaUnit / 1.2)
  {
    scaledFormat = "%.2fG";
    unit = gigaUnit;
  }
  else if (count > megaUnit / 1.2)
  {
    scaledFormat = "%.2fM";
    unit = megaUnit;
  }
  else if (count > kiloUnit / 1.2)
  {
    scaledFormat = "%.2fK";
    unit = kiloUnit;
  }

  char text[128];
  if (scaledFormat != NULL)
    snprintf(text, sizeof text, scaledFormat,
        static_cast<double>(count) / unit);
  else
    snprintf(text, sizeof text, "%u", static_cast<unsigned>(count));
  text[sizeof text - 1] = 0;
  return std::string(text);
}

// mallinfo2() is available starting with glibc 2.33.  The check is nested
// because __GLIBC_PREREQ must not be expanded where it is not defined.
#if defined(__GLIBC__) && defined(__GLIBC_PREREQ)
#if __GLIBC_PREREQ(2, 33)
#define MTRACEDUMP_HAVE_MALLINFO2 1
#endif
#endif

// Prints the allocator statistics of the running process to stdout,
// labelled with 'message'.
//
// The statistics are only available when compiled with a GCC compatible
// compiler (version 4 or later); otherwise the function does nothing.
void DumpMemInfo(const char *message)
{
#if __GNUC__ >= 4
  uint64_t arena = 0;
  uint64_t ordblks = 0;
  uint64_t hblks = 0;
  uint64_t uordblks = 0;
  uint64_t fordblks = 0;

#ifdef MTRACEDUMP_HAVE_MALLINFO2
  // Preferred interface: the fields are size_t and do not wrap around.
  const struct mallinfo2 info = mallinfo2();
  arena = info.arena;
  ordblks = info.ordblks;
  hblks = info.hblks;
  uordblks = info.uordblks;
  fordblks = info.fordblks;
#else
  // Legacy interface: the fields are int and may wrap around to negative
  // values.  Reinterpret them as unsigned so that a wrapped value is not
  // sign-extended into a huge 64 bit number.
  const struct mallinfo info = mallinfo();
  arena = static_cast<unsigned>(info.arena);
  ordblks = static_cast<unsigned>(info.ordblks);
  hblks = static_cast<unsigned>(info.hblks);
  uordblks = static_cast<unsigned>(info.uordblks);
  fordblks = static_cast<unsigned>(info.fordblks);
#endif

  // NOTE(review): according to mallinfo(3), ordblks is the number of free
  // chunks and hblks is the number of mmapped regions (counts, not byte
  // sizes) - confirm that the labels below are the intended ones.
  printf(
      "memory statistics (%s):\n"
      "  total sbrk = %s, unused chunks = %s, total chunks = %s\n"
      "  occupied = %s, free = %s\n",
      message,
      ByteCountToString(arena).c_str(),
      ByteCountToString(ordblks).c_str(),
      ByteCountToString(hblks).c_str(),
      ByteCountToString(uordblks).c_str(),
      ByteCountToString(fordblks).c_str());
#endif // __GNUC__ >= 4
}

// Writes a horizontal rule to 'out': 'indent' spaces, followed by the
// character 'c' repeated up to column 78, followed by a newline.  If
// 'indent' is 78 or larger, only the spaces and the newline are written.
void HLine(FILE *out, char c, unsigned indent)
{
	const unsigned lineWidth = 78;
	unsigned column = 0;

	// Leading indentation.
	while (column < indent)
	{
		fputc(' ', out);
		++column;
	}
	// Fill the remainder of the line with the rule character.
	while (column < lineWidth)
	{
		fputc(c, out);
		++column;
	}
	fputc('\n', out);
}

namespace
{
	// Prints a usage summary of the command line interface to 'out'.
	//
	// The summary covers the 'profile' command (which is also used if no
	// command name is given) and the 'trace' command, together with the
	// options accepted by the option parsers of these commands.
	void Help(FILE *out)
	{
		fputs(
				"usage: (program) [profile] [(options)] "
				"(elf-file) (trace-file) (output-file) [(addrmap-file)]\n"
				"       (program) trace [(options)] [(address) ...]\n"
				"\n"
				"options of the 'profile' command:\n"
				"  -p, --profile (mode)   profile to dump, 'allocated' or "
				"'op_count';\n"
				"                         may be given more than once\n"
				"                         (default: allocated, and op_count "
				"if no\n"
				"                         --trace option is given)\n"
				"  -t, --trace (pattern)  function name pattern to trace;\n"
				"                         may be given more than once\n"
				"  -h, --help             print this help and exit\n"
				"\n"
				"options of the 'trace' command:\n"
				"  -h, --help             print this help and exit\n",
				out);
	}
}

// Entry point of the 'profile' command.
//
// Command line:
//   [(options)] (elf-file) (trace-file) (output-file) [(addrmap-file)]
// Options:
//   -p, --profile (mode)   profile to dump ("allocated" or "op_count"),
//                          may be repeated
//   -t, --trace (pattern)  trace pattern, may be repeated
//   -h, --help             print the help text and exit
//
// If no profile mode is given, "allocated" is dumped, and additionally
// "op_count" if no trace pattern was given.
//
// Reads the trace file, updates the code address map and the tree, and
// writes the summary followed by the requested profiles to the output file.
//
// Returns 0 on success.  Usage errors, a failure reported by ReadOps() and
// errors opening or writing the output file terminate the process with
// EXIT_FAILURE.
int profileMain(int argc, char **argv)
{
	std::vector<std::string> profileModeList;
	std::vector<std::string> traceList;

	static const option long_options[] = {
		{ "profile", required_argument, NULL, 'p' },
		{ "trace", required_argument, NULL, 't' },
		{ "help", no_argument, NULL, 'h' },
		{ NULL, 0, NULL, 0 }
	};
	while (true)
	{
		int option_index = 0;
		const int c =
			getopt_long(argc, argv, "p:t:h", long_options, &option_index);
		if (c == -1)
			break;
		switch (c)
		{
		case 'h':
			Help(stdout);
			exit(EXIT_SUCCESS);
		case 'p':
			profileModeList.push_back(optarg);
			break;
		case 't':
			traceList.push_back(optarg);
			break;
		default:
			// getopt_long() returned something other than a known option.
			exit(EXIT_FAILURE);
		}
	}
	if (profileModeList.empty())
	{
		// Default profiles.
		profileModeList.push_back("allocated");
		if (traceList.empty())
			profileModeList.push_back("op_count");
	}

	if (argc - optind < 3 || argc - optind > 4)
	{
		fprintf(stderr,
				"usage: %s [(options)] "
				"(elf-file) (trace-file) (output-file) [(addrmap-file)]\n",
				argv[0]);
		exit(EXIT_FAILURE);
	}
	const char *elfFile = argv[optind];
	const char *traceFile = argv[optind + 1];
	const char *outputFile = argv[optind + 2];
	const char *addrMapFile = argc == optind + 4 ? argv[optind + 3] : NULL;

	Profile profile;
	// The threshold is only consumed by the disabled full dump below.
	Node::Threshold threshold;
	threshold.m_MaxDepth = ~0U;
	threshold.m_MinOpCount = 10000;
	threshold.m_MinAllocated = 100000;
	threshold.m_DepthBelow = 0;

	profile.SetElfFile(elfFile);
	if (addrMapFile != NULL) profile.SetAddrMapFile(addrMapFile);
	if (profile.ReadOps(traceFile) == -1) exit(EXIT_FAILURE);
	DumpMemInfo("after ReadOps()");
	profile.UpdateCAddrMap();
	DumpMemInfo("after UpdateCAddrMap()");
	profile.UpdateTree();
	DumpMemInfo("after UpdateTree()");

	FILE *out = fopen(outputFile, "w");
	if (out == NULL)
	{
		fprintf(stderr,
				"error opening output file '%s': %s\n",
				outputFile, strerror(errno));
		exit(EXIT_FAILURE);
	}
	//profile.Dump(out, threshold);
	profile.DumpSummary(out);
	// A NULL trace list tells DumpProfile() that no patterns were given.
	const std::vector<std::string> *traceListP = NULL;
	if (!traceList.empty())
		traceListP = &traceList;
	const size_t nProfileModeList = profileModeList.size();
	for (size_t i = 0; i < nProfileModeList; ++i)
	{
		const char *profileMode = profileModeList[i].c_str();
		if (!std::strcmp(profileMode, "allocated"))
			profile.DumpProfile(out, ProfileAllocated(profile), traceListP);
		else if (!std::strcmp(profileMode, "op_count"))
			profile.DumpProfile(out, ProfileOpCount(profile), traceListP);
		else
		{
			fprintf(stderr,
					"warning: unrecognized profile mode '%s' (ignored)\n",
					profileMode);
		}
	}

	// Report write errors instead of silently leaving a truncated report
	// behind.  The error indicator must be read before the stream is
	// closed.
	const bool writeFailed = ferror(out) != 0;
	const bool closeFailed = fclose(out) != 0;
	if (writeFailed || closeFailed)
	{
		fprintf(stderr,
				"error writing output file '%s'\n",
				outputFile);
		exit(EXIT_FAILURE);
	}

	return 0;
}

int traceMain(int argc, char **argv)
{
	std::vector<uint32_t> addrList;

	while (true)
	{
		static const option long_options[] = {
			{ "help", 0, NULL, 'h' },
			{ NULL, 0, NULL 0 }
		};
		int option_index = 0;
		int c = getopt_long(argc, argv, "h", long_options, &option_index);
		if (c == -1)
			break;
		switch (c)
		{
		case 'h':
			Help(stdout);
			exit(EXIT_SUCCESS);
		default:
			exit(EXIT_FAILURE);
		}
	}
	for (int i = optind; i < argc; ++i)
	{
		long long addrLL = strtoll(argv[i], NULL, 0);
		if (addrLL != 0)
			addrList.push_back(static_cast<uint32_t>(addrLL));
	}

	// XXX

	return 1;
}

// Program entry point: dispatches to the command named by the first
// argument.
//
// "profile" and "trace" select profileMain() and traceMain(); the command
// name is stripped from the argument vector so that it takes the place of
// the program name.  Any other first argument is treated as the start of
// the arguments of the 'profile' command.
int main(int argc, char **argv)
{
	// argc < 2 also covers an empty argument vector, for which argv[1]
	// must not be accessed.
	if (argc < 2)
	{
		fprintf(stderr, "missing arguments\n");
		exit(EXIT_FAILURE);
	}
	const char *command = argv[1];
	if (!strcmp(command, "profile"))
		return profileMain(argc - 1, argv + 1);
	else if (!strcmp(command, "trace"))
		return traceMain(argc - 1, argv + 1);
	else
		return profileMain(argc, argv);
}

