From 81406018b0688e956452cd3e00c1ab9aeb9af764 Mon Sep 17 00:00:00 2001 From: Dam Sunwoo Date: Fri, 2 Nov 2012 11:32:01 -0500 Subject: ARM: dump stats and process info on context switches This patch enables dumping statistics and Linux process information on context switch boundaries (__switch_to() calls) that are used for Streamline integration (a graphical statistics viewer from ARM). --- src/arch/arm/ArmSystem.py | 1 + src/arch/arm/linux/system.cc | 104 ++++++++++++++++++++++++++++++++++++++++++- src/arch/arm/linux/system.hh | 42 ++++++++++++++++- src/cpu/base.cc | 11 +++++ src/cpu/base.hh | 19 ++++++++ src/cpu/pc_event.cc | 1 - src/mem/request.hh | 34 ++++++++++++++ src/sim/serialize.hh | 2 +- util/cpt_upgrader.py | 13 ++++++ 9 files changed, 222 insertions(+), 5 deletions(-) diff --git a/src/arch/arm/ArmSystem.py b/src/arch/arm/ArmSystem.py index db0febe18..3ca9b8573 100644 --- a/src/arch/arm/ArmSystem.py +++ b/src/arch/arm/ArmSystem.py @@ -71,3 +71,4 @@ class LinuxArmSystem(ArmSystem): "File that contains the Device Tree Blob. 
Don't use DTB if empty.") early_kernel_symbols = Param.Bool(False, "enable early kernel symbol tables before MMU") + enable_context_switch_stats_dump = Param.Bool(False, "enable stats/task info dumping at context switch boundaries") diff --git a/src/arch/arm/linux/system.cc b/src/arch/arm/linux/system.cc index 1347e472d..b06439406 100644 --- a/src/arch/arm/linux/system.cc +++ b/src/arch/arm/linux/system.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010 ARM Limited + * Copyright (c) 2010-2012 ARM Limited * All rights reserved * * The license below extends only to copyright in the software and shall @@ -44,19 +44,24 @@ #include "arch/arm/linux/system.hh" #include "arch/arm/isa_traits.hh" #include "arch/arm/utility.hh" +#include "arch/generic/linux/threadinfo.hh" #include "base/loader/object_file.hh" #include "base/loader/symtab.hh" +#include "cpu/base.hh" +#include "cpu/pc_event.hh" #include "cpu/thread_context.hh" #include "debug/Loader.hh" #include "kern/linux/events.hh" #include "mem/fs_translating_port_proxy.hh" #include "mem/physical.hh" +#include "sim/stat_control.hh" using namespace ArmISA; using namespace Linux; LinuxArmSystem::LinuxArmSystem(Params *p) - : ArmSystem(p) + : ArmSystem(p), + enableContextSwitchStatsDump(p->enable_context_switch_stats_dump) { #ifndef NDEBUG kernelPanicEvent = addKernelFuncEvent("panic"); @@ -206,6 +211,9 @@ LinuxArmSystem::~LinuxArmSystem() delete uDelaySkipEvent; if (constUDelaySkipEvent) delete constUDelaySkipEvent; + + if (dumpStatsPCEvent) + delete dumpStatsPCEvent; } LinuxArmSystem * @@ -213,3 +221,95 @@ LinuxArmSystemParams::create() { return new LinuxArmSystem(this); } + +void +LinuxArmSystem::startup() +{ + if (enableContextSwitchStatsDump) { + dumpStatsPCEvent = addKernelFuncEvent("__switch_to"); + if (!dumpStatsPCEvent) + panic("dumpStatsPCEvent not created!"); + + std::string task_filename = "tasks.txt"; + taskFile = simout.create(name() + "." 
+ task_filename); + + for (int i = 0; i < _numContexts; i++) { + ThreadContext *tc = threadContexts[i]; + uint32_t pid = tc->getCpuPtr()->getPid(); + if (pid != Request::invldPid) { + mapPid(tc, pid); + tc->getCpuPtr()->taskId(taskMap[pid]); + } + } + } +} + +void +LinuxArmSystem::mapPid(ThreadContext *tc, uint32_t pid) +{ + // Create a new unique identifier for this pid + std::map::iterator itr = taskMap.find(pid); + if (itr == taskMap.end()) { + uint32_t map_size = taskMap.size(); + if (map_size > ContextSwitchTaskId::MaxNormalTaskId + 1) { + warn_once("Error out of identifiers for cache occupancy stats"); + taskMap[pid] = ContextSwitchTaskId::Unknown; + } else { + taskMap[pid] = map_size; + } + } +} + +/** This function is called whenever the kernel function + * "__switch_to" is called to change running tasks. + * + * r0 = task_struct of the previously running process + * r1 = task_info of the previously running process + * r2 = task_info of the next process to run + */ +void +DumpStatsPCEvent::process(ThreadContext *tc) +{ + Linux::ThreadInfo ti(tc); + Addr task_descriptor = tc->readIntReg(2); + uint32_t pid = ti.curTaskPID(task_descriptor); + uint32_t tgid = ti.curTaskTGID(task_descriptor); + std::string next_task_str = ti.curTaskName(task_descriptor); + + // Streamline treats pid == -1 as the kernel process. 
+ // Also pid == 0 implies idle process (except during Linux boot) + int32_t mm = ti.curTaskMm(task_descriptor); + bool is_kernel = (mm == 0); + if (is_kernel && (pid != 0)) { + pid = -1; + tgid = -1; + next_task_str = "kernel"; + } + + LinuxArmSystem* sys = dynamic_cast(tc->getSystemPtr()); + if (!sys) { + panic("System is not LinuxArmSystem while getting Linux process info!"); + } + std::map& taskMap = sys->taskMap; + + // Create a new unique identifier for this pid + sys->mapPid(tc, pid); + + // Set cpu task id, output process info, and dump stats + tc->getCpuPtr()->taskId(taskMap[pid]); + tc->getCpuPtr()->setPid(pid); + + std::ostream* taskFile = sys->taskFile; + + // Task file is read by cache occupancy plotting script or + // Streamline conversion script. + ccprintf(*taskFile, + "tick=%lld %d cpu_id=%d next_pid=%d next_tgid=%d next_task=%s\n", + curTick(), taskMap[pid], tc->cpuId(), (int) pid, (int) tgid, + next_task_str); + taskFile->flush(); + + // Dump and reset statistics + Stats::schedStatEvent(true, true, curTick(), 0); +} + diff --git a/src/arch/arm/linux/system.hh b/src/arch/arm/linux/system.hh index caf018cb9..feed8cfaa 100644 --- a/src/arch/arm/linux/system.hh +++ b/src/arch/arm/linux/system.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010 ARM Limited + * Copyright (c) 2010-2012 ARM Limited * All rights reserved * * The license below extends only to copyright in the software and shall @@ -43,15 +43,24 @@ #ifndef __ARCH_ARM_LINUX_SYSTEM_HH__ #define __ARCH_ARM_LINUX_SYSTEM_HH__ +#include +#include #include #include #include "arch/arm/system.hh" +#include "base/output.hh" #include "kern/linux/events.hh" #include "params/LinuxArmSystem.hh" +#include "sim/core.hh" + +class DumpStatsPCEvent; class LinuxArmSystem : public ArmSystem { + protected: + DumpStatsPCEvent *dumpStatsPCEvent; + public: /** Boilerplate params code */ typedef LinuxArmSystemParams Params; @@ -61,6 +70,20 @@ class LinuxArmSystem : public ArmSystem return dynamic_cast(_params); } + /** 
When enabled, dump stats/task info on context switches for + * Streamline and per-thread cache occupancy studies, etc. */ + bool enableContextSwitchStatsDump; + + /** This map stores a mapping of OS process IDs to internal Task IDs. The + * mapping is done because the stats system doesn't tend to like vectors + * that are much greater than 1000 items and the entire process space is + * 65K. */ + std::map taskMap; + + /** This is a file that is placed in the run directory that prints out + * mappings between taskIds and OS process IDs */ + std::ostream* taskFile; + LinuxArmSystem(Params *p); ~LinuxArmSystem(); @@ -68,6 +91,12 @@ class LinuxArmSystem : public ArmSystem bool adderBootUncacheable(Addr a); + void startup(); + + /** This function creates a new task Id for the given pid. + * @param tc thread context that is currently executing */ + void mapPid(ThreadContext* tc, uint32_t pid); + private: #ifndef NDEBUG /** Event to halt the simulator if the kernel calls panic() */ @@ -97,5 +126,16 @@ class LinuxArmSystem : public ArmSystem Addr penReleaseAddr; }; +class DumpStatsPCEvent : public PCEvent +{ + public: + DumpStatsPCEvent(PCEventQueue *q, const std::string &desc, Addr addr) + : PCEvent(q, desc, addr) + {} + + virtual void process(ThreadContext* tc); +}; + + #endif // __ARCH_ARM_LINUX_SYSTEM_HH__ diff --git a/src/cpu/base.cc b/src/cpu/base.cc index 93c9f8629..aaf9c9cbc 100644 --- a/src/cpu/base.cc +++ b/src/cpu/base.cc @@ -118,6 +118,7 @@ BaseCPU::BaseCPU(Params *p, bool is_checker) : MemObject(p), instCnt(0), _cpuId(p->cpu_id), _instMasterId(p->system->getMasterId(name() + ".inst")), _dataMasterId(p->system->getMasterId(name() + ".data")), + _taskId(ContextSwitchTaskId::Unknown), _pid(Request::invldPid), interrupts(p->interrupts), profileEvent(NULL), numThreads(p->numThreads), system(p->system) { @@ -359,6 +360,8 @@ BaseCPU::takeOverFrom(BaseCPU *oldCPU) { assert(threadContexts.size() == oldCPU->threadContexts.size()); assert(_cpuId == oldCPU->cpuId()); + 
_pid = oldCPU->getPid(); + _taskId = oldCPU->taskId(); ThreadID size = threadContexts.size(); for (ThreadID i = 0; i < size; ++i) { @@ -489,6 +492,13 @@ void BaseCPU::serialize(std::ostream &os) { SERIALIZE_SCALAR(instCnt); + + /* Unlike _pid, _taskId is not serialized, as they are dynamically + * assigned unique ids that are only meaningful for the duration of + * a specific run. We will need to serialize the entire taskMap in + * system. */ + SERIALIZE_SCALAR(_pid); + interrupts->serialize(os); } @@ -496,6 +506,7 @@ void BaseCPU::unserialize(Checkpoint *cp, const std::string &section) { UNSERIALIZE_SCALAR(instCnt); + UNSERIALIZE_SCALAR(_pid); interrupts->unserialize(cp, section); } diff --git a/src/cpu/base.hh b/src/cpu/base.hh index 91cef24ed..6552be0d6 100644 --- a/src/cpu/base.hh +++ b/src/cpu/base.hh @@ -103,6 +103,17 @@ class BaseCPU : public MemObject /** data side request id that must be placed in all requests */ MasterID _dataMasterId; + /** An internal representation of a task identifier within gem5. This is + * used so the CPU can add which taskId (which is an internal representation + * of the OS process ID) to each request so components in the memory system + * can track which process IDs are ultimately interacting with them + */ + uint32_t _taskId; + + /** The current OS process ID that is executing on this processor. This is + * used to generate a taskId */ + uint32_t _pid; + /** * Define a base class for the CPU ports (instruction and data) * that is refined in the subclasses. 
This class handles the @@ -174,6 +185,14 @@ class BaseCPU : public MemObject BaseMasterPort &getMasterPort(const std::string &if_name, PortID idx = InvalidPortID); + /** Get cpu task id */ + uint32_t taskId() const { return _taskId; } + /** Set cpu task id */ + void taskId(uint32_t id) { _taskId = id; } + + uint32_t getPid() const { return _pid; } + void setPid(uint32_t pid) { _pid = pid; } + inline void workItemBegin() { numWorkItemsStarted++; } inline void workItemEnd() { numWorkItemsCompleted++; } // @todo remove me after debugging with legion done diff --git a/src/cpu/pc_event.cc b/src/cpu/pc_event.cc index 2b54ee5fb..c957fe4d5 100644 --- a/src/cpu/pc_event.cc +++ b/src/cpu/pc_event.cc @@ -30,7 +30,6 @@ */ #include -#include #include #include diff --git a/src/mem/request.hh b/src/mem/request.hh index f6406e2c5..11f1c74b3 100644 --- a/src/mem/request.hh +++ b/src/mem/request.hh @@ -1,4 +1,16 @@ /* + * Copyright (c) 2012 ARM Limited + * All rights reserved + * + * The license below extends only to copyright in the software and shall + * not be construed as granting a license to any other intellectual + * property including but not limited to intellectual property relating + * to a hardware implementation of the functionality of the software + * licensed hereunder. You may use the software subject to the license + * terms below provided that you ensure that this notice is replicated + * unmodified and in its entirety in all distributions of the software, + * modified or unmodified, in source code or in binary form. + * * Copyright (c) 2002-2005 The Regents of The University of Michigan * All rights reserved. * @@ -47,6 +59,24 @@ #include "base/types.hh" #include "sim/core.hh" +/** + * Special TaskIds that are used for per-context-switch stats dumps + * and Cache Occupancy. Having too many tasks seems to be a problem + * with vector stats. 
1024 seems to be a reasonable number that + * doesn't cause a problem with stats and is large enough for realistic + * benchmarks (Linux/Android boot, BBench, etc.) + */ + +namespace ContextSwitchTaskId { + enum TaskId { + MaxNormalTaskId = 1021, /* Maximum number of normal tasks */ + Prefetcher = 1022, /* For cache lines brought in by prefetcher */ + DMA = 1023, /* Mostly Table Walker */ + Unknown = 1024, + NumTaskId + }; +} + class Request; typedef Request* RequestPtr; @@ -117,6 +147,10 @@ class Request static const MasterID invldMasterId = USHRT_MAX; /** @} */ + /** Invalid or unknown Pid. Possible when operating system is not present + * or has not assigned a pid yet */ + static const uint32_t invldPid = UINT_MAX; + private: typedef uint8_t PrivateFlagsType; typedef ::Flags PrivateFlags; diff --git a/src/sim/serialize.hh b/src/sim/serialize.hh index c0c0b63ff..531b2e1cd 100644 --- a/src/sim/serialize.hh +++ b/src/sim/serialize.hh @@ -57,7 +57,7 @@ class SimObject; * SimObject shouldn't cause the version number to increase, only changes to * existing objects such as serializing/unserializing more state, changing sizes * of serialized arrays, etc. 
*/ -static const uint64_t gem5CheckpointVersion = 0x0000000000000002; +static const uint64_t gem5CheckpointVersion = 0x0000000000000003; template void paramOut(std::ostream &os, const std::string &name, const T &param); diff --git a/util/cpt_upgrader.py b/util/cpt_upgrader.py index 09e6ef194..ead3d9cbb 100755 --- a/util/cpt_upgrader.py +++ b/util/cpt_upgrader.py @@ -105,9 +105,22 @@ def from_1(cpt): # the system, thus starting at 0 raise ValueError("more than one memory detected (" + sec + ")") +def from_2(cpt): + for sec in cpt.sections(): + import re + # Search for CPUs + if re.search('.*sys.*cpu', sec): + try: + junk = cpt.get(sec, 'instCnt') + cpt.set(sec, '_pid', '0') + except ConfigParser.NoOptionError: + pass + + migrations = [] migrations.append(from_0) migrations.append(from_1) +migrations.append(from_2) verbose_print = False -- cgit v1.2.3