Diffstat (limited to 'src')
-rw-r--r--  src/cpu/trace/SConscript      12
-rw-r--r--  src/cpu/trace/TraceCPU.py     71
-rw-r--r--  src/cpu/trace/trace_cpu.cc  1454
-rw-r--r--  src/cpu/trace/trace_cpu.hh  1101
4 files changed, 2638 insertions, 0 deletions
diff --git a/src/cpu/trace/SConscript b/src/cpu/trace/SConscript
new file mode 100644
index 000000000..aa450b14a
--- /dev/null
+++ b/src/cpu/trace/SConscript
@@ -0,0 +1,12 @@
+Import('*')
+
+if env['TARGET_ISA'] == 'null':
+    Return()
+
+# Only build TraceCPU if we have support for protobuf as TraceCPU relies on it
+if env['HAVE_PROTOBUF']:
+    SimObject('TraceCPU.py')
+    Source('trace_cpu.cc')
+
+DebugFlag('TraceCPUData')
+DebugFlag('TraceCPUInst')
diff --git a/src/cpu/trace/TraceCPU.py b/src/cpu/trace/TraceCPU.py
new file mode 100644
index 000000000..e1c02ae63
--- /dev/null
+++ b/src/cpu/trace/TraceCPU.py
@@ -0,0 +1,71 @@
+# Copyright (c) 2013 - 2015 ARM Limited
+# All rights reserved.
+#
+# The license below extends only to copyright in the software and shall
+# not be construed as granting a license to any other intellectual
+# property including but not limited to intellectual property relating
+# to a hardware implementation of the functionality of the software
+# licensed hereunder. You may use the software subject to the license
+# terms below provided that you ensure that this notice is replicated
+# unmodified and in its entirety in all distributions of the software,
+# modified or unmodified, in source code or in binary form.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met: redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer;
+# redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution;
+# neither the name of the copyright holders nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# Authors: Radhika Jagtap
+#          Andreas Hansson
+#          Thomas Grass
+
+from m5.params import *
+from BaseCPU import BaseCPU
+
+class TraceCPU(BaseCPU):
+    """Trace CPU model which replays traces generated in a prior simulation
+    using DerivO3CPU or its derived classes. It interfaces with L1 caches.
+    """
+    type = 'TraceCPU'
+    cxx_header = "cpu/trace/trace_cpu.hh"
+
+    @classmethod
+    def memory_mode(cls):
+        return 'timing'
+
+    @classmethod
+    def require_caches(cls):
+        return True
+
+    def addPMU(self, pmu = None):
+        pass
+
+    @classmethod
+    def support_take_over(cls):
+        return True
+
+    instTraceFile = Param.String("", "Instruction trace file")
+    dataTraceFile = Param.String("", "Data dependency trace file")
+    sizeStoreBuffer = Param.Unsigned(16, "Number of entries in the store "
+                                     "buffer")
+    sizeLoadBuffer = Param.Unsigned(16, "Number of entries in the load buffer")
+    sizeROB = Param.Unsigned(40, "Number of entries in the re-order buffer")
diff --git a/src/cpu/trace/trace_cpu.cc b/src/cpu/trace/trace_cpu.cc
new file mode 100644
index 000000000..2e989f6ff
--- /dev/null
+++ b/src/cpu/trace/trace_cpu.cc
@@ -0,0 +1,1454 @@
+/*
+ * Copyright (c) 2013 - 2015 ARM Limited
+ * All rights reserved
+ *
+ * The license below extends only to copyright in the software and shall
+ * not be construed as granting a license to any other intellectual
+ * property including but not limited to intellectual property relating
+ * to a hardware implementation of the functionality of the software
+ * licensed hereunder. You may use the software subject to the license
+ * terms below provided that you ensure that this notice is replicated
+ * unmodified and in its entirety in all distributions of the software,
+ * modified or unmodified, in source code or in binary form.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Radhika Jagtap
+ *          Andreas Hansson
+ *          Thomas Grass
+ */
+
+#include "cpu/trace/trace_cpu.hh"
+
+#include "sim/sim_exit.hh"
+
+// Declare and initialize the static counter for the number of trace CPUs.
+int TraceCPU::numTraceCPUs = 0;
+
+TraceCPU::TraceCPU(TraceCPUParams *params)
+    :   BaseCPU(params),
+        icachePort(this),
+        dcachePort(this),
+        instMasterID(params->system->getMasterId(name() + ".inst")),
+        dataMasterID(params->system->getMasterId(name() + ".data")),
+        instTraceFile(params->instTraceFile),
+        dataTraceFile(params->dataTraceFile),
+        icacheGen(*this, ".iside", icachePort, instMasterID, instTraceFile),
+        dcacheGen(*this, ".dside", dcachePort, dataMasterID, dataTraceFile,
+                  params->sizeROB, params->sizeStoreBuffer,
+                  params->sizeLoadBuffer),
+        icacheNextEvent(this),
+        dcacheNextEvent(this),
+        oneTraceComplete(false),
+        firstFetchTick(0),
+        execCompleteEvent(nullptr)
+{
+    // Increment static counter for number of Trace CPUs.
+    ++TraceCPU::numTraceCPUs;
+
+    // Check that the python parameters for the sizes of ROB, store buffer
+    // and load buffer do not overflow the corresponding C++ variables.
+    fatal_if(params->sizeROB > UINT16_MAX, "ROB size set to %d exceeds the "
+             "max. value of %d.\n", params->sizeROB, UINT16_MAX);
+    fatal_if(params->sizeStoreBuffer > UINT16_MAX, "Store buffer size set "
+             "to %d exceeds the max. value of %d.\n",
+             params->sizeStoreBuffer, UINT16_MAX);
+    fatal_if(params->sizeLoadBuffer > UINT16_MAX, "Load buffer size set to"
+             " %d exceeds the max. value of %d.\n",
+             params->sizeLoadBuffer, UINT16_MAX);
+}
+
+TraceCPU::~TraceCPU()
+{
+
+}
+
+TraceCPU*
+TraceCPUParams::create()
+{
+    return new TraceCPU(this);
+}
+
+void
+TraceCPU::takeOverFrom(BaseCPU *oldCPU)
+{
+    // Unbind the ports of the old CPU and bind the ports of the TraceCPU.
+    assert(!getInstPort().isConnected());
+    assert(oldCPU->getInstPort().isConnected());
+    BaseSlavePort &inst_peer_port = oldCPU->getInstPort().getSlavePort();
+    oldCPU->getInstPort().unbind();
+    getInstPort().bind(inst_peer_port);
+
+    assert(!getDataPort().isConnected());
+    assert(oldCPU->getDataPort().isConnected());
+    BaseSlavePort &data_peer_port = oldCPU->getDataPort().getSlavePort();
+    oldCPU->getDataPort().unbind();
+    getDataPort().bind(data_peer_port);
+}
+
+void
+TraceCPU::init()
+{
+    DPRINTF(TraceCPUInst, "Instruction fetch request trace file is \"%s\"."
+            "\n", instTraceFile);
+    DPRINTF(TraceCPUData, "Data memory request trace file is \"%s\".\n",
+            dataTraceFile);
+
+    BaseCPU::init();
+
+    // Get the send tick of the first instruction read request and schedule
+    // icacheNextEvent at that tick.
+    Tick first_icache_tick = icacheGen.init();
+    schedule(icacheNextEvent, first_icache_tick);
+
+    // Get the send tick of the first data read/write request and schedule
+    // dcacheNextEvent at that tick.
+    Tick first_dcache_tick = dcacheGen.init();
+    schedule(dcacheNextEvent, first_dcache_tick);
+
+    // The static counter for the number of Trace CPUs is correctly set at
+    // this point so create an event and pass it.
+    execCompleteEvent = new CountedExitEvent("end of all traces reached.",
+                                             numTraceCPUs);
+
+    // Save the first fetch request tick to dump it as tickOffset
+    firstFetchTick = first_icache_tick;
+}
+void
+TraceCPU::schedIcacheNext()
+{
+    DPRINTF(TraceCPUInst, "IcacheGen event.\n");
+
+    // Try to send the current packet or a retry packet if there is one
+    bool sched_next = icacheGen.tryNext();
+    // If packet sent successfully, schedule next event
+    if (sched_next) {
+        DPRINTF(TraceCPUInst, "Scheduling next icacheGen event "
+                "at %d.\n", curTick() + icacheGen.tickDelta());
+        schedule(icacheNextEvent, curTick() + icacheGen.tickDelta());
+        ++numSchedIcacheEvent;
+    } else {
+        // Check if the trace is complete. If not, do nothing because
+        // sending failed and the next event will be scheduled via
+        // recvReqRetry().
+        if (icacheGen.isTraceComplete()) {
+            // If this is the first trace to complete, set the variable. If
+            // it is already set then both traces are complete and we can
+            // exit the simulation.
+            checkAndSchedExitEvent();
+        }
+    }
+    return;
+}
+
+void
+TraceCPU::schedDcacheNext()
+{
+    DPRINTF(TraceCPUData, "DcacheGen event.\n");
+
+    dcacheGen.execute();
+    if (dcacheGen.isExecComplete()) {
+        checkAndSchedExitEvent();
+    }
+}
+
+void
+TraceCPU::checkAndSchedExitEvent()
+{
+    if (!oneTraceComplete) {
+        oneTraceComplete = true;
+    } else {
+        // Schedule event to indicate execution is complete as both
+        // instruction and data access traces have been played back.
+        inform("%s: Execution complete.\n", name());
+
+        // Record stats which are computed at the end of simulation
+        tickOffset = firstFetchTick;
+        numCycles = (clockEdge() - firstFetchTick) / clockPeriod();
+        numOps = dcacheGen.getMicroOpCount();
+        schedule(*execCompleteEvent, curTick());
+    }
+}
+
+void
+TraceCPU::regStats()
+{
+
+    BaseCPU::regStats();
+
+    numSchedDcacheEvent
+        .name(name() + ".numSchedDcacheEvent")
+        .desc("Number of events scheduled to trigger data request generator")
+        ;
+
+    numSchedIcacheEvent
+        .name(name() + ".numSchedIcacheEvent")
+        .desc("Number of events scheduled to trigger instruction request "
+              "generator")
+        ;
+
+    numOps
+        .name(name() + ".numOps")
+        .desc("Number of micro-ops simulated by the Trace CPU")
+        ;
+
+    cpi
+        .name(name() + ".cpi")
+        .desc("Cycles per micro-op used as a proxy for CPI")
+        .precision(6)
+        ;
+    cpi = numCycles / numOps;
+
+    tickOffset
+        .name(name() + ".tickOffset")
+        .desc("The first execution tick for the root node of elastic traces")
+        ;
+
+    icacheGen.regStats();
+    dcacheGen.regStats();
+}
+
+void
+TraceCPU::ElasticDataGen::regStats()
+{
+    using namespace Stats;
+
+    maxDependents
+        .name(name() + ".maxDependents")
+        .desc("Max number of dependents observed on a node")
+        ;
+
+    maxReadyListSize
+        .name(name() + ".maxReadyListSize")
+        .desc("Max size of the ready list observed")
+        ;
+
+    numSendAttempted
+        .name(name() + ".numSendAttempted")
+        .desc("Number of first attempts to send a request")
+        ;
+
+    numSendSucceeded
+        .name(name() + ".numSendSucceeded")
+        .desc("Number of successful first attempts")
+        ;
+
+    numSendFailed
+        .name(name() + ".numSendFailed")
+        .desc("Number of failed first attempts")
+        ;
+
+    numRetrySucceeded
+        .name(name() + ".numRetrySucceeded")
+        .desc("Number of successful retries")
+        ;
+
+    numSplitReqs
+        .name(name() + ".numSplitReqs")
+        .desc("Number of split requests")
+        ;
+
+    numSOLoads
+        .name(name() + ".numSOLoads")
+        .desc("Number of strictly ordered loads")
+        ;
+
+    numSOStores
+        .name(name() + ".numSOStores")
+        .desc("Number of strictly ordered stores")
+        ;
+
+    dataLastTick
+        .name(name() + ".dataLastTick")
+        .desc("Last tick simulated from the elastic data trace")
+        ;
+}
+
+Tick
+TraceCPU::ElasticDataGen::init()
+{
+    DPRINTF(TraceCPUData, "Initializing data memory request generator "
+            "DcacheGen: elastic issue with retry.\n");
+
+    if (!readNextWindow())
+        panic("Trace has %d elements. It must have at least %d elements.\n",
+              depGraph.size(), 2 * windowSize);
+    DPRINTF(TraceCPUData, "After 1st read, depGraph size:%d.\n",
+            depGraph.size());
+
+    if (!readNextWindow())
+        panic("Trace has %d elements. It must have at least %d elements.\n",
+              depGraph.size(), 2 * windowSize);
+    DPRINTF(TraceCPUData, "After 2nd read, depGraph size:%d.\n",
+            depGraph.size());
+
+    // Print readyList
+    if (DTRACE(TraceCPUData)) {
+        printReadyList();
+    }
+    auto free_itr = readyList.begin();
+    DPRINTF(TraceCPUData, "Execute tick of the first dependency-free node"
+            " %lli is %d.\n", free_itr->seqNum, free_itr->execTick);
+    // Return the execute tick of the earliest ready node so that an event
+    // can be scheduled to call execute()
+    return (free_itr->execTick);
+}
+void
+TraceCPU::ElasticDataGen::exit()
+{
+    trace.reset();
+}
+
+bool
+TraceCPU::ElasticDataGen::readNextWindow()
+{
+
+    // Read and add next window
+    DPRINTF(TraceCPUData, "Reading next window from file.\n");
+
+    if (traceComplete) {
+        // We are at the end of the file, thus we have no more records.
+        // Return false.
+        return false;
+    }
+
+    DPRINTF(TraceCPUData, "Start read: Size of depGraph is %d.\n",
+            depGraph.size());
+
+    uint32_t num_read = 0;
+    while (num_read != windowSize) {
+
+        // Create a new graph node
+        GraphNode* new_node = new GraphNode;
+
+        // Read the next line to get the next record. If that fails then
+        // the end of the trace has been reached and traceComplete needs to
+        // be set in addition to returning false.
+        if (!trace.read(new_node)) {
+            DPRINTF(TraceCPUData, "\tTrace complete!\n");
+            traceComplete = true;
+            return false;
+        }
+
+        // Annotate the ROB dependencies of the new node onto the parent
+        // nodes.
+        addDepsOnParent(new_node, new_node->robDep, new_node->numRobDep);
+        // Annotate the register dependencies of the new node onto the
+        // parent nodes.
+        addDepsOnParent(new_node, new_node->regDep, new_node->numRegDep);
+
+        num_read++;
+        // Add to map
+        depGraph[new_node->seqNum] = new_node;
+        if (new_node->numRobDep == 0 && new_node->numRegDep == 0) {
+            // Source dependencies are already complete, check if resources
+            // are available and issue. The execution time is approximated
+            // to current time plus the computational delay.
+            checkAndIssue(new_node);
+        }
+    }
+
+    DPRINTF(TraceCPUData, "End read: Size of depGraph is %d.\n",
+            depGraph.size());
+    return true;
+}
+
+template<typename T> void
+TraceCPU::ElasticDataGen::addDepsOnParent(GraphNode *new_node,
+                                          T& dep_array, uint8_t& num_dep)
+{
+    for (auto& a_dep : dep_array) {
+        // The convention is to set the dependencies starting with the first
+        // index in the ROB and register dependency arrays. Thus, when we
+        // reach a dependency equal to the initialisation value of zero, we
+        // know we have iterated over all dependencies and can break.
+        if (a_dep == 0)
+            break;
+        // We look up the valid dependency, i.e. the parent of this node
+        auto parent_itr = depGraph.find(a_dep);
+        if (parent_itr != depGraph.end()) {
+            // If the parent is found, it is yet to be executed. Append a
+            // pointer to the new node to the dependents list of the parent
+            // node.
+            parent_itr->second->dependents.push_back(new_node);
+            auto num_depts = parent_itr->second->dependents.size();
+            maxDependents = std::max<double>(num_depts,
+                                             maxDependents.value());
+        } else {
+            // The dependency is not found in the graph. So consider the
+            // execution of the parent complete, i.e. remove this
+            // dependency.
+            a_dep = 0;
+            num_dep--;
+        }
+    }
+}
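The parent lookup above has a subtle point: a dependency whose parent has already retired out of the graph is treated as complete. A toy Python model of that pruning step (the names are illustrative, not the C++ API):

```python
# Toy model of addDepsOnParent(): register the child with parents that
# are still in flight, and drop dependencies on already-retired parents.
def add_deps_on_parent(dep_graph, new_node, dep_list):
    remaining = []
    for dep in dep_list:
        parent = dep_graph.get(dep)
        if parent is not None:
            # Parent not yet executed: the child must wait for it.
            parent.dependents.append(new_node)
            remaining.append(dep)
        # else: parent already completed, the dependency is vacuously done
    return remaining  # analogous to zeroing entries and decrementing num_dep
```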
+void
+TraceCPU::ElasticDataGen::execute()
+{
+    DPRINTF(TraceCPUData, "Execute start occupancy:\n");
+    DPRINTFR(TraceCPUData, "\tdepGraph = %d, readyList = %d, "
+             "depFreeQueue = %d ,", depGraph.size(), readyList.size(),
+             depFreeQueue.size());
+    hwResource.printOccupancy();
+
+    // Read next window to make sure that dependents of all dep-free nodes
+    // are in the depGraph
+    if (nextRead) {
+        readNextWindow();
+        nextRead = false;
+    }
+
+    // First attempt to issue the pending dependency-free nodes held
+    // in depFreeQueue. If resources have become available for a node,
+    // then issue it, i.e. add the node to readyList.
+    while (!depFreeQueue.empty()) {
+        if (checkAndIssue(depFreeQueue.front(), false)) {
+            DPRINTF(TraceCPUData, "Removing from depFreeQueue: seq. num "
+                    "%lli.\n", (depFreeQueue.front())->seqNum);
+            depFreeQueue.pop();
+        } else {
+            break;
+        }
+    }
+    // Proceed to execute from readyList
+    auto graph_itr = depGraph.begin();
+    auto free_itr = readyList.begin();
+    // Iterate through readyList until the end of the list is reached or
+    // the next free node has its execute tick later than curTick
+    while (free_itr != readyList.end() && free_itr->execTick <= curTick()) {
+
+        // Get pointer to the node to be executed
+        graph_itr = depGraph.find(free_itr->seqNum);
+        assert(graph_itr != depGraph.end());
+        GraphNode* node_ptr = graph_itr->second;
+
+        // If there is a retryPkt, send that; else execute the load
+        if (retryPkt) {
+            // The retryPkt must be the request that was created by the
+            // first node in the readyList.
+            if (retryPkt->req->getReqInstSeqNum() != node_ptr->seqNum) {
+                panic("Retry packet's sequence number does not match "
+                      "the first node in the readyList.\n");
+            }
+            if (port.sendTimingReq(retryPkt)) {
+                ++numRetrySucceeded;
+                retryPkt = nullptr;
+            }
+        } else if (node_ptr->isLoad || node_ptr->isStore) {
+            // If there is no retryPkt, attempt to send a memory request in
+            // case of a load or store node. If the send fails,
+            // executeMemReq() returns a packet pointer, which we save in
+            // retryPkt. In case of a comp node we don't do anything and
+            // simply continue as if the execution of the comp node
+            // succeeded.
+            retryPkt = executeMemReq(node_ptr);
+        }
+        // If the retryPkt or a new load/store node failed, we exit from
+        // here as a retry from the cache will bring the control back to
+        // execute(). The first node in readyList will then be the failed
+        // node.
+        if (retryPkt) {
+            break;
+        }
+
+        // Proceed to remove dependencies for the successfully executed
+        // node. If it is a load which is not strictly ordered and we sent
+        // a request for it successfully, we do not yet mark any register
+        // dependencies complete. But as per dependency modelling we need
+        // to mark ROB dependencies of load and non load/store nodes which
+        // are based on successful sending of the load as complete.
+        if (node_ptr->isLoad && !node_ptr->isStrictlyOrdered()) {
+            // If execute succeeded mark its dependents as complete
+            DPRINTF(TraceCPUData, "Node seq. num %lli sent. Waking up "
+                    "dependents..\n", node_ptr->seqNum);
+
+            auto child_itr = (node_ptr->dependents).begin();
+            while (child_itr != (node_ptr->dependents).end()) {
+                // ROB dependency of a store on a load must not be removed
+                // after the load is sent but after its response is received
+                if (!(*child_itr)->isStore &&
+                    (*child_itr)->removeRobDep(node_ptr->seqNum)) {
+
+                    // Check if the child node has become dependency free
+                    if ((*child_itr)->numRobDep == 0 &&
+                        (*child_itr)->numRegDep == 0) {
+
+                        // Source dependencies are complete, check if
+                        // resources are available and issue
+                        checkAndIssue(*child_itr);
+                    }
+                    // Remove this child for the sent load and point to new
+                    // location of the element following the erased element
+                    child_itr = node_ptr->dependents.erase(child_itr);
+                } else {
+                    // This child is not dependency-free, point to the next
+                    // child
+                    child_itr++;
+                }
+            }
+        } else {
+            // If it is a strictly ordered load mark its dependents as
+            // complete as we do not send a request for this case. If it is
+            // a store or a comp node we also mark all its dependents
+            // complete.
+            DPRINTF(TraceCPUData, "Node seq. num %lli done. Waking"
+                    " up dependents..\n", node_ptr->seqNum);
+
+            for (auto child : node_ptr->dependents) {
+                // If the child node is dependency free removeDepOnInst()
+                // returns true.
+                if (child->removeDepOnInst(node_ptr->seqNum)) {
+                    // Source dependencies are complete, check if resources
+                    // are available and issue
+                    checkAndIssue(child);
+                }
+            }
+        }
+
+        // After executing the node, remove from readyList and delete node.
+        readyList.erase(free_itr);
+        // If it is a cacheable load which was sent, don't delete
+        // just yet. Delete it in completeMemAccess() after the
+        // response is received. If it is a strictly ordered
+        // load, it was not sent and all dependencies were simply
+        // marked complete. Thus it is safe to delete it. For
+        // stores and non load/store nodes all dependencies were
+        // marked complete so it is safe to delete it.
+        if (!node_ptr->isLoad || node_ptr->isStrictlyOrdered()) {
+            // Release all resources occupied by the completed node
+            hwResource.release(node_ptr);
+            // clear the dynamically allocated set of dependents
+            (node_ptr->dependents).clear();
+            // delete node
+            delete node_ptr;
+            // remove from graph
+            depGraph.erase(graph_itr);
+        }
+        // Point to first node to continue to next iteration of while loop
+        free_itr = readyList.begin();
+    } // end of while loop
+
+    // Print readyList, sizes of queues and resource status after updating
+    if (DTRACE(TraceCPUData)) {
+        printReadyList();
+        DPRINTF(TraceCPUData, "Execute end occupancy:\n");
+        DPRINTFR(TraceCPUData, "\tdepGraph = %d, readyList = %d, "
+                 "depFreeQueue = %d ,", depGraph.size(), readyList.size(),
+                 depFreeQueue.size());
+        hwResource.printOccupancy();
+    }
+
+    if (retryPkt) {
+        DPRINTF(TraceCPUData, "Not scheduling an event as expecting a retry"
+                " event from the cache for seq. num %lli.\n",
+                retryPkt->req->getReqInstSeqNum());
+        return;
+    }
+    // If the size of the dependency graph is less than the dependency
+    // window then read from the trace file to populate the graph next time
+    // we are in execute.
+    if (depGraph.size() < windowSize && !traceComplete)
+        nextRead = true;
+
+    // If the cache is not blocked, schedule an event for the first
+    // execTick in readyList, else a retry from the cache will schedule the
+    // event. If the readyList is empty then check if the next pending node
+    // has resources available to issue. If yes, then schedule an event for
+    // the next cycle.
+    if (!readyList.empty()) {
+        Tick next_event_tick = std::max(readyList.begin()->execTick,
+                                        curTick());
+        DPRINTF(TraceCPUData, "Attempting to schedule @%lli.\n",
+                next_event_tick);
+        owner.schedDcacheNextEvent(next_event_tick);
+    } else if (readyList.empty() && !depFreeQueue.empty() &&
+               hwResource.isAvailable(depFreeQueue.front())) {
+        DPRINTF(TraceCPUData, "Attempting to schedule @%lli.\n",
+                owner.clockEdge(Cycles(1)));
+        owner.schedDcacheNextEvent(owner.clockEdge(Cycles(1)));
+    }
+
+    // If the trace is completely read, readyList is empty and depGraph is
+    // empty, set execComplete to true
+    if (depGraph.empty() && readyList.empty() && traceComplete &&
+        !hwResource.awaitingResponse()) {
+        DPRINTF(TraceCPUData, "\tExecution Complete!\n");
+        execComplete = true;
+        dataLastTick = curTick();
+    }
+}
+
+PacketPtr
+TraceCPU::ElasticDataGen::executeMemReq(GraphNode* node_ptr)
+{
+
+    DPRINTF(TraceCPUData, "Executing memory request %lli (addr %d, pc %#x, "
+            "size %d, flags %d).\n", node_ptr->seqNum, node_ptr->addr,
+            node_ptr->pc, node_ptr->size, node_ptr->flags);
+
+    // If the request is strictly ordered, do not send it. Just return
+    // nullptr as if it was successfully sent.
+    if (node_ptr->isStrictlyOrdered()) {
+        node_ptr->isLoad ? ++numSOLoads : ++numSOStores;
+        DPRINTF(TraceCPUData, "Skipping strictly ordered request %lli.\n",
+                node_ptr->seqNum);
+        return nullptr;
+    }
+
+    // Check if the request spans two cache lines as this condition
+    // triggers an assert fail in the L1 cache. If it does then truncate
+    // the size to access only until the end of that line and ignore the
+    // remainder. The stat counting this is useful to keep a check on how
+    // frequently this happens. If required the code could be revised to
+    // mimic splitting such a request into two.
+    unsigned blk_size = owner.cacheLineSize();
+    Addr blk_offset = (node_ptr->addr & (Addr)(blk_size - 1));
+    if (!(blk_offset + node_ptr->size <= blk_size)) {
+        node_ptr->size = blk_size - blk_offset;
+        ++numSplitReqs;
+    }
+
+    // Create a request and the packet containing request
+    Request* req = new Request(node_ptr->addr, node_ptr->size,
+                               node_ptr->flags, masterID, node_ptr->seqNum,
+                               ContextID(0), ThreadID(0));
+    req->setPC(node_ptr->pc);
+    PacketPtr pkt;
+    uint8_t* pkt_data = new uint8_t[req->getSize()];
+    if (node_ptr->isLoad) {
+        pkt = Packet::createRead(req);
+    } else {
+        pkt = Packet::createWrite(req);
+        memset(pkt_data, 0xA, req->getSize());
+    }
+    pkt->dataDynamic(pkt_data);
+
+    // Call MasterPort method to send a timing request for this packet
+    bool success = port.sendTimingReq(pkt);
+    ++numSendAttempted;
+
+    if (!success) {
+        // If it fails, return the packet to retry when a retry is
+        // signalled by the cache
+        ++numSendFailed;
+        DPRINTF(TraceCPUData, "Send failed. Saving packet for retry.\n");
+        return pkt;
+    } else {
+        // If it succeeds, return nullptr
+        ++numSendSucceeded;
+        return nullptr;
+    }
+}
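The cache-line truncation in executeMemReq() is simple modular arithmetic. A small illustration of the check, assuming a 64-byte cache line:

```python
# Illustration of the split-request truncation in executeMemReq().
blk_size = 64                       # assumed cache line size in bytes
addr, size = 0x1038, 16             # example request crossing a line
blk_offset = addr & (blk_size - 1)  # 0x38 = 56
if blk_offset + size > blk_size:
    size = blk_size - blk_offset    # truncated to 8 bytes
print(size)                         # -> 8
```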
"S" : "C"), + node_ptr->robNum); + } + + // Check if resources are available to issue the specific node + if (hwResource.isAvailable(node_ptr)) { + // If resources are free only then add to readyList + DPRINTFR(TraceCPUData, "\t\tResources available for seq. num %lli. Adding" + " to readyList, occupying resources.\n", node_ptr->seqNum); + // Compute the execute tick by adding the compute delay for the node + // and add the ready node to the ready list + addToSortedReadyList(node_ptr->seqNum, + owner.clockEdge() + node_ptr->compDelay); + // Account for the resources taken up by this issued node. + hwResource.occupy(node_ptr); + return true; + + } else { + if (first) { + // Although dependencies are complete, resources are not available. + DPRINTFR(TraceCPUData, "\t\tResources unavailable for seq. num %lli." + " Adding to depFreeQueue.\n", node_ptr->seqNum); + depFreeQueue.push(node_ptr); + } else { + DPRINTFR(TraceCPUData, "\t\tResources unavailable for seq. num %lli. " + "Still pending issue.\n", node_ptr->seqNum); + } + return false; + } +} + +void +TraceCPU::ElasticDataGen::completeMemAccess(PacketPtr pkt) +{ + // Release the resources for this completed node. + if (pkt->isWrite()) { + // Consider store complete. + hwResource.releaseStoreBuffer(); + // If it is a store response then do nothing since we do not model + // dependencies on store completion in the trace. But if we were + // blocking execution due to store buffer fullness, we need to schedule + // an event and attempt to progress. + } else { + // If it is a load response then release the dependents waiting on it. + // Get pointer to the completed load + auto graph_itr = depGraph.find(pkt->req->getReqInstSeqNum()); + assert(graph_itr != depGraph.end()); + GraphNode* node_ptr = graph_itr->second; + + // Release resources occupied by the load + hwResource.release(node_ptr); + + DPRINTF(TraceCPUData, "Load seq. num %lli response received. Waking up" + " dependents..\n", node_ptr->seqNum); + + for (auto child : node_ptr->dependents) { + if (child->removeDepOnInst(node_ptr->seqNum)) { + checkAndIssue(child); + } + } + + // clear the dynamically allocated set of dependents + (node_ptr->dependents).clear(); + // delete node + delete node_ptr; + // remove from graph + depGraph.erase(graph_itr); + } + + if (DTRACE(TraceCPUData)) { + printReadyList(); + } + + // If the size of the dependency graph is less than the dependency window + // then read from the trace file to populate the graph next time we are in + // execute. + if (depGraph.size() < windowSize && !traceComplete) + nextRead = true; + + // If not waiting for retry, attempt to schedule next event + if (!retryPkt) { + // We might have new dep-free nodes in the list which will have execute + // tick greater than or equal to curTick. But a new dep-free node might + // have its execute tick earlier. Therefore, attempt to reschedule. It + // could happen that the readyList is empty and we got here via a + // last remaining response. So, either the trace is complete or there + // are pending nodes in the depFreeQueue. The checking is done in the + // execute() control flow, so schedule an event to go via that flow. + Tick next_event_tick = readyList.empty() ? 
owner.clockEdge(Cycles(1)) : + std::max(readyList.begin()->execTick, owner.clockEdge(Cycles(1))); + DPRINTF(TraceCPUData, "Attempting to schedule @%lli.\n", + next_event_tick); + owner.schedDcacheNextEvent(next_event_tick); + } +} + +void +TraceCPU::ElasticDataGen::addToSortedReadyList(NodeSeqNum seq_num, + Tick exec_tick) +{ + ReadyNode ready_node; + ready_node.seqNum = seq_num; + ready_node.execTick = exec_tick; + + // Iterator to readyList + auto itr = readyList.begin(); + + // If the readyList is empty, simply insert the new node at the beginning + // and return + if (itr == readyList.end()) { + readyList.insert(itr, ready_node); + maxReadyListSize = std::max<double>(readyList.size(), + maxReadyListSize.value()); + return; + } + + // If the new node has its execution tick equal to the first node in the + // list then go to the next node. If the first node in the list failed + // to execute, its position as the first is thus maintained. + if (retryPkt) + if (retryPkt->req->getReqInstSeqNum() == itr->seqNum) + itr++; + + // Increment the iterator and compare the node pointed to by it to the new + // node till the position to insert the new node is found. + bool found = false; + while (!found && itr != readyList.end()) { + // If the execution tick of the new node is less than the node then + // this is the position to insert + if (exec_tick < itr->execTick) + found = true; + // If the execution tick of the new node is equal to the node then + // sort in ascending order of sequence numbers + else if (exec_tick == itr->execTick) { + // If the sequence number of the new node is less than the node + // then this is the position to insert + if (seq_num < itr->seqNum) + found = true; + // Else go to next node + else + itr++; + } + // If the execution tick of the new node is greater than the node then + // go to the next node + else + itr++; + } + readyList.insert(itr, ready_node); + // Update the stat for max size reached of the readyList + maxReadyListSize = std::max<double>(readyList.size(), + maxReadyListSize.value()); +} + +void +TraceCPU::ElasticDataGen::printReadyList() { + + auto itr = readyList.begin(); + if (itr == readyList.end()) { + DPRINTF(TraceCPUData, "readyList is empty.\n"); + return; + } + DPRINTF(TraceCPUData, "Printing readyList:\n"); + while (itr != readyList.end()) { + auto graph_itr = depGraph.find(itr->seqNum); + GraphNode* node_ptr M5_VAR_USED = graph_itr->second; + DPRINTFR(TraceCPUData, "\t%lld(%s), %lld\n", itr->seqNum, + node_ptr->isLoad ? "L" : (node_ptr->isStore ? "S" : "C"), + itr->execTick); + itr++; + } +} + +TraceCPU::ElasticDataGen::HardwareResource::HardwareResource( + uint16_t max_rob, uint16_t max_stores, uint16_t max_loads) + : sizeROB(max_rob), + sizeStoreBuffer(max_stores), + sizeLoadBuffer(max_loads), + oldestInFlightRobNum(UINT64_MAX), + numInFlightLoads(0), + numInFlightStores(0) +{} + +void +TraceCPU::ElasticDataGen::HardwareResource::occupy(const GraphNode* new_node) +{ + // Occupy ROB entry for the issued node + // Merely maintain the oldest node, i.e. numerically least robNum by saving + // it in the variable oldestInFLightRobNum. 
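addToSortedReadyList() keeps readyList sorted by execute tick, breaking ties by sequence number so that older instructions issue first (the C++ version additionally keeps a failed retry node pinned at the head). The same ordering invariant can be expressed with tuples, as an illustration:

```python
import bisect

# readyList ordering modelled as (exec_tick, seq_num) tuples. bisect
# preserves exactly the invariant addToSortedReadyList maintains:
# ascending exec_tick, ties broken by ascending seq_num.
ready_list = [(100, 3), (100, 7), (120, 5)]
bisect.insort(ready_list, (100, 4))
assert ready_list == [(100, 3), (100, 4), (100, 7), (120, 5)]
```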
+TraceCPU::ElasticDataGen::HardwareResource::HardwareResource(
+    uint16_t max_rob, uint16_t max_stores, uint16_t max_loads)
+  : sizeROB(max_rob),
+    sizeStoreBuffer(max_stores),
+    sizeLoadBuffer(max_loads),
+    oldestInFlightRobNum(UINT64_MAX),
+    numInFlightLoads(0),
+    numInFlightStores(0)
+{}
+
+void
+TraceCPU::ElasticDataGen::HardwareResource::occupy(const GraphNode* new_node)
+{
+    // Occupy ROB entry for the issued node
+    // Merely maintain the oldest node, i.e. numerically least robNum, by
+    // saving it in the variable oldestInFlightRobNum.
+    inFlightNodes[new_node->seqNum] = new_node->robNum;
+    oldestInFlightRobNum = inFlightNodes.begin()->second;
+
+    // Occupy Load/Store Buffer entry for the issued node if applicable
+    if (new_node->isLoad) {
+        ++numInFlightLoads;
+    } else if (new_node->isStore) {
+        ++numInFlightStores;
+    } // else if it is a non load/store node, no buffer entry is occupied
+
+    printOccupancy();
+}
+
+void
+TraceCPU::ElasticDataGen::HardwareResource::release(const GraphNode* done_node)
+{
+    assert(!inFlightNodes.empty());
+    DPRINTFR(TraceCPUData, "\tClearing done seq. num %d from "
+             "inFlightNodes..\n", done_node->seqNum);
+
+    assert(inFlightNodes.find(done_node->seqNum) != inFlightNodes.end());
+    inFlightNodes.erase(done_node->seqNum);
+
+    if (inFlightNodes.empty()) {
+        // If we delete the only in-flight node then the
+        // oldestInFlightRobNum is set to its initialized (max) value.
+        oldestInFlightRobNum = UINT64_MAX;
+    } else {
+        // Set the oldest in-flight node rob number equal to the first node
+        // in the inFlightNodes since that will have the numerically least
+        // value.
+        oldestInFlightRobNum = inFlightNodes.begin()->second;
+    }
+
+    DPRINTFR(TraceCPUData, "\tCleared. inFlightNodes.size() = %d, "
+             "oldestInFlightRobNum = %d\n", inFlightNodes.size(),
+             oldestInFlightRobNum);
+
+    // A store is considered complete when a request is sent, thus the ROB
+    // entry is freed. But it occupies an entry in the Store Buffer until
+    // its response is received. A load is considered complete when a
+    // response is received, thus both ROB and Load Buffer entries can be
+    // released.
+    if (done_node->isLoad) {
+        assert(numInFlightLoads != 0);
+        --numInFlightLoads;
+    }
+    // For normal writes, we send the requests out and clear a store buffer
+    // entry on response. For writes which are strictly ordered, e.g.
+    // writes to device registers, we do that within release() which is
+    // called when the node is executed and taken off the readyList.
+    if (done_node->isStore && done_node->isStrictlyOrdered()) {
+        releaseStoreBuffer();
+    }
+}
+
+void
+TraceCPU::ElasticDataGen::HardwareResource::releaseStoreBuffer()
+{
+    assert(numInFlightStores != 0);
+    --numInFlightStores;
+}
+
+bool
+TraceCPU::ElasticDataGen::HardwareResource::isAvailable(
+    const GraphNode* new_node) const
+{
+    uint16_t num_in_flight_nodes;
+    if (inFlightNodes.empty()) {
+        num_in_flight_nodes = 0;
+        DPRINTFR(TraceCPUData, "\t\tChecking resources to issue seq. num"
+                 " %lli: #in-flight nodes = 0", new_node->seqNum);
+    } else if (new_node->robNum > oldestInFlightRobNum) {
+        // This is the intuitive case where the new dep-free node is a
+        // younger instruction than the oldest instruction in-flight. Thus
+        // we make sure in_flight_nodes does not overflow.
+        num_in_flight_nodes = new_node->robNum - oldestInFlightRobNum;
+        DPRINTFR(TraceCPUData, "\t\tChecking resources to issue seq. num"
+                 " %lli: #in-flight nodes = %d - %d = %d", new_node->seqNum,
+                 new_node->robNum, oldestInFlightRobNum,
+                 num_in_flight_nodes);
+    } else {
+        // This is the case where an instruction older than the oldest in-
+        // flight instruction becomes dep-free. Thus we must have already
+        // accounted for the entry in ROB for this new dep-free node.
+        // Immediately after this check returns true, oldestInFlightRobNum
+        // will be updated in occupy(). We simply let this node issue now.
+        num_in_flight_nodes = 0;
+        DPRINTFR(TraceCPUData, "\t\tChecking resources to issue seq. num"
+                 " %lli: new oldestInFlightRobNum = %d, #in-flight nodes"
+                 " ignored", new_node->seqNum, new_node->robNum);
+    }
+    DPRINTFR(TraceCPUData, ", LQ = %d/%d, SQ = %d/%d.\n",
+             numInFlightLoads, sizeLoadBuffer,
+             numInFlightStores, sizeStoreBuffer);
+    // Check if resources are available to issue the specific node
+    if (num_in_flight_nodes >= sizeROB) {
+        return false;
+    }
+    if (new_node->isLoad && numInFlightLoads >= sizeLoadBuffer) {
+        return false;
+    }
+    if (new_node->isStore && numInFlightStores >= sizeStoreBuffer) {
+        return false;
+    }
+    return true;
+}
+
+bool
+TraceCPU::ElasticDataGen::HardwareResource::awaitingResponse() const {
+    // Return true if there is at least one read or write request in flight
+    return (numInFlightStores != 0 || numInFlightLoads != 0);
+}
+
+void
+TraceCPU::ElasticDataGen::HardwareResource::printOccupancy() {
+    DPRINTFR(TraceCPUData, "oldestInFlightRobNum = %d, "
+             "LQ = %d/%d, SQ = %d/%d.\n",
+             oldestInFlightRobNum,
+             numInFlightLoads, sizeLoadBuffer,
+             numInFlightStores, sizeStoreBuffer);
+}
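The ROB occupancy estimate in isAvailable() is just the distance between the new node's ROB number and the oldest one in flight, clamped at zero for the late-waking older-node case. A toy Python rendering of the three resource checks (the defaults mirror the Python parameter defaults above; names are illustrative):

```python
# Toy version of HardwareResource::isAvailable().
def is_available(node, oldest_rob_num, in_flight_loads, in_flight_stores,
                 size_rob=40, size_lq=16, size_sq=16):
    # ROB occupancy: distance from the oldest in-flight node, clamped at 0.
    # With oldest_rob_num at its "empty" sentinel (2**64 - 1), this is 0.
    occupancy = max(0, node.rob_num - oldest_rob_num)
    if occupancy >= size_rob:
        return False
    if node.is_load and in_flight_loads >= size_lq:
        return False
    if node.is_store and in_flight_stores >= size_sq:
        return False
    return True
```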
+ DPRINTF(TraceCPUInst, "Packet sent successfully, trying to read next " + "element.\n"); + retryPkt = nullptr; + // Read next element into currElement, currElement gets cleared so save the + // tick to calculate delta + Tick last_tick = currElement.tick; + if (nextExecute()) { + assert(currElement.tick >= last_tick); + delta = currElement.tick - last_tick; + } + return !traceComplete; +} + +void +TraceCPU::FixedRetryGen::exit() +{ + trace.reset(); +} + +bool +TraceCPU::FixedRetryGen::nextExecute() +{ + if (traceComplete) + // We are at the end of the file, thus we have no more messages. + // Return false. + return false; + + + //Reset the currElement to the default values + currElement.clear(); + + // Read the next line to get the next message. If that fails then end of + // trace has been reached and traceComplete needs to be set in addition + // to returning false. If successful then next message is in currElement. + if (!trace.read(&currElement)) { + traceComplete = true; + instLastTick = curTick(); + return false; + } + + DPRINTF(TraceCPUInst, "inst fetch: %c addr %d pc %#x size %d tick %d\n", + currElement.cmd.isRead() ? 'r' : 'w', + currElement.addr, + currElement.pc, + currElement.blocksize, + currElement.tick); + + return true; +} + +bool +TraceCPU::FixedRetryGen::send(Addr addr, unsigned size, const MemCmd& cmd, + Request::FlagsType flags, Addr pc) +{ + + // Create new request + Request* req = new Request(addr, size, flags, masterID); + req->setPC(pc); + + // If this is not done it triggers assert in L1 cache for invalid contextId + req->setThreadContext(ContextID(0), ThreadID(0)); + + // Embed it in a packet + PacketPtr pkt = new Packet(req, cmd); + + uint8_t* pkt_data = new uint8_t[req->getSize()]; + pkt->dataDynamic(pkt_data); + + if (cmd.isWrite()) { + memset(pkt_data, 0xA, req->getSize()); + } + + // Call MasterPort method to send a timing request for this packet + bool success = port.sendTimingReq(pkt); + if (!success) { + // If it fails, save the packet to retry when a retry is signalled by + // the cache + retryPkt = pkt; + } + return success; +} + +void +TraceCPU::icacheRetryRecvd() +{ + // Schedule an event to go through the control flow in the same tick as + // retry is received + DPRINTF(TraceCPUInst, "Icache retry received. Scheduling next IcacheGen" + " event @%lli.\n", curTick()); + schedule(icacheNextEvent, curTick()); +} + +void +TraceCPU::dcacheRetryRecvd() +{ + // Schedule an event to go through the execute flow in the same tick as + // retry is received + DPRINTF(TraceCPUData, "Dcache retry received. Scheduling next DcacheGen" + " event @%lli.\n", curTick()); + schedule(dcacheNextEvent, curTick()); +} + +void +TraceCPU::schedDcacheNextEvent(Tick when) +{ + if (!dcacheNextEvent.scheduled()) { + DPRINTF(TraceCPUData, "Scheduling next DcacheGen event at %lli.\n", + when); + schedule(dcacheNextEvent, when); + ++numSchedDcacheEvent; + } else if (when < dcacheNextEvent.when()) { + DPRINTF(TraceCPUData, "Re-scheduling next dcache event from %lli" + " to %lli.\n", dcacheNextEvent.when(), when); + reschedule(dcacheNextEvent, when); + } + +} + +bool +TraceCPU::IcachePort::recvTimingResp(PacketPtr pkt) +{ + // All responses on the instruction fetch side are ignored. 
+bool
+TraceCPU::IcachePort::recvTimingResp(PacketPtr pkt)
+{
+    // All responses on the instruction fetch side are ignored. Simply
+    // delete the request and packet to free allocated memory.
+    delete pkt->req;
+    delete pkt;
+
+    return true;
+}
+
+void
+TraceCPU::IcachePort::recvReqRetry()
+{
+    owner->icacheRetryRecvd();
+}
+
+void
+TraceCPU::dcacheRecvTimingResp(PacketPtr pkt)
+{
+    DPRINTF(TraceCPUData, "Received timing response from Dcache.\n");
+    dcacheGen.completeMemAccess(pkt);
+}
+
+bool
+TraceCPU::DcachePort::recvTimingResp(PacketPtr pkt)
+{
+    // Handle the responses for data memory requests which is done inside
+    // the elastic data generator
+    owner->dcacheRecvTimingResp(pkt);
+    // After processing the response delete the request and packet to free
+    // memory
+    delete pkt->req;
+    delete pkt;
+
+    return true;
+}
+
+void
+TraceCPU::DcachePort::recvReqRetry()
+{
+    owner->dcacheRetryRecvd();
+}
+
+TraceCPU::ElasticDataGen::InputStream::InputStream(const std::string& filename)
+    : trace(filename),
+      microOpCount(0)
+{
+    // Create a protobuf message for the header and read it from the stream
+    ProtoMessage::InstDepRecordHeader header_msg;
+    if (!trace.read(header_msg)) {
+        panic("Failed to read packet header from %s\n", filename);
+    } else if (header_msg.tick_freq() != SimClock::Frequency) {
+        panic("Trace %s was recorded with a different tick frequency %d\n",
+              filename, header_msg.tick_freq());
+    } else {
+        // Assign window size equal to the field in the trace that was
+        // recorded when the data dependency trace was captured in the
+        // o3cpu model
+        windowSize = header_msg.window_size();
+    }
+}
+
+void
+TraceCPU::ElasticDataGen::InputStream::reset()
+{
+    trace.reset();
+}
+
+bool
+TraceCPU::ElasticDataGen::InputStream::read(GraphNode* element)
+{
+    ProtoMessage::InstDepRecord pkt_msg;
+    if (trace.read(pkt_msg)) {
+        // Required fields
+        element->seqNum = pkt_msg.seq_num();
+        element->isLoad = pkt_msg.load();
+        element->isStore = pkt_msg.store();
+        element->compDelay = pkt_msg.comp_delay();
+
+        // Repeated field robDepList
+        element->clearRobDep();
+        assert((pkt_msg.rob_dep()).size() <= element->maxRobDep);
+        for (int i = 0; i < (pkt_msg.rob_dep()).size(); i++) {
+            element->robDep[element->numRobDep] = pkt_msg.rob_dep(i);
+            element->numRobDep += 1;
+        }
+
+        // Repeated field
+        element->clearRegDep();
+        assert((pkt_msg.reg_dep()).size() <= TheISA::MaxInstSrcRegs);
+        for (int i = 0; i < (pkt_msg.reg_dep()).size(); i++) {
+            // There is a possibility that an instruction has both a
+            // register and an order dependency on another instruction. In
+            // such a case, the register dependency is omitted.
+            bool duplicate = false;
+            for (int j = 0; j < element->numRobDep; j++) {
+                duplicate |= (pkt_msg.reg_dep(i) == element->robDep[j]);
+            }
+            if (!duplicate) {
+                element->regDep[element->numRegDep] = pkt_msg.reg_dep(i);
+                element->numRegDep += 1;
+            }
+        }
+
+        // Optional fields
+        if (pkt_msg.has_addr())
+            element->addr = pkt_msg.addr();
+        else
+            element->addr = 0;
+
+        if (pkt_msg.has_size())
+            element->size = pkt_msg.size();
+        else
+            element->size = 0;
+
+        if (pkt_msg.has_flags())
+            element->flags = pkt_msg.flags();
+        else
+            element->flags = 0;
+
+        if (pkt_msg.has_pc())
+            element->pc = pkt_msg.pc();
+        else
+            element->pc = 0;
+
+        // ROB occupancy number
+        ++microOpCount;
+        if (pkt_msg.has_weight()) {
+            microOpCount += pkt_msg.weight();
+        }
+        element->robNum = microOpCount;
+        return true;
+    }
+
+    // We have reached the end of the file
+    return false;
+}
+
+bool
+TraceCPU::ElasticDataGen::GraphNode::removeRegDep(NodeSeqNum reg_dep)
+{
+    for (auto& own_reg_dep : regDep) {
+        if (own_reg_dep == reg_dep) {
+            // If the register dependency is found, make it zero and return
+            // true
+            own_reg_dep = 0;
+            --numRegDep;
+            assert(numRegDep >= 0);
+            DPRINTFR(TraceCPUData, "\tFor %lli: Marking register dependency"
+                     " %lli done.\n", seqNum, reg_dep);
+            return true;
+        }
+    }
+
+    // Return false if the dependency is not found
+    return false;
+}
+
+bool
+TraceCPU::ElasticDataGen::GraphNode::removeRobDep(NodeSeqNum rob_dep)
+{
+    for (auto& own_rob_dep : robDep) {
+        if (own_rob_dep == rob_dep) {
+            // If the ROB dependency is found, make it zero and return true
+            own_rob_dep = 0;
+            --numRobDep;
+            assert(numRobDep >= 0);
+            DPRINTFR(TraceCPUData, "\tFor %lli: Marking ROB dependency %lli"
+                     " done.\n", seqNum, rob_dep);
+            return true;
+        }
+    }
+    return false;
+}
+
+void
+TraceCPU::ElasticDataGen::GraphNode::clearRegDep() {
+    for (auto& own_reg_dep : regDep) {
+        own_reg_dep = 0;
+    }
+    numRegDep = 0;
+}
+
+void
+TraceCPU::ElasticDataGen::GraphNode::clearRobDep() {
+    for (auto& own_rob_dep : robDep) {
+        own_rob_dep = 0;
+    }
+    numRobDep = 0;
+}
+
+bool
+TraceCPU::ElasticDataGen::GraphNode::removeDepOnInst(NodeSeqNum done_seq_num)
+{
+    // If it is a ROB dependency then remove it
+    if (!removeRobDep(done_seq_num)) {
+        // If it is not a ROB dependency then it must be a register
+        // dependency. If the register dependency is not found, it violates
+        // an assumption and must be caught by assert.
+        bool regdep_found M5_VAR_USED = removeRegDep(done_seq_num);
+        assert(regdep_found);
+    }
+    // Return true if the node is dependency free
+    return (numRobDep == 0 && numRegDep == 0);
+}
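InputStream::read() above drops a register dependency when the same parent is already recorded as an order (ROB) dependency, so each parent is waited on only once. A toy rendering of that filter:

```python
# Toy model of the duplicate filter in ElasticDataGen::InputStream::read():
# a parent already present as a ROB dependency is not added again as a
# register dependency.
def merge_deps(rob_deps, reg_deps):
    return [d for d in reg_deps if d not in rob_deps]

assert merge_deps([10, 12], [12, 15]) == [15]
```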
"True" : "False")); + if (isLoad || isStore) { + DPRINTFR(TraceCPUData, ",%i", addr); + DPRINTFR(TraceCPUData, ",%i", size); + DPRINTFR(TraceCPUData, ",%i", flags); + } + DPRINTFR(TraceCPUData, ",%lli", compDelay); + int i = 0; + DPRINTFR(TraceCPUData, "robDep:"); + while (robDep[i] != 0) { + DPRINTFR(TraceCPUData, ",%lli", robDep[i]); + i++; + } + i = 0; + DPRINTFR(TraceCPUData, "regDep:"); + while (regDep[i] != 0) { + DPRINTFR(TraceCPUData, ",%lli", regDep[i]); + i++; + } + auto child_itr = dependents.begin(); + DPRINTFR(TraceCPUData, "dependents:"); + while (child_itr != dependents.end()) { + DPRINTFR(TraceCPUData, ":%lli", (*child_itr)->seqNum); + child_itr++; + } + + DPRINTFR(TraceCPUData, "\n"); +} + +TraceCPU::FixedRetryGen::InputStream::InputStream(const std::string& filename) + : trace(filename) +{ + // Create a protobuf message for the header and read it from the stream + ProtoMessage::PacketHeader header_msg; + if (!trace.read(header_msg)) { + panic("Failed to read packet header from %s\n", filename); + + if (header_msg.tick_freq() != SimClock::Frequency) { + panic("Trace %s was recorded with a different tick frequency %d\n", + header_msg.tick_freq()); + } + } +} + +void +TraceCPU::FixedRetryGen::InputStream::reset() +{ + trace.reset(); +} + +bool +TraceCPU::FixedRetryGen::InputStream::read(TraceElement* element) +{ + ProtoMessage::Packet pkt_msg; + if (trace.read(pkt_msg)) { + element->cmd = pkt_msg.cmd(); + element->addr = pkt_msg.addr(); + element->blocksize = pkt_msg.size(); + element->tick = pkt_msg.tick(); + element->flags = pkt_msg.has_flags() ? pkt_msg.flags() : 0; + element->pc = pkt_msg.has_pc() ? pkt_msg.pc() : 0; + return true; + } + + // We have reached the end of the file + return false; +} diff --git a/src/cpu/trace/trace_cpu.hh b/src/cpu/trace/trace_cpu.hh new file mode 100644 index 000000000..3a869ebe0 --- /dev/null +++ b/src/cpu/trace/trace_cpu.hh @@ -0,0 +1,1101 @@ +/* + * Copyright (c) 2013 - 2015 ARM Limited + * All rights reserved + * + * The license below extends only to copyright in the software and shall + * not be construed as granting a license to any other intellectual + * property including but not limited to intellectual property relating + * to a hardware implementation of the functionality of the software + * licensed hereunder. You may use the software subject to the license + * terms below provided that you ensure that this notice is replicated + * unmodified and in its entirety in all distributions of the software, + * modified or unmodified, in source code or in binary form. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
diff --git a/src/cpu/trace/trace_cpu.hh b/src/cpu/trace/trace_cpu.hh
new file mode 100644
index 000000000..3a869ebe0
--- /dev/null
+++ b/src/cpu/trace/trace_cpu.hh
@@ -0,0 +1,1101 @@
+/*
+ * Copyright (c) 2013 - 2015 ARM Limited
+ * All rights reserved
+ *
+ * The license below extends only to copyright in the software and shall
+ * not be construed as granting a license to any other intellectual
+ * property including but not limited to intellectual property relating
+ * to a hardware implementation of the functionality of the software
+ * licensed hereunder. You may use the software subject to the license
+ * terms below provided that you ensure that this notice is replicated
+ * unmodified and in its entirety in all distributions of the software,
+ * modified or unmodified, in source code or in binary form.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Radhika Jagtap
+ *          Andreas Hansson
+ *          Thomas Grass
+ */
+
+#ifndef __CPU_TRACE_TRACE_CPU_HH__
+#define __CPU_TRACE_TRACE_CPU_HH__
+
+#include <array>
+#include <cstdint>
+#include <queue>
+#include <set>
+#include <unordered_map>
+
+#include "arch/registers.hh"
+#include "base/statistics.hh"
+#include "cpu/base.hh"
+#include "debug/TraceCPUData.hh"
+#include "debug/TraceCPUInst.hh"
+#include "params/TraceCPU.hh"
+#include "proto/inst_dep_record.pb.h"
+#include "proto/packet.pb.h"
+#include "proto/protoio.hh"
+#include "sim/sim_events.hh"
+
+/**
+ * The trace cpu replays traces generated using the elastic trace probe
+ * attached to the O3 CPU model. The elastic trace is an execution trace
+ * with register data dependencies and ordering dependencies annotated to
+ * it. The trace cpu also replays a fixed timestamp fetch trace that is
+ * also generated by the elastic trace probe. This trace cpu model aims at
+ * achieving faster simulation compared to the detailed cpu model and good
+ * correlation when the same trace is used for playback on different
+ * memory sub-systems.
+ *
+ * The TraceCPU inherits from BaseCPU so some virtual methods need to be
+ * defined. It has two port subclasses inherited from MasterPort for
+ * instruction and data ports. It issues the memory requests deducing the
+ * timing from the trace and without performing real execution of
+ * micro-ops. As soon as the last dependency for an instruction is
+ * complete, its computational delay, also provided in the input trace, is
+ * added. The dependency-free nodes are maintained in a list, called
+ * 'ReadyList', ordered by ready time. Instructions which depend on a load
+ * stall until the responses for read requests are received, thus
+ * achieving elastic replay. If the dependency is not found when adding a
+ * new node, it is assumed complete. Thus, if this node is found to be
+ * completely dependency-free, its issue time is calculated and it is
+ * added to the ready list immediately. This is encapsulated in the
+ * subclass ElasticDataGen.
+ *
+ * If ready nodes are issued in an unconstrained way there can be more
+ * nodes outstanding which results in divergence in timing compared to the
+ * O3CPU. Therefore, the Trace CPU also models hardware resources. A
+ * sub-class to model hardware resources contains the maximum sizes of
+ * load buffer, store buffer and ROB. If resources are not available, the
+ * node is not issued. Such nodes that are pending issue are held in the
+ * 'depFreeQueue' structure.
+ *
+ * Modeling the ROB size in the Trace CPU as a resource limitation is
+ * arguably the most important parameter of all resources. The ROB
+ * occupancy is estimated using the newly added field 'robNum'. We need to
+ * use the ROB number, as the sequence number is at times much higher due
+ * to squashing, and trace replay is focused on correct path modeling.
+ *
+ * A map called 'inFlightNodes' is added to track nodes that are not only
+ * in the readyList but also load nodes that are executed (and thus
+ * removed from readyList) but are not complete. ReadyList handles what
+ * and when to execute the next node while the inFlightNodes is used for
+ * resource modelling. The oldest ROB number is updated when any node
+ * occupies the ROB or when an entry in the ROB is released. The ROB
+ * occupancy is equal to the difference in the ROB number of the newly
+ * dependency-free node and the oldest ROB number in flight.
+ *
+ * If no node depends on a non load/store node then there is no reason to
+ * track it in the dependency graph. We filter out such nodes but count
+ * them and add a weight field to the subsequent node that we do include
+ * in the trace. The weight field is used to model ROB occupancy during
+ * replay.
+ *
+ * The depFreeQueue is chosen to be FIFO so that child nodes which are in
+ * program order get pushed into it in that order and thus issued in
+ * program order, like in the O3CPU. This is also why the dependents
+ * container was changed from std::set to a sequential container,
+ * std::vector. We only check the head of the depFreeQueue as nodes are
+ * issued in order and blocking on the head models that better than
+ * looping over the entire queue. An alternative choice would be to
+ * inspect the top N pending nodes where N is the issue-width. This is
+ * left for the future as the timing correlation looks good as it is.
+ *
+ * At the start of an execution event, first we attempt to issue such
+ * pending nodes by checking if appropriate resources have become
+ * available. If yes, we compute the execute tick with respect to the time
+ * then. Then we proceed to complete nodes from the readyList.
+ *
+ * When a read response is received, sometimes a dependency on it that was
+ * supposed to be released when it was issued is still not released. This
+ * occurs because the dependent gets added to the graph after the read was
+ * sent. So the check is made less strict and the dependency is marked
+ * complete on read response instead of insisting that it should have been
+ * removed on read sent.
+ *
+ * There is a check for requests spanning two cache lines as this
+ * condition triggers an assert fail in the L1 cache. If it does then
+ * truncate the size to access only until the end of that line and ignore
+ * the remainder. Strictly-ordered requests are skipped and the
+ * dependencies on such requests are handled by simply marking them
+ * complete immediately.
+ *
+ * The simulated seconds can be calculated as the difference between the
+ * final_tick stat and the tickOffset stat. A CountedExitEvent that
+ * contains a static int belonging to the Trace CPU class as a down
+ * counter is used to implement multi Trace CPU simulation exit.
+ */
+
+class TraceCPU : public BaseCPU
+{
+
+  public:
+    TraceCPU(TraceCPUParams *params);
+    ~TraceCPU();
+
+    void init();
+
+    /**
+     * This is a pure virtual function in BaseCPU. As we don't know how
+     * many insts are in the trace but only how many micro-ops there are,
+     * we cannot count this stat.
+     *
+     * @return 0
+     */
+    Counter totalInsts() const
+    {
+        return 0;
+    }
+
+    /**
+     * Return totalOps as the number of committed micro-ops plus the
+     * speculatively issued loads that are modelled in the TraceCPU
+     * replay.
+     *
+     * @return number of micro-ops i.e. nodes in the elastic data generator
+     */
+    Counter totalOps() const
+    {
+        return dcacheGen.getMicroOpCount();
+    }
+
+    /* Pure virtual function in BaseCPU. Does nothing. */
+    void wakeup(ThreadID tid = 0)
+    {
+        return;
+    }
+
+    /*
+     * When resuming from checkpoint in FS mode, the TraceCPU takes over from
+     * the old CPU. This function overrides the takeOverFrom() function in
+     * BaseCPU. It unbinds the ports of the old CPU and binds the ports of
+     * the TraceCPU.
+     */
+    void takeOverFrom(BaseCPU *oldCPU);
+
+    /**
+     * When the instruction cache port receives a retry, schedule the
+     * icacheNextEvent.
+     */
+    void icacheRetryRecvd();
+
+    /**
+     * When the data cache port receives a retry, schedule the
+     * dcacheNextEvent.
+     */
+    void dcacheRetryRecvd();
+
+    /**
+     * When the data cache port receives a response, this calls the dcache
+     * generator's completeMemAccess() method to complete the load writeback.
+     *
+     * @param pkt Pointer to packet received
+     */
+    void dcacheRecvTimingResp(PacketPtr pkt);
+
+    /**
+     * Schedule the dcacheNextEvent at the given tick.
+     *
+     * @param when Tick at which to schedule the event
+     */
+    void schedDcacheNextEvent(Tick when);
+
+  protected:
+
+    /**
+     * IcachePort class that interfaces with the L1 Instruction Cache.
+     */
+    class IcachePort : public MasterPort
+    {
+      public:
+        /** Default constructor. */
+        IcachePort(TraceCPU* _cpu)
+            : MasterPort(_cpu->name() + ".icache_port", _cpu),
+              owner(_cpu)
+        { }
+
+      public:
+        /**
+         * Receive the timing response and simply delete the packet, since
+         * instruction fetch requests are issued as per the timing in the
+         * trace and responses are ignored.
+         *
+         * @param pkt Pointer to packet received
+         * @return true
+         */
+        bool recvTimingResp(PacketPtr pkt);
+
+        /**
+         * Required functionally, but does nothing.
+         *
+         * @param pkt Pointer to packet received
+         */
+        void recvTimingSnoopReq(PacketPtr pkt) { }
+
+        /**
+         * Handle a retry signalled by the cache if the instruction read
+         * failed in the first attempt.
+         */
+        void recvReqRetry();
+
+      private:
+        TraceCPU* owner;
+    };
+
+    /**
+     * DcachePort class that interfaces with the L1 Data Cache.
+     */
+    class DcachePort : public MasterPort
+    {
+
+      public:
+        /** Default constructor. */
+        DcachePort(TraceCPU* _cpu)
+            : MasterPort(_cpu->name() + ".dcache_port", _cpu),
+              owner(_cpu)
+        { }
+
+      public:
+
+        /**
+         * Receive the timing response and call dcacheRecvTimingResp() on
+         * the owner to handle completing the load.
+         *
+         * @param pkt Pointer to packet received
+         * @return true
+         */
+        bool recvTimingResp(PacketPtr pkt);
+
+        /**
+         * Required functionally, but does nothing.
+         *
+         * @param pkt Pointer to packet received
+         */
+        void recvTimingSnoopReq(PacketPtr pkt)
+        { }
+
+        /**
+         * Required functionally, but does nothing.
+         *
+         * @param pkt Pointer to packet received
+         */
+        void recvFunctionalSnoop(PacketPtr pkt)
+        { }
+
+        /**
+         * Handle a retry signalled by the cache if the data access failed
+         * in the first attempt.
+         */
+        void recvReqRetry();
+
+        /**
+         * Required functionally.
+         *
+         * @return true since we have to snoop
+         */
+        bool isSnooping() const { return true; }
+
+      private:
+        TraceCPU* owner;
+    };
+
+    /** Port to connect to the L1 instruction cache. */
+    IcachePort icachePort;
+
+    /** Port to connect to the L1 data cache. */
+    DcachePort dcachePort;
+
+    /** Master ID for instruction read requests. */
+    const MasterID instMasterID;
+
+    /** Master ID for data read and write requests. */
+    const MasterID dataMasterID;
+
+    /** File names for the input instruction and data traces. */
+    std::string instTraceFile, dataTraceFile;
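+
+    /*
+     * A minimal sketch (assumed, not the committed definitions) of how the
+     * retry callbacks of the two ports above chain back into the CPU: the
+     * cache calls recvReqRetry() on the port, which forwards to the owner
+     * so that the corresponding next event can be rescheduled.
+     * @code
+     * void TraceCPU::IcachePort::recvReqRetry() { owner->icacheRetryRecvd(); }
+     * void TraceCPU::DcachePort::recvReqRetry() { owner->dcacheRetryRecvd(); }
+     * @endcode
+     */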
+
+    /**
+     * Generator to read a protobuf trace containing memory requests at
+     * fixed timestamps, perform flow control and issue memory requests. If
+     * the L1 cache port sends the packet successfully, determine the tick
+     * to send the next packet; otherwise wait for a retry from the cache.
+     */
+    class FixedRetryGen
+    {
+
+      private:
+
+        /**
+         * This struct stores a line in the trace file.
+         */
+        struct TraceElement {
+
+            /** Specifies if the request is to be a read or a write */
+            MemCmd cmd;
+
+            /** The address for the request */
+            Addr addr;
+
+            /** The size of the access for the request */
+            Addr blocksize;
+
+            /** The time at which the request should be sent */
+            Tick tick;
+
+            /** Potential request flags to use */
+            Request::FlagsType flags;
+
+            /** Instruction PC */
+            Addr pc;
+
+            /**
+             * Check validity of this element.
+             *
+             * @return true if this element is valid
+             */
+            bool isValid() const {
+                return cmd != MemCmd::InvalidCmd;
+            }
+
+            /**
+             * Make this element invalid.
+             */
+            void clear() {
+                cmd = MemCmd::InvalidCmd;
+            }
+        };
+
+        /**
+         * The InputStream encapsulates a trace file and the internal
+         * buffers and populates TraceElements based on the input.
+         */
+        class InputStream
+        {
+
+          private:
+
+            // Input file stream for the protobuf trace
+            ProtoInputStream trace;
+
+          public:
+
+            /**
+             * Create a trace input stream for a given file name.
+             *
+             * @param filename Path to the file to read from
+             */
+            InputStream(const std::string& filename);
+
+            /**
+             * Reset the stream such that it can be played once again.
+             */
+            void reset();
+
+            /**
+             * Attempt to read a trace element from the stream, and also
+             * notify the caller if the end of the file was reached.
+             *
+             * @param element Trace element to populate
+             * @return True if an element could be read successfully
+             */
+            bool read(TraceElement* element);
+        };
+
+      public:
+        /* Constructor */
+        FixedRetryGen(TraceCPU& _owner, const std::string& _name,
+                      MasterPort& _port, MasterID master_id,
+                      const std::string& trace_file)
+            : owner(_owner),
+              port(_port),
+              masterID(master_id),
+              trace(trace_file),
+              genName(owner.name() + ".fixedretry" + _name),
+              retryPkt(nullptr),
+              delta(0),
+              traceComplete(false)
+        {
+        }
+
+        /**
+         * Called from TraceCPU init(). Reads the first message from the
+         * input trace file and returns the send tick.
+         *
+         * @return Tick when the first packet must be sent
+         */
+        Tick init();
+
+        /**
+         * This tries to send the current or retry packet and returns true
+         * if successful. It calls nextExecute() to read the next message.
+         *
+         * @return bool true if the packet is sent successfully
+         */
+        bool tryNext();
+
+        /** Returns the name of the FixedRetryGen instance. */
+        const std::string& name() const { return genName; }
+
+        /**
+         * Creates a new request, assigning the request parameters passed by
+         * the arguments. Calls the port's sendTimingReq() and returns true
+         * if the packet was sent successfully. It is called by tryNext().
+         *
+         * @param addr address of the request
+         * @param size size of the request
+         * @param cmd whether it is a read or write request
+         * @param flags associated request flags
+         * @param pc instruction PC that generated the request
+         *
+         * @return true if the packet was sent successfully
+         */
+        bool send(Addr addr, unsigned size, const MemCmd& cmd,
+                  Request::FlagsType flags, Addr pc);
+
+        /** Exit the FixedRetryGen. */
+        void exit();
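+
+        /*
+         * A plausible sketch of what send() does, under assumed gem5 APIs
+         * of this era; it is illustrative rather than a verbatim copy of
+         * the implementation:
+         * @code
+         * Request* req = new Request(addr, size, flags, masterID);
+         * // (the instruction PC would also be recorded on the request)
+         * PacketPtr pkt = new Packet(req, cmd);
+         * pkt->dataDynamic(new uint8_t[size]);
+         * if (!port.sendTimingReq(pkt))
+         *     retryPkt = pkt; // keep the packet for a retry
+         * @endcode
+         */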
+
+        /**
+         * Reads a line of the trace file and stores it as the next element
+         * to send.
+         *
+         * @return bool false if the end of the file has been reached, true
+         *         otherwise
+         */
+        bool nextExecute();
+
+        /**
+         * Returns the traceComplete variable, which is set when the end of
+         * the input trace file is reached.
+         *
+         * @return bool true if traceComplete is set, false otherwise.
+         */
+        bool isTraceComplete() { return traceComplete; }
+
+        int64_t tickDelta() { return delta; }
+
+        void regStats();
+
+      private:
+
+        /** Reference to the TraceCPU. */
+        TraceCPU& owner;
+
+        /** Reference to the port used to issue memory requests. */
+        MasterPort& port;
+
+        /** MasterID used for the requests being sent. */
+        const MasterID masterID;
+
+        /** Input stream used for reading the input trace file. */
+        InputStream trace;
+
+        /** String to store the name of the FixedRetryGen. */
+        std::string genName;
+
+        /** PacketPtr used to store the packet to retry. */
+        PacketPtr retryPkt;
+
+        /**
+         * Stores the difference in the send ticks of the current and last
+         * packets. It is kept signed so that an overflow to a negative
+         * value is caught by assert(delta > 0).
+         */
+        int64_t delta;
+
+        /**
+         * Set to true when the end of the trace is reached.
+         */
+        bool traceComplete;
+
+        /** Store an element read from the trace to send as the next packet. */
+        TraceElement currElement;
+
+        /** Stats for instruction accesses replayed. */
+        Stats::Scalar numSendAttempted;
+        Stats::Scalar numSendSucceeded;
+        Stats::Scalar numSendFailed;
+        Stats::Scalar numRetrySucceeded;
+        /** Last tick simulated by the FixedRetryGen */
+        Stats::Scalar instLastTick;
+
+    };
+
+    /**
+     * The elastic data memory request generator reads a protobuf trace
+     * containing an execution trace annotated with data and ordering
+     * dependencies. It deduces the time at which to send a load/store
+     * request by tracking the dependencies. It attempts to send a memory
+     * request for a load/store without performing real execution of
+     * micro-ops. If the L1 cache port sends the packet successfully, the
+     * generator checks which instructions became dependency-free as a
+     * result and schedules an event accordingly. If it fails to send the
+     * packet, it waits for a retry from the cache.
+     */
+    class ElasticDataGen
+    {
+
+      private:
+
+        /** Node sequence number type. */
+        typedef uint64_t NodeSeqNum;
+
+        /** Node ROB number type. */
+        typedef uint64_t NodeRobNum;
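+
+        /*
+         * An illustrative example, with hypothetical values, for the
+         * GraphNode defined below: a store with seqNum 7 may carry two
+         * order dependencies, e.g. robDep = {5, 6}, plus register
+         * dependencies in regDep on the producers of its source registers.
+         * The node becomes dependency-free only after removeDepOnInst() has
+         * been called for each of those producers.
+         */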
+
+        /**
+         * The struct GraphNode stores an instruction in the trace file. The
+         * format of the trace file favours constructing a dependency graph
+         * of the execution, and this struct is used to encapsulate the
+         * request data as well as pointers to its dependent GraphNodes.
+         */
+        class GraphNode {
+
+          public:
+            /**
+             * The maximum number of ROB dependencies. A store can have at
+             * most two order dependencies; a load or a compute node can
+             * have at most one.
+             */
+            static const uint8_t maxRobDep = 2;
+
+            /** Typedef for the array containing the ROB dependencies */
+            typedef std::array<NodeSeqNum, maxRobDep> RobDepArray;
+
+            /** Typedef for the array containing the register dependencies */
+            typedef std::array<NodeSeqNum, TheISA::MaxInstSrcRegs> RegDepArray;
+
+            /** Instruction sequence number */
+            NodeSeqNum seqNum;
+
+            /** ROB occupancy number */
+            NodeRobNum robNum;
+
+            /** If the instruction is a load */
+            bool isLoad;
+
+            /** If the instruction is a store */
+            bool isStore;
+
+            /** The address for the request, if any */
+            Addr addr;
+
+            /** Size of the request, if any */
+            uint32_t size;
+
+            /** Request flags, if any */
+            Request::Flags flags;
+
+            /** Instruction PC */
+            Addr pc;
+
+            /** Array of order dependencies. */
+            RobDepArray robDep;
+
+            /** Number of order dependencies */
+            uint8_t numRobDep;
+
+            /** Computational delay */
+            uint64_t compDelay;
+
+            /**
+             * Array of register dependencies (incoming), if any. The
+             * maximum number of source registers is used to set the maximum
+             * size of the array.
+             */
+            RegDepArray regDep;
+
+            /** Number of register dependencies */
+            uint8_t numRegDep;
+
+            /**
+             * A vector of nodes dependent (outgoing) on this node. A
+             * sequential container is chosen because when dependents become
+             * free, they attempt to issue in program order.
+             */
+            std::vector<GraphNode *> dependents;
+
+            /** Initialize the register dependency array to all zeroes */
+            void clearRegDep();
+
+            /** Initialize the ROB (order) dependency array to all zeroes */
+            void clearRobDep();
+
+            /** Remove a completed instruction from the register dependency array */
+            bool removeRegDep(NodeSeqNum reg_dep);
+
+            /** Remove a completed instruction from the order dependency array */
+            bool removeRobDep(NodeSeqNum rob_dep);
+
+            /** Check for all dependencies on the completed inst */
+            bool removeDepOnInst(NodeSeqNum done_seq_num);
+
+            /** Return true if the node has a request which is strictly ordered */
+            bool isStrictlyOrdered() const {
+                return (flags.isSet(Request::STRICT_ORDER));
+            }
+
+            /**
+             * Write out the element in trace-compatible format using debug
+             * flag TraceCPUData.
+             */
+            void writeElementAsTrace() const;
+        };
+
+        /** Struct to store a ready-to-execute node and its execution tick. */
+        struct ReadyNode
+        {
+            /** The sequence number of the ready node */
+            NodeSeqNum seqNum;
+
+            /** The tick at which the ready node must be executed */
+            Tick execTick;
+        };
+
+        /**
+         * The HardwareResource class models structures that hold the
+         * in-flight nodes. When a node becomes dependency-free, first check
+         * if resources are available to issue it.
+         */
+        class HardwareResource
+        {
+          public:
+            /**
+             * Constructor that initializes the sizes of the structures.
+             *
+             * @param max_rob size of the Reorder Buffer
+             * @param max_stores size of the Store Buffer
+             * @param max_loads size of the Load Buffer
+             */
+            HardwareResource(uint16_t max_rob, uint16_t max_stores,
+                             uint16_t max_loads);
+
+            /**
+             * Occupy the appropriate structures for an issued node.
+             *
+             * @param new_node pointer to the issued node
+             */
+            void occupy(const GraphNode* new_node);
+
+            /**
+             * Release the appropriate structures for a completed node.
+             *
+             * @param done_node pointer to the completed node
+             */
+            void release(const GraphNode* done_node);
+
+            /** Release a store buffer entry for a completed store */
+            void releaseStoreBuffer();
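+
+            /*
+             * A minimal sketch (an assumption, not the committed code) of
+             * the availability check declared below, combining the ROB
+             * occupancy estimate with the load/store buffer limits:
+             * @code
+             * bool can_issue =
+             *     (new_node->robNum - oldestInFlightRobNum) < sizeROB &&
+             *     (!new_node->isLoad || numInFlightLoads < sizeLoadBuffer) &&
+             *     (!new_node->isStore || numInFlightStores < sizeStoreBuffer);
+             * @endcode
+             */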
+
+            /**
+             * Check if the structures required to issue a node are free.
+             *
+             * @param new_node pointer to the node ready to issue
+             * @return true if resources are available
+             */
+            bool isAvailable(const GraphNode* new_node) const;
+
+            /**
+             * Check if there are any outstanding requests, i.e. requests
+             * for which we are yet to receive a response.
+             *
+             * @return true if there is at least one read or write request
+             *         outstanding
+             */
+            bool awaitingResponse() const;
+
+            /** Print resource occupancy for debugging */
+            void printOccupancy();
+
+          private:
+            /**
+             * The size of the ROB used to throttle the max. number of
+             * in-flight nodes.
+             */
+            const uint16_t sizeROB;
+
+            /**
+             * The size of the store buffer. This is used to throttle the
+             * max. number of in-flight stores.
+             */
+            const uint16_t sizeStoreBuffer;
+
+            /**
+             * The size of the load buffer. This is used to throttle the
+             * max. number of in-flight loads.
+             */
+            const uint16_t sizeLoadBuffer;
+
+            /**
+             * A map from the sequence number to the ROB number of the
+             * in-flight nodes. This includes all nodes that are in the
+             * readyList plus the loads for which a request has been sent
+             * but which are not present in the readyList. Such loads are
+             * not yet complete and thus occupy resources. We need to query
+             * the oldest in-flight node, and since a map container keeps
+             * all its keys sorted using the less-than criterion, the first
+             * element is the in-flight node with the least sequence number,
+             * i.e. the oldest in-flight node.
+             */
+            std::map<NodeSeqNum, NodeRobNum> inFlightNodes;
+
+            /** The ROB number of the oldest in-flight node */
+            NodeRobNum oldestInFlightRobNum;
+
+            /** Number of ready loads for which a request may or may not be sent */
+            uint16_t numInFlightLoads;
+
+            /** Number of ready stores for which a request may or may not be sent */
+            uint16_t numInFlightStores;
+        };
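+
+        /*
+         * Illustrative (assumed, not the committed code) query for the
+         * oldest in-flight node using the sorted inFlightNodes map above:
+         * @code
+         * // inside HardwareResource, once inFlightNodes is non-empty:
+         * oldestInFlightRobNum = inFlightNodes.begin()->second;
+         * @endcode
+         */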
+
+        /**
+         * The InputStream encapsulates a trace file and the internal
+         * buffers and populates GraphNodes based on the input.
+         */
+        class InputStream
+        {
+
+          private:
+
+            /** Input file stream for the protobuf trace */
+            ProtoInputStream trace;
+
+            /** Count of committed ops read from the trace plus the filtered ops */
+            uint64_t microOpCount;
+
+            /**
+             * The window size that is read from the header of the protobuf
+             * trace and used to process the dependency trace
+             */
+            uint32_t windowSize;
+
+          public:
+
+            /**
+             * Create a trace input stream for a given file name.
+             *
+             * @param filename Path to the file to read from
+             */
+            InputStream(const std::string& filename);
+
+            /**
+             * Reset the stream such that it can be played once again.
+             */
+            void reset();
+
+            /**
+             * Attempt to read a trace element from the stream, and also
+             * notify the caller if the end of the file was reached.
+             *
+             * @param element Trace element to populate
+             * @return True if an element could be read successfully
+             */
+            bool read(GraphNode* element);
+
+            /** Get the window size from the trace */
+            uint32_t getWindowSize() const { return windowSize; }
+
+            /** Get the number of micro-ops modelled in the TraceCPU replay */
+            uint64_t getMicroOpCount() const { return microOpCount; }
+        };
+
+      public:
+        /* Constructor */
+        ElasticDataGen(TraceCPU& _owner, const std::string& _name,
+                       MasterPort& _port, MasterID master_id,
+                       const std::string& trace_file, uint16_t max_rob,
+                       uint16_t max_stores, uint16_t max_loads)
+            : owner(_owner),
+              port(_port),
+              masterID(master_id),
+              trace(trace_file),
+              genName(owner.name() + ".elastic" + _name),
+              retryPkt(nullptr),
+              traceComplete(false),
+              nextRead(false),
+              execComplete(false),
+              windowSize(trace.getWindowSize()),
+              hwResource(max_rob, max_stores, max_loads)
+        {
+            DPRINTF(TraceCPUData, "Window size in the trace is %d.\n",
+                    windowSize);
+        }
+
+        /**
+         * Called from TraceCPU init(). Reads the first message from the
+         * input trace file and returns the send tick.
+         *
+         * @return Tick when the first packet must be sent
+         */
+        Tick init();
+
+        /** Returns the name of the ElasticDataGen instance. */
+        const std::string& name() const { return genName; }
+
+        /** Exit the ElasticDataGen. */
+        void exit();
+
+        /**
+         * Reads the next window of instructions from the trace file and
+         * populates the dependency graph.
+         *
+         * @return bool false if the end of the file has been reached, true
+         *         otherwise
+         */
+        bool readNextWindow();
+
+        /**
+         * Iterate over the dependencies of a new node and add the new node
+         * to the list of dependents of the parent node.
+         *
+         * @tparam T type of the dependency array, ROB or register
+         * @param new_node new node to add to the graph
+         * @param dep_array the dependency array to be iterated, which may
+         *        get modified
+         * @param num_dep the number of dependencies set in the array, which
+         *        may get modified during iteration
+         */
+        template<typename T> void addDepsOnParent(GraphNode *new_node,
+                                                  T& dep_array,
+                                                  uint8_t& num_dep);
+
+        /**
+         * This is the main execute function, which consumes nodes from the
+         * sorted readyList. First attempt to issue the pending
+         * dependency-free nodes held in the depFreeQueue, inserting the
+         * ready-to-issue nodes into the readyList. Then iterate through the
+         * readyList and, when a node has its execute tick equal to
+         * curTick(), execute it. If the node is a load or a store, call
+         * executeMemReq(); if it is neither, simply mark it complete.
+         */
+        void execute();
+
+        /**
+         * Creates a new request for a load or store, assigning the request
+         * parameters. Calls the port's sendTimingReq() and returns a packet
+         * if the send failed, so that it can be saved for a retry.
+         *
+         * @param node_ptr pointer to the load or store node to be executed
+         *
+         * @return packet pointer if the request failed and nullptr if it
+         *         was sent successfully
+         */
+        PacketPtr executeMemReq(GraphNode* node_ptr);
+
+        /**
+         * Add a ready node to the readyList. When inserting, ensure the
+         * nodes are sorted in ascending order of their execute ticks.
+         *
+         * @param seq_num seq. num of the ready node
+         * @param exec_tick the execute tick of the ready node
+         */
+        void addToSortedReadyList(NodeSeqNum seq_num, Tick exec_tick);
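+
+        /*
+         * A minimal sketch (an assumption, not the committed code) of the
+         * sorted insert performed by addToSortedReadyList():
+         * @code
+         * auto itr = readyList.begin();
+         * while (itr != readyList.end() && itr->execTick <= exec_tick)
+         *     ++itr;
+         * readyList.insert(itr, ReadyNode{seq_num, exec_tick});
+         * @endcode
+         */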
+
+        /** Print the readyList for debugging, using debug flag TraceCPUData. */
+        void printReadyList();
+
+        /**
+         * When a load writeback is received, that is when the load
+         * completes, release the dependents on it. This is called from the
+         * dcache port recvTimingResp().
+         */
+        void completeMemAccess(PacketPtr pkt);
+
+        /**
+         * Returns the execComplete variable, which is set when the last
+         * node is executed.
+         *
+         * @return bool true if execComplete is set, false otherwise.
+         */
+        bool isExecComplete() const { return execComplete; }
+
+        /**
+         * Attempts to issue a node once the node's source dependencies are
+         * complete. If resources are available, then add it to the
+         * readyList; otherwise the node is not issued and is stored in the
+         * depFreeQueue until resources become available.
+         *
+         * @param node_ptr pointer to the node to be issued
+         * @param first true if this is the first attempt to issue this node
+         * @return true if the node was added to the readyList
+         */
+        bool checkAndIssue(const GraphNode* node_ptr, bool first = true);
+
+        /** Get the number of micro-ops modelled in the TraceCPU replay */
+        uint64_t getMicroOpCount() const { return trace.getMicroOpCount(); }
+
+        void regStats();
+
+      private:
+
+        /** Reference to the TraceCPU. */
+        TraceCPU& owner;
+
+        /** Reference to the port used to issue memory requests. */
+        MasterPort& port;
+
+        /** MasterID used for the requests being sent. */
+        const MasterID masterID;
+
+        /** Input stream used for reading the input trace file. */
+        InputStream trace;
+
+        /** String to store the name of the ElasticDataGen. */
+        std::string genName;
+
+        /** PacketPtr used to store the packet to retry. */
+        PacketPtr retryPkt;
+
+        /** Set to true when the end of the trace is reached. */
+        bool traceComplete;
+
+        /** Set to true when the next window of instructions needs to be read */
+        bool nextRead;
+
+        /** Set to true when execution of the trace is complete */
+        bool execComplete;
+
+        /**
+         * Window size within which to check for dependencies. Its value is
+         * made equal to the window size used to generate the trace, which
+         * is recorded in the trace header. The dependency graph must be
+         * populated enough such that when a node completes, its potential
+         * child node can be found and the dependency removed before the
+         * completed node itself is removed. Thus, as soon as the graph
+         * shrinks to become smaller than this window, we read in the next
+         * window.
+         */
+        const uint32_t windowSize;
+
+        /**
+         * Hardware resources required to contain the in-flight nodes and to
+         * throttle the issuing of new nodes when resources are not
+         * available.
+         */
+        HardwareResource hwResource;
+
+        /** Store the depGraph of GraphNodes */
+        std::unordered_map<NodeSeqNum, GraphNode*> depGraph;
+
+        /**
+         * Queue of dependency-free nodes that are pending issue because
+         * resources are not available. This is chosen to be FIFO so that
+         * dependent nodes which become free in program order get pushed
+         * into the queue in that order. Thus nodes are more likely to issue
+         * in program order.
+         */
+        std::queue<const GraphNode*> depFreeQueue;
+
+        /** List of nodes that are ready to execute */
+        std::list<ReadyNode> readyList;
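+
+        /*
+         * A sketch (an assumption, not the committed code) of the head-only
+         * issue policy on depFreeQueue described in the class comment,
+         * typically run at the start of execute():
+         * @code
+         * while (!depFreeQueue.empty()) {
+         *     const GraphNode* node_ptr = depFreeQueue.front();
+         *     if (!hwResource.isAvailable(node_ptr))
+         *         break; // block on the head to preserve program order
+         *     depFreeQueue.pop();
+         *     checkAndIssue(node_ptr, false);
+         * }
+         * @endcode
+         */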
+
+        /** Stats for data memory accesses replayed. */
+        Stats::Scalar maxDependents;
+        Stats::Scalar maxReadyListSize;
+        Stats::Scalar numSendAttempted;
+        Stats::Scalar numSendSucceeded;
+        Stats::Scalar numSendFailed;
+        Stats::Scalar numRetrySucceeded;
+        Stats::Scalar numSplitReqs;
+        Stats::Scalar numSOLoads;
+        Stats::Scalar numSOStores;
+        /** Tick when the ElasticDataGen completes execution */
+        Stats::Scalar dataLastTick;
+    };
+
+    /** Instance of FixedRetryGen to replay instruction read requests. */
+    FixedRetryGen icacheGen;
+
+    /** Instance of ElasticDataGen to replay data read and write requests. */
+    ElasticDataGen dcacheGen;
+
+    /**
+     * This is the control flow that uses the functionality of the icacheGen
+     * to replay the trace. It calls tryNext(). If it returns true, then the
+     * next event is scheduled at curTick() plus delta. If it returns false,
+     * then delta is ignored and control is brought back via recvReqRetry().
+     */
+    void schedIcacheNext();
+
+    /**
+     * This is the control flow that uses the functionality of the dcacheGen
+     * to replay the trace. It calls execute(). It checks if execution is
+     * complete and schedules an event to exit the simulation accordingly.
+     */
+    void schedDcacheNext();
+
+    /** Event for the control flow method schedIcacheNext() */
+    EventWrapper<TraceCPU, &TraceCPU::schedIcacheNext> icacheNextEvent;
+
+    /** Event for the control flow method schedDcacheNext() */
+    EventWrapper<TraceCPU, &TraceCPU::schedDcacheNext> dcacheNextEvent;
+
+    /** This is called when either generator finishes replaying its trace. */
+    void checkAndSchedExitEvent();
+
+    /** Set to true when one of the generators finishes replaying its trace. */
+    bool oneTraceComplete;
+
+    /**
+     * This stores the tick of the first instruction fetch request, which is
+     * later used for dumping the tickOffset stat.
+     */
+    Tick firstFetchTick;
+
+    /**
+     * Number of Trace CPUs in the system, used as a shared variable and
+     * passed to the CountedExitEvent event used for counting down exit
+     * events. It is incremented in the constructor call so that the total
+     * is arrived at automatically.
+     */
+    static int numTraceCPUs;
+
+    /**
+     * A CountedExitEvent which, when serviced, decrements the counter. A
+     * sim exit event is scheduled when the counter equals zero, that is,
+     * when all instances of Trace CPU have had their execCompleteEvent
+     * serviced.
+     */
+    CountedExitEvent *execCompleteEvent;
+
+    Stats::Scalar numSchedDcacheEvent;
+    Stats::Scalar numSchedIcacheEvent;
+
+    /** Stat for the number of simulated micro-ops. */
+    Stats::Scalar numOps;
+    /** Stat for the CPI. This is really cycles per micro-op and not inst. */
+    Stats::Formula cpi;
+
+    /**
+     * The first execution tick is dumped as a stat so that the simulated
+     * seconds for a trace replay can be calculated as the difference
+     * between the final_tick stat and the tickOffset stat.
+     */
+    Stats::Scalar tickOffset;
+
+  public:
+
+    /** Used to get a reference to the icache port. */
+    MasterPort &getInstPort() { return icachePort; }
+
+    /** Used to get a reference to the dcache port. */
+    MasterPort &getDataPort() { return dcachePort; }
+
+    void regStats();
+};
+#endif // __CPU_TRACE_TRACE_CPU_HH__