/*
 * Copyright (c) 2013 - 2016 ARM Limited
 * All rights reserved
 *
 * The license below extends only to copyright in the software and shall
 * not be construed as granting a license to any other intellectual
 * property including but not limited to intellectual property relating
 * to a hardware implementation of the functionality of the software
 * licensed hereunder.  You may use the software subject to the license
 * terms below provided that you ensure that this notice is replicated
 * unmodified and in its entirety in all distributions of the software,
 * modified or unmodified, in source code or in binary form.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Radhika Jagtap
 *          Andreas Hansson
 *          Thomas Grass
 */

#include "cpu/trace/trace_cpu.hh"

#include "sim/sim_exit.hh"

// Declare and initialize the static counter for number of trace CPUs.
int TraceCPU::numTraceCPUs = 0;

TraceCPU::TraceCPU(TraceCPUParams *params)
    :   BaseCPU(params),
        icachePort(this),
        dcachePort(this),
        instMasterID(params->system->getMasterId(this, "inst")),
        dataMasterID(params->system->getMasterId(this, "data")),
        instTraceFile(params->instTraceFile),
        dataTraceFile(params->dataTraceFile),
        icacheGen(*this, ".iside", icachePort, instMasterID, instTraceFile),
        dcacheGen(*this, ".dside", dcachePort, dataMasterID, dataTraceFile,
                  params),
        icacheNextEvent([this]{ schedIcacheNext(); }, name()),
        dcacheNextEvent([this]{ schedDcacheNext(); }, name()),
        oneTraceComplete(false),
        traceOffset(0),
        execCompleteEvent(nullptr),
        enableEarlyExit(params->enableEarlyExit),
        progressMsgInterval(params->progressMsgInterval),
        progressMsgThreshold(params->progressMsgInterval)
{
    // Increment static counter for number of Trace CPUs.
    ++TraceCPU::numTraceCPUs;

    // Check that the python parameters for sizes of ROB, store buffer and
    // load buffer do not overflow the corresponding C++ variables.
    fatal_if(params->sizeROB > UINT16_MAX, "ROB size set to %d exceeds the "
             "max. value of %d.\n", params->sizeROB, UINT16_MAX);
    fatal_if(params->sizeStoreBuffer > UINT16_MAX, "Store buffer size set to"
             " %d exceeds the max. value of %d.\n", params->sizeStoreBuffer,
             UINT16_MAX);
    fatal_if(params->sizeLoadBuffer > UINT16_MAX, "Load buffer size set to"
             " %d exceeds the max. value of %d.\n", params->sizeLoadBuffer,
             UINT16_MAX);
}

TraceCPU::~TraceCPU()
{
}

TraceCPU*
TraceCPUParams::create()
{
    return new TraceCPU(this);
}

void
TraceCPU::updateNumOps(uint64_t rob_num)
{
    numOps = rob_num;
    if (progressMsgInterval != 0 && numOps.value() >= progressMsgThreshold) {
        inform("%s: %i insts committed\n", name(), progressMsgThreshold);
        progressMsgThreshold += progressMsgInterval;
    }
}

void
TraceCPU::takeOverFrom(BaseCPU *oldCPU)
{
    // Unbind the ports of the old CPU and bind the ports of the TraceCPU.
    assert(!getInstPort().isConnected());
    assert(oldCPU->getInstPort().isConnected());
    BaseSlavePort &inst_peer_port = oldCPU->getInstPort().getSlavePort();
    oldCPU->getInstPort().unbind();
    getInstPort().bind(inst_peer_port);

    assert(!getDataPort().isConnected());
    assert(oldCPU->getDataPort().isConnected());
    BaseSlavePort &data_peer_port = oldCPU->getDataPort().getSlavePort();
    oldCPU->getDataPort().unbind();
    getDataPort().bind(data_peer_port);
}

void
TraceCPU::init()
{
    DPRINTF(TraceCPUInst, "Instruction fetch request trace file is \"%s\"."
            "\n", instTraceFile);
    DPRINTF(TraceCPUData, "Data memory request trace file is \"%s\".\n",
            dataTraceFile);

    BaseCPU::init();

    // Get the send tick of the first instruction read request
    Tick first_icache_tick = icacheGen.init();

    // Get the send tick of the first data read/write request
    Tick first_dcache_tick = dcacheGen.init();

    // Set the trace offset as the minimum of that in both traces
    traceOffset = std::min(first_icache_tick, first_dcache_tick);
    inform("%s: Time offset (tick) found as min of both traces is %lli.\n",
           name(), traceOffset);

    // Schedule next icache and dcache event by subtracting the offset
    schedule(icacheNextEvent, first_icache_tick - traceOffset);
    schedule(dcacheNextEvent, first_dcache_tick - traceOffset);

    // Adjust the trace offset for the dcache generator's ready nodes
    // We don't need to do this for the icache generator as it will
    // send its first request at the first event and schedule subsequent
    // events using a relative tick delta
    dcacheGen.adjustInitTraceOffset(traceOffset);

    // If the Trace CPU simulation is configured to exit on any one trace
    // completion then we don't need a counted event to count down all Trace
    // CPUs in the system. If not then instantiate a counted event.
    if (!enableEarlyExit) {
        // The static counter for number of Trace CPUs is correctly set at
        // this point so create an event and pass it.
        execCompleteEvent = new CountedExitEvent("end of all traces reached.",
                                                 numTraceCPUs);
    }
}

void
TraceCPU::schedIcacheNext()
{
    DPRINTF(TraceCPUInst, "IcacheGen event.\n");

    // Try to send the current packet or a retry packet if there is one
    bool sched_next = icacheGen.tryNext();
    // If packet sent successfully, schedule next event
    if (sched_next) {
        DPRINTF(TraceCPUInst, "Scheduling next icacheGen event "
                "at %d.\n", curTick() + icacheGen.tickDelta());
        schedule(icacheNextEvent, curTick() + icacheGen.tickDelta());
        ++numSchedIcacheEvent;
    } else {
        // check if traceComplete. If not, do nothing because sending failed
        // and next event will be scheduled via RecvRetry()
        if (icacheGen.isTraceComplete()) {
            // If this is the first trace to complete, set the variable. If it
            // is already set then both traces are complete to exit sim.
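            // (checkAndSchedExitEvent() sets oneTraceComplete on the first
            // call and only triggers the exit path on the second call, i.e.
            // once both generators have finished.)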
checkAndSchedExitEvent(); } } return; } void TraceCPU::schedDcacheNext() { DPRINTF(TraceCPUData, "DcacheGen event.\n"); // Update stat for numCycles numCycles = clockEdge() / clockPeriod(); dcacheGen.execute(); if (dcacheGen.isExecComplete()) { checkAndSchedExitEvent(); } } void TraceCPU::checkAndSchedExitEvent() { if (!oneTraceComplete) { oneTraceComplete = true; } else { // Schedule event to indicate execution is complete as both // instruction and data access traces have been played back. inform("%s: Execution complete.\n", name()); // If the replay is configured to exit early, that is when any one // execution is complete then exit immediately and return. Otherwise, // schedule the counted exit that counts down completion of each Trace // CPU. if (enableEarlyExit) { exitSimLoop("End of trace reached"); } else { schedule(*execCompleteEvent, curTick()); } } } void TraceCPU::regStats() { BaseCPU::regStats(); numSchedDcacheEvent .name(name() + ".numSchedDcacheEvent") .desc("Number of events scheduled to trigger data request generator") ; numSchedIcacheEvent .name(name() + ".numSchedIcacheEvent") .desc("Number of events scheduled to trigger instruction request generator") ; numOps .name(name() + ".numOps") .desc("Number of micro-ops simulated by the Trace CPU") ; cpi .name(name() + ".cpi") .desc("Cycles per micro-op used as a proxy for CPI") .precision(6) ; cpi = numCycles/numOps; icacheGen.regStats(); dcacheGen.regStats(); } void TraceCPU::ElasticDataGen::regStats() { using namespace Stats; maxDependents .name(name() + ".maxDependents") .desc("Max number of dependents observed on a node") ; maxReadyListSize .name(name() + ".maxReadyListSize") .desc("Max size of the ready list observed") ; numSendAttempted .name(name() + ".numSendAttempted") .desc("Number of first attempts to send a request") ; numSendSucceeded .name(name() + ".numSendSucceeded") .desc("Number of successful first attempts") ; numSendFailed .name(name() + ".numSendFailed") .desc("Number of failed first attempts") ; numRetrySucceeded .name(name() + ".numRetrySucceeded") .desc("Number of successful retries") ; numSplitReqs .name(name() + ".numSplitReqs") .desc("Number of split requests") ; numSOLoads .name(name() + ".numSOLoads") .desc("Number of strictly ordered loads") ; numSOStores .name(name() + ".numSOStores") .desc("Number of strictly ordered stores") ; dataLastTick .name(name() + ".dataLastTick") .desc("Last tick simulated from the elastic data trace") ; } Tick TraceCPU::ElasticDataGen::init() { DPRINTF(TraceCPUData, "Initializing data memory request generator " "DcacheGen: elastic issue with retry.\n"); if (!readNextWindow()) panic("Trace has %d elements. It must have at least %d elements.\n", depGraph.size(), 2 * windowSize); DPRINTF(TraceCPUData, "After 1st read, depGraph size:%d.\n", depGraph.size()); if (!readNextWindow()) panic("Trace has %d elements. 
It must have at least %d elements.\n",
              depGraph.size(), 2 * windowSize);
    DPRINTF(TraceCPUData, "After 2nd read, depGraph size:%d.\n",
            depGraph.size());

    // Print readyList
    if (DTRACE(TraceCPUData)) {
        printReadyList();
    }
    auto free_itr = readyList.begin();
    DPRINTF(TraceCPUData, "Execute tick of the first dependency free node %lli"
            " is %d.\n", free_itr->seqNum, free_itr->execTick);

    // Return the execute tick of the earliest ready node so that an event
    // can be scheduled to call execute()
    return (free_itr->execTick);
}

void
TraceCPU::ElasticDataGen::adjustInitTraceOffset(Tick& offset)
{
    for (auto& free_node : readyList) {
        free_node.execTick -= offset;
    }
}

void
TraceCPU::ElasticDataGen::exit()
{
    trace.reset();
}

bool
TraceCPU::ElasticDataGen::readNextWindow()
{
    // Read and add next window
    DPRINTF(TraceCPUData, "Reading next window from file.\n");

    if (traceComplete) {
        // We are at the end of the file, thus we have no more records.
        // Return false.
        return false;
    }

    DPRINTF(TraceCPUData, "Start read: Size of depGraph is %d.\n",
            depGraph.size());

    uint32_t num_read = 0;
    while (num_read != windowSize) {

        // Create a new graph node
        GraphNode* new_node = new GraphNode;

        // Read the next line to get the next record. If that fails then end of
        // trace has been reached and traceComplete needs to be set in addition
        // to returning false.
        if (!trace.read(new_node)) {
            DPRINTF(TraceCPUData, "\tTrace complete!\n");
            traceComplete = true;
            return false;
        }

        // Annotate the ROB dependencies of the new node onto the parent nodes.
        addDepsOnParent(new_node, new_node->robDep, new_node->numRobDep);
        // Annotate the register dependencies of the new node onto the parent
        // nodes.
        addDepsOnParent(new_node, new_node->regDep, new_node->numRegDep);

        num_read++;
        // Add to map
        depGraph[new_node->seqNum] = new_node;
        if (new_node->numRobDep == 0 && new_node->numRegDep == 0) {
            // Source dependencies are already complete, check if resources
            // are available and issue. The execution time is approximated
            // to current time plus the computational delay.
            checkAndIssue(new_node);
        }
    }

    DPRINTF(TraceCPUData, "End read: Size of depGraph is %d.\n",
            depGraph.size());

    return true;
}

template<typename T>
void
TraceCPU::ElasticDataGen::addDepsOnParent(GraphNode *new_node,
                                          T& dep_array, uint8_t& num_dep)
{
    for (auto& a_dep : dep_array) {
        // The convention is to set the dependencies starting with the first
        // index in the ROB and register dependency arrays. Thus, when we reach
        // a dependency equal to the initialisation value of zero, we know we
        // have iterated over all dependencies and can break.
        if (a_dep == 0)
            break;
        // We look up the valid dependency, i.e. the parent of this node
        auto parent_itr = depGraph.find(a_dep);
        if (parent_itr != depGraph.end()) {
            // If the parent is found, it is yet to be executed. Append a
            // pointer to the new node to the dependents list of the parent
            // node.
            parent_itr->second->dependents.push_back(new_node);
            auto num_depts = parent_itr->second->dependents.size();
            maxDependents = std::max<double>(num_depts, maxDependents.value());
        } else {
            // The dependency is not found in the graph. So consider
            // the execution of the parent is complete, i.e. remove this
            // dependency.
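            // (Typically the parent was read in an earlier window and has
            // already executed and been erased from depGraph.)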
            a_dep = 0;
            num_dep--;
        }
    }
}

void
TraceCPU::ElasticDataGen::execute()
{
    DPRINTF(TraceCPUData, "Execute start occupancy:\n");
    DPRINTFR(TraceCPUData, "\tdepGraph = %d, readyList = %d, "
             "depFreeQueue = %d ,", depGraph.size(), readyList.size(),
             depFreeQueue.size());
    hwResource.printOccupancy();

    // Read next window to make sure that dependents of all dep-free nodes
    // are in the depGraph
    if (nextRead) {
        readNextWindow();
        nextRead = false;
    }

    // First attempt to issue the pending dependency-free nodes held
    // in depFreeQueue. If resources have become available for a node,
    // then issue it, i.e. add the node to readyList.
    while (!depFreeQueue.empty()) {
        if (checkAndIssue(depFreeQueue.front(), false)) {
            DPRINTF(TraceCPUData, "Removing from depFreeQueue: seq. num "
                    "%lli.\n", (depFreeQueue.front())->seqNum);
            depFreeQueue.pop();
        } else {
            break;
        }
    }

    // Proceed to execute from readyList
    auto graph_itr = depGraph.begin();
    auto free_itr = readyList.begin();
    // Iterate through readyList until the next free node has its execute
    // tick later than curTick or the end of readyList is reached
    while (free_itr != readyList.end() && free_itr->execTick <= curTick()) {

        // Get pointer to the node to be executed
        graph_itr = depGraph.find(free_itr->seqNum);
        assert(graph_itr != depGraph.end());
        GraphNode* node_ptr = graph_itr->second;

        // If there is a retryPkt send that else execute the load
        if (retryPkt) {
            // The retryPkt must be the request that was created by the
            // first node in the readyList.
            if (retryPkt->req->getReqInstSeqNum() != node_ptr->seqNum) {
                panic("Retry packet's sequence number does not match "
                      "the first node in the readyList.\n");
            }
            if (port.sendTimingReq(retryPkt)) {
                ++numRetrySucceeded;
                retryPkt = nullptr;
            }
        } else if (node_ptr->isLoad() || node_ptr->isStore()) {
            // If there is no retryPkt, attempt to send a memory request in
            // case of a load or store node. If the send fails, executeMemReq()
            // returns a packet pointer, which we save in retryPkt. In case of
            // a comp node we don't do anything and simply continue as if the
            // execution of the comp node succeeded.
            retryPkt = executeMemReq(node_ptr);
        }
        // If the retryPkt or a new load/store node failed, we exit from here
        // as a retry from cache will bring the control to execute(). The
        // first node in readyList then, will be the failed node.
        if (retryPkt) {
            break;
        }

        // Proceed to remove dependencies for the successfully executed node.
        // If it is a load which is not strictly ordered and we sent a
        // request for it successfully, we do not yet mark any register
        // dependencies complete. But as per dependency modelling we need
        // to mark ROB dependencies of load and non load/store nodes which
        // are based on successful sending of the load as complete.
        if (node_ptr->isLoad() && !node_ptr->isStrictlyOrdered()) {
            // If execute succeeded mark its dependents as complete
            DPRINTF(TraceCPUData, "Node seq. num %lli sent. 
Waking up " "dependents..\n", node_ptr->seqNum); auto child_itr = (node_ptr->dependents).begin(); while (child_itr != (node_ptr->dependents).end()) { // ROB dependency of a store on a load must not be removed // after load is sent but after response is received if (!(*child_itr)->isStore() && (*child_itr)->removeRobDep(node_ptr->seqNum)) { // Check if the child node has become dependency free if ((*child_itr)->numRobDep == 0 && (*child_itr)->numRegDep == 0) { // Source dependencies are complete, check if // resources are available and issue checkAndIssue(*child_itr); } // Remove this child for the sent load and point to new // location of the element following the erased element child_itr = node_ptr->dependents.erase(child_itr); } else { // This child is not dependency-free, point to the next // child child_itr++; } } } else { // If it is a strictly ordered load mark its dependents as complete // as we do not send a request for this case. If it is a store or a // comp node we also mark all its dependents complete. DPRINTF(TraceCPUData, "Node seq. num %lli done. Waking" " up dependents..\n", node_ptr->seqNum); for (auto child : node_ptr->dependents) { // If the child node is dependency free removeDepOnInst() // returns true. if (child->removeDepOnInst(node_ptr->seqNum)) { // Source dependencies are complete, check if resources // are available and issue checkAndIssue(child); } } } // After executing the node, remove from readyList and delete node. readyList.erase(free_itr); // If it is a cacheable load which was sent, don't delete // just yet. Delete it in completeMemAccess() after the // response is received. If it is an strictly ordered // load, it was not sent and all dependencies were simply // marked complete. Thus it is safe to delete it. For // stores and non load/store nodes all dependencies were // marked complete so it is safe to delete it. if (!node_ptr->isLoad() || node_ptr->isStrictlyOrdered()) { // Release all resources occupied by the completed node hwResource.release(node_ptr); // clear the dynamically allocated set of dependents (node_ptr->dependents).clear(); // Update the stat for numOps simulated owner.updateNumOps(node_ptr->robNum); // delete node delete node_ptr; // remove from graph depGraph.erase(graph_itr); } // Point to first node to continue to next iteration of while loop free_itr = readyList.begin(); } // end of while loop // Print readyList, sizes of queues and resource status after updating if (DTRACE(TraceCPUData)) { printReadyList(); DPRINTF(TraceCPUData, "Execute end occupancy:\n"); DPRINTFR(TraceCPUData, "\tdepGraph = %d, readyList = %d, " "depFreeQueue = %d ,", depGraph.size(), readyList.size(), depFreeQueue.size()); hwResource.printOccupancy(); } if (retryPkt) { DPRINTF(TraceCPUData, "Not scheduling an event as expecting a retry" "event from the cache for seq. num %lli.\n", retryPkt->req->getReqInstSeqNum()); return; } // If the size of the dependency graph is less than the dependency window // then read from the trace file to populate the graph next time we are in // execute. if (depGraph.size() < windowSize && !traceComplete) nextRead = true; // If cache is not blocked, schedule an event for the first execTick in // readyList else retry from cache will schedule the event. If the ready // list is empty then check if the next pending node has resources // available to issue. If yes, then schedule an event for the next cycle. 
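    // Note that schedDcacheNextEvent() only ever moves the pending event
    // earlier (it reschedules when the requested tick precedes the one
    // already scheduled), so requesting a tick here is safe even if an
    // event is already pending.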
    if (!readyList.empty()) {
        Tick next_event_tick = std::max(readyList.begin()->execTick,
                                        curTick());
        DPRINTF(TraceCPUData, "Attempting to schedule @%lli.\n",
                next_event_tick);
        owner.schedDcacheNextEvent(next_event_tick);
    } else if (readyList.empty() && !depFreeQueue.empty() &&
                hwResource.isAvailable(depFreeQueue.front())) {
        DPRINTF(TraceCPUData, "Attempting to schedule @%lli.\n",
                owner.clockEdge(Cycles(1)));
        owner.schedDcacheNextEvent(owner.clockEdge(Cycles(1)));
    }

    // If trace is completely read, readyList is empty and depGraph is empty,
    // set execComplete to true
    if (depGraph.empty() && readyList.empty() && traceComplete &&
        !hwResource.awaitingResponse()) {
        DPRINTF(TraceCPUData, "\tExecution Complete!\n");
        execComplete = true;
        dataLastTick = curTick();
    }
}

PacketPtr
TraceCPU::ElasticDataGen::executeMemReq(GraphNode* node_ptr)
{

    DPRINTF(TraceCPUData, "Executing memory request %lli (phys addr %d, "
            "virt addr %d, pc %#x, size %d, flags %d).\n",
            node_ptr->seqNum, node_ptr->physAddr, node_ptr->virtAddr,
            node_ptr->pc, node_ptr->size, node_ptr->flags);

    // If the request is strictly ordered, do not send it. Just return nullptr
    // as if it was successfully sent.
    if (node_ptr->isStrictlyOrdered()) {
        node_ptr->isLoad() ? ++numSOLoads : ++numSOStores;
        DPRINTF(TraceCPUData, "Skipping strictly ordered request %lli.\n",
                node_ptr->seqNum);
        return nullptr;
    }

    // Check if the request spans two cache lines as this condition triggers
    // an assert fail in the L1 cache. If it does then truncate the size to
    // access only until the end of that line and ignore the remainder. The
    // stat counting this is useful to keep a check on how frequently this
    // happens. If required the code could be revised to mimic splitting such
    // a request into two.
    unsigned blk_size = owner.cacheLineSize();
    Addr blk_offset = (node_ptr->physAddr & (Addr)(blk_size - 1));
    if (!(blk_offset + node_ptr->size <= blk_size)) {
        node_ptr->size = blk_size - blk_offset;
        ++numSplitReqs;
    }

    // Create a request and the packet containing request
    auto req = std::make_shared<Request>(
        node_ptr->physAddr, node_ptr->size, node_ptr->flags, masterID,
        node_ptr->seqNum, ContextID(0));
    req->setPC(node_ptr->pc);
    // If virtual address is valid, set the asid and virtual address fields
    // of the request.
    if (node_ptr->virtAddr != 0) {
        req->setVirt(node_ptr->asid, node_ptr->virtAddr, node_ptr->size,
                     node_ptr->flags, masterID, node_ptr->pc);
        req->setPaddr(node_ptr->physAddr);
        req->setReqInstSeqNum(node_ptr->seqNum);
    }

    PacketPtr pkt;
    uint8_t* pkt_data = new uint8_t[req->getSize()];
    if (node_ptr->isLoad()) {
        pkt = Packet::createRead(req);
    } else {
        pkt = Packet::createWrite(req);
        memset(pkt_data, 0xA, req->getSize());
    }
    pkt->dataDynamic(pkt_data);

    // Call MasterPort method to send a timing request for this packet
    bool success = port.sendTimingReq(pkt);
    ++numSendAttempted;

    if (!success) {
        // If it fails, return the packet to retry when a retry is signalled by
        // the cache
        ++numSendFailed;
        DPRINTF(TraceCPUData, "Send failed. Saving packet for retry.\n");
        return pkt;
    } else {
        // If it succeeds, return nullptr
        ++numSendSucceeded;
        return nullptr;
    }
}

bool
TraceCPU::ElasticDataGen::checkAndIssue(const GraphNode* node_ptr, bool first)
{
    // Assert the node is dependency-free
    assert(node_ptr->numRobDep == 0 && node_ptr->numRegDep == 0);

    // If this is the first attempt, print a debug message to indicate this.
    if (first) {
        DPRINTFR(TraceCPUData, "\t\tseq. 
num %lli(%s) with rob num %lli is now" " dependency free.\n", node_ptr->seqNum, node_ptr->typeToStr(), node_ptr->robNum); } // Check if resources are available to issue the specific node if (hwResource.isAvailable(node_ptr)) { // If resources are free only then add to readyList DPRINTFR(TraceCPUData, "\t\tResources available for seq. num %lli. Adding" " to readyList, occupying resources.\n", node_ptr->seqNum); // Compute the execute tick by adding the compute delay for the node // and add the ready node to the ready list addToSortedReadyList(node_ptr->seqNum, owner.clockEdge() + node_ptr->compDelay); // Account for the resources taken up by this issued node. hwResource.occupy(node_ptr); return true; } else { if (first) { // Although dependencies are complete, resources are not available. DPRINTFR(TraceCPUData, "\t\tResources unavailable for seq. num %lli." " Adding to depFreeQueue.\n", node_ptr->seqNum); depFreeQueue.push(node_ptr); } else { DPRINTFR(TraceCPUData, "\t\tResources unavailable for seq. num %lli. " "Still pending issue.\n", node_ptr->seqNum); } return false; } } void TraceCPU::ElasticDataGen::completeMemAccess(PacketPtr pkt) { // Release the resources for this completed node. if (pkt->isWrite()) { // Consider store complete. hwResource.releaseStoreBuffer(); // If it is a store response then do nothing since we do not model // dependencies on store completion in the trace. But if we were // blocking execution due to store buffer fullness, we need to schedule // an event and attempt to progress. } else { // If it is a load response then release the dependents waiting on it. // Get pointer to the completed load auto graph_itr = depGraph.find(pkt->req->getReqInstSeqNum()); assert(graph_itr != depGraph.end()); GraphNode* node_ptr = graph_itr->second; // Release resources occupied by the load hwResource.release(node_ptr); DPRINTF(TraceCPUData, "Load seq. num %lli response received. Waking up" " dependents..\n", node_ptr->seqNum); for (auto child : node_ptr->dependents) { if (child->removeDepOnInst(node_ptr->seqNum)) { checkAndIssue(child); } } // clear the dynamically allocated set of dependents (node_ptr->dependents).clear(); // Update the stat for numOps completed owner.updateNumOps(node_ptr->robNum); // delete node delete node_ptr; // remove from graph depGraph.erase(graph_itr); } if (DTRACE(TraceCPUData)) { printReadyList(); } // If the size of the dependency graph is less than the dependency window // then read from the trace file to populate the graph next time we are in // execute. if (depGraph.size() < windowSize && !traceComplete) nextRead = true; // If not waiting for retry, attempt to schedule next event if (!retryPkt) { // We might have new dep-free nodes in the list which will have execute // tick greater than or equal to curTick. But a new dep-free node might // have its execute tick earlier. Therefore, attempt to reschedule. It // could happen that the readyList is empty and we got here via a // last remaining response. So, either the trace is complete or there // are pending nodes in the depFreeQueue. The checking is done in the // execute() control flow, so schedule an event to go via that flow. Tick next_event_tick = readyList.empty() ? 
owner.clockEdge(Cycles(1)) : std::max(readyList.begin()->execTick, owner.clockEdge(Cycles(1))); DPRINTF(TraceCPUData, "Attempting to schedule @%lli.\n", next_event_tick); owner.schedDcacheNextEvent(next_event_tick); } } void TraceCPU::ElasticDataGen::addToSortedReadyList(NodeSeqNum seq_num, Tick exec_tick) { ReadyNode ready_node; ready_node.seqNum = seq_num; ready_node.execTick = exec_tick; // Iterator to readyList auto itr = readyList.begin(); // If the readyList is empty, simply insert the new node at the beginning // and return if (itr == readyList.end()) { readyList.insert(itr, ready_node); maxReadyListSize = std::max(readyList.size(), maxReadyListSize.value()); return; } // If the new node has its execution tick equal to the first node in the // list then go to the next node. If the first node in the list failed // to execute, its position as the first is thus maintained. if (retryPkt) if (retryPkt->req->getReqInstSeqNum() == itr->seqNum) itr++; // Increment the iterator and compare the node pointed to by it to the new // node till the position to insert the new node is found. bool found = false; while (!found && itr != readyList.end()) { // If the execution tick of the new node is less than the node then // this is the position to insert if (exec_tick < itr->execTick) found = true; // If the execution tick of the new node is equal to the node then // sort in ascending order of sequence numbers else if (exec_tick == itr->execTick) { // If the sequence number of the new node is less than the node // then this is the position to insert if (seq_num < itr->seqNum) found = true; // Else go to next node else itr++; } // If the execution tick of the new node is greater than the node then // go to the next node else itr++; } readyList.insert(itr, ready_node); // Update the stat for max size reached of the readyList maxReadyListSize = std::max(readyList.size(), maxReadyListSize.value()); } void TraceCPU::ElasticDataGen::printReadyList() { auto itr = readyList.begin(); if (itr == readyList.end()) { DPRINTF(TraceCPUData, "readyList is empty.\n"); return; } DPRINTF(TraceCPUData, "Printing readyList:\n"); while (itr != readyList.end()) { auto graph_itr = depGraph.find(itr->seqNum); GraphNode* node_ptr M5_VAR_USED = graph_itr->second; DPRINTFR(TraceCPUData, "\t%lld(%s), %lld\n", itr->seqNum, node_ptr->typeToStr(), itr->execTick); itr++; } } TraceCPU::ElasticDataGen::HardwareResource::HardwareResource( uint16_t max_rob, uint16_t max_stores, uint16_t max_loads) : sizeROB(max_rob), sizeStoreBuffer(max_stores), sizeLoadBuffer(max_loads), oldestInFlightRobNum(UINT64_MAX), numInFlightLoads(0), numInFlightStores(0) {} void TraceCPU::ElasticDataGen::HardwareResource::occupy(const GraphNode* new_node) { // Occupy ROB entry for the issued node // Merely maintain the oldest node, i.e. numerically least robNum by saving // it in the variable oldestInFLightRobNum. inFlightNodes[new_node->seqNum] = new_node->robNum; oldestInFlightRobNum = inFlightNodes.begin()->second; // Occupy Load/Store Buffer entry for the issued node if applicable if (new_node->isLoad()) { ++numInFlightLoads; } else if (new_node->isStore()) { ++numInFlightStores; } // else if it is a non load/store node, no buffer entry is occupied printOccupancy(); } void TraceCPU::ElasticDataGen::HardwareResource::release(const GraphNode* done_node) { assert(!inFlightNodes.empty()); DPRINTFR(TraceCPUData, "\tClearing done seq. 
num %d from inFlightNodes..\n", done_node->seqNum); assert(inFlightNodes.find(done_node->seqNum) != inFlightNodes.end()); inFlightNodes.erase(done_node->seqNum); if (inFlightNodes.empty()) { // If we delete the only in-flight node and then the // oldestInFlightRobNum is set to it's initialized (max) value. oldestInFlightRobNum = UINT64_MAX; } else { // Set the oldest in-flight node rob number equal to the first node in // the inFlightNodes since that will have the numerically least value. oldestInFlightRobNum = inFlightNodes.begin()->second; } DPRINTFR(TraceCPUData, "\tCleared. inFlightNodes.size() = %d, " "oldestInFlightRobNum = %d\n", inFlightNodes.size(), oldestInFlightRobNum); // A store is considered complete when a request is sent, thus ROB entry is // freed. But it occupies an entry in the Store Buffer until its response // is received. A load is considered complete when a response is received, // thus both ROB and Load Buffer entries can be released. if (done_node->isLoad()) { assert(numInFlightLoads != 0); --numInFlightLoads; } // For normal writes, we send the requests out and clear a store buffer // entry on response. For writes which are strictly ordered, for e.g. // writes to device registers, we do that within release() which is called // when node is executed and taken off from readyList. if (done_node->isStore() && done_node->isStrictlyOrdered()) { releaseStoreBuffer(); } } void TraceCPU::ElasticDataGen::HardwareResource::releaseStoreBuffer() { assert(numInFlightStores != 0); --numInFlightStores; } bool TraceCPU::ElasticDataGen::HardwareResource::isAvailable( const GraphNode* new_node) const { uint16_t num_in_flight_nodes; if (inFlightNodes.empty()) { num_in_flight_nodes = 0; DPRINTFR(TraceCPUData, "\t\tChecking resources to issue seq. num %lli:" " #in-flight nodes = 0", new_node->seqNum); } else if (new_node->robNum > oldestInFlightRobNum) { // This is the intuitive case where new dep-free node is younger // instruction than the oldest instruction in-flight. Thus we make sure // in_flight_nodes does not overflow. num_in_flight_nodes = new_node->robNum - oldestInFlightRobNum; DPRINTFR(TraceCPUData, "\t\tChecking resources to issue seq. num %lli:" " #in-flight nodes = %d - %d = %d", new_node->seqNum, new_node->robNum, oldestInFlightRobNum, num_in_flight_nodes); } else { // This is the case where an instruction older than the oldest in- // flight instruction becomes dep-free. Thus we must have already // accounted for the entry in ROB for this new dep-free node. // Immediately after this check returns true, oldestInFlightRobNum will // be updated in occupy(). We simply let this node issue now. num_in_flight_nodes = 0; DPRINTFR(TraceCPUData, "\t\tChecking resources to issue seq. 
num %lli:" " new oldestInFlightRobNum = %d, #in-flight nodes ignored", new_node->seqNum, new_node->robNum); } DPRINTFR(TraceCPUData, ", LQ = %d/%d, SQ = %d/%d.\n", numInFlightLoads, sizeLoadBuffer, numInFlightStores, sizeStoreBuffer); // Check if resources are available to issue the specific node if (num_in_flight_nodes >= sizeROB) { return false; } if (new_node->isLoad() && numInFlightLoads >= sizeLoadBuffer) { return false; } if (new_node->isStore() && numInFlightStores >= sizeStoreBuffer) { return false; } return true; } bool TraceCPU::ElasticDataGen::HardwareResource::awaitingResponse() const { // Return true if there is at least one read or write request in flight return (numInFlightStores != 0 || numInFlightLoads != 0); } void TraceCPU::ElasticDataGen::HardwareResource::printOccupancy() { DPRINTFR(TraceCPUData, "oldestInFlightRobNum = %d, " "LQ = %d/%d, SQ = %d/%d.\n", oldestInFlightRobNum, numInFlightLoads, sizeLoadBuffer, numInFlightStores, sizeStoreBuffer); } void TraceCPU::FixedRetryGen::regStats() { using namespace Stats; numSendAttempted .name(name() + ".numSendAttempted") .desc("Number of first attempts to send a request") ; numSendSucceeded .name(name() + ".numSendSucceeded") .desc("Number of successful first attempts") ; numSendFailed .name(name() + ".numSendFailed") .desc("Number of failed first attempts") ; numRetrySucceeded .name(name() + ".numRetrySucceeded") .desc("Number of successful retries") ; instLastTick .name(name() + ".instLastTick") .desc("Last tick simulated from the fixed inst trace") ; } Tick TraceCPU::FixedRetryGen::init() { DPRINTF(TraceCPUInst, "Initializing instruction fetch request generator" " IcacheGen: fixed issue with retry.\n"); if (nextExecute()) { DPRINTF(TraceCPUInst, "\tFirst tick = %d.\n", currElement.tick); return currElement.tick; } else { panic("Read of first message in the trace failed.\n"); return MaxTick; } } bool TraceCPU::FixedRetryGen::tryNext() { // If there is a retry packet, try to send it if (retryPkt) { DPRINTF(TraceCPUInst, "Trying to send retry packet.\n"); if (!port.sendTimingReq(retryPkt)) { // Still blocked! This should never occur. DPRINTF(TraceCPUInst, "Retry packet sending failed.\n"); return false; } ++numRetrySucceeded; } else { DPRINTF(TraceCPUInst, "Trying to send packet for currElement.\n"); // try sending current element assert(currElement.isValid()); ++numSendAttempted; if (!send(currElement.addr, currElement.blocksize, currElement.cmd, currElement.flags, currElement.pc)) { DPRINTF(TraceCPUInst, "currElement sending failed.\n"); ++numSendFailed; // return false to indicate not to schedule next event return false; } else { ++numSendSucceeded; } } // If packet was sent successfully, either retryPkt or currElement, return // true to indicate to schedule event at current Tick plus delta. If packet // was sent successfully and there is no next packet to send, return false. DPRINTF(TraceCPUInst, "Packet sent successfully, trying to read next " "element.\n"); retryPkt = nullptr; // Read next element into currElement, currElement gets cleared so save the // tick to calculate delta Tick last_tick = currElement.tick; if (nextExecute()) { assert(currElement.tick >= last_tick); delta = currElement.tick - last_tick; } return !traceComplete; } void TraceCPU::FixedRetryGen::exit() { trace.reset(); } bool TraceCPU::FixedRetryGen::nextExecute() { if (traceComplete) // We are at the end of the file, thus we have no more messages. // Return false. 
        return false;

    //Reset the currElement to the default values
    currElement.clear();

    // Read the next line to get the next message. If that fails then end of
    // trace has been reached and traceComplete needs to be set in addition
    // to returning false. If successful then next message is in currElement.
    if (!trace.read(&currElement)) {
        traceComplete = true;
        instLastTick = curTick();
        return false;
    }

    DPRINTF(TraceCPUInst, "inst fetch: %c addr %d pc %#x size %d tick %d\n",
            currElement.cmd.isRead() ? 'r' : 'w',
            currElement.addr,
            currElement.pc,
            currElement.blocksize,
            currElement.tick);

    return true;
}

bool
TraceCPU::FixedRetryGen::send(Addr addr, unsigned size, const MemCmd& cmd,
                              Request::FlagsType flags, Addr pc)
{

    // Create new request
    auto req = std::make_shared<Request>(addr, size, flags, masterID);
    req->setPC(pc);

    // If this is not done it triggers assert in L1 cache for invalid contextId
    req->setContext(ContextID(0));

    // Embed it in a packet
    PacketPtr pkt = new Packet(req, cmd);

    uint8_t* pkt_data = new uint8_t[req->getSize()];
    pkt->dataDynamic(pkt_data);

    if (cmd.isWrite()) {
        memset(pkt_data, 0xA, req->getSize());
    }

    // Call MasterPort method to send a timing request for this packet
    bool success = port.sendTimingReq(pkt);
    if (!success) {
        // If it fails, save the packet to retry when a retry is signalled by
        // the cache
        retryPkt = pkt;
    }
    return success;
}

void
TraceCPU::icacheRetryRecvd()
{
    // Schedule an event to go through the control flow in the same tick as
    // retry is received
    DPRINTF(TraceCPUInst, "Icache retry received. Scheduling next IcacheGen"
            " event @%lli.\n", curTick());
    schedule(icacheNextEvent, curTick());
}

void
TraceCPU::dcacheRetryRecvd()
{
    // Schedule an event to go through the execute flow in the same tick as
    // retry is received
    DPRINTF(TraceCPUData, "Dcache retry received. Scheduling next DcacheGen"
            " event @%lli.\n", curTick());
    schedule(dcacheNextEvent, curTick());
}

void
TraceCPU::schedDcacheNextEvent(Tick when)
{
    if (!dcacheNextEvent.scheduled()) {
        DPRINTF(TraceCPUData, "Scheduling next DcacheGen event at %lli.\n",
                when);
        schedule(dcacheNextEvent, when);
        ++numSchedDcacheEvent;
    } else if (when < dcacheNextEvent.when()) {
        DPRINTF(TraceCPUData, "Re-scheduling next dcache event from %lli"
                " to %lli.\n", dcacheNextEvent.when(), when);
        reschedule(dcacheNextEvent, when);
    }
}

bool
TraceCPU::IcachePort::recvTimingResp(PacketPtr pkt)
{
    // All responses on the instruction fetch side are ignored. Simply delete
    // the packet to free allocated memory
    delete pkt;

    return true;
}
void
TraceCPU::IcachePort::recvReqRetry()
{
    owner->icacheRetryRecvd();
}

void
TraceCPU::dcacheRecvTimingResp(PacketPtr pkt)
{
    DPRINTF(TraceCPUData, "Received timing response from Dcache.\n");
    dcacheGen.completeMemAccess(pkt);
}

bool
TraceCPU::DcachePort::recvTimingResp(PacketPtr pkt)
{
    // Handle the responses for data memory requests which is done inside the
    // elastic data generator
    owner->dcacheRecvTimingResp(pkt);
    // After processing the response delete the packet to free
    // memory
    delete pkt;

    return true;
}

void
TraceCPU::DcachePort::recvReqRetry()
{
    owner->dcacheRetryRecvd();
}

TraceCPU::ElasticDataGen::InputStream::InputStream(
    const std::string& filename,
    const double time_multiplier)
    : trace(filename),
      timeMultiplier(time_multiplier),
      microOpCount(0)
{
    // Create a protobuf message for the header and read it from the stream
    ProtoMessage::InstDepRecordHeader header_msg;
    if (!trace.read(header_msg)) {
        panic("Failed to read packet header from %s\n", filename);
    } else if (header_msg.tick_freq() != SimClock::Frequency) {
        panic("Trace %s was recorded with a different tick frequency %d\n",
              filename, header_msg.tick_freq());
    } else {
        // Assign window size equal to the field in the trace that was recorded
        // when the data dependency trace was captured in the o3cpu model
        windowSize = header_msg.window_size();
    }
}

void
TraceCPU::ElasticDataGen::InputStream::reset()
{
    trace.reset();
}

bool
TraceCPU::ElasticDataGen::InputStream::read(GraphNode* element)
{
    ProtoMessage::InstDepRecord pkt_msg;
    if (trace.read(pkt_msg)) {

        // Required fields
        element->seqNum = pkt_msg.seq_num();
        element->type = pkt_msg.type();
        // Scale the compute delay to effectively scale the Trace CPU frequency
        element->compDelay = pkt_msg.comp_delay() * timeMultiplier;

        // Repeated field robDepList
        element->clearRobDep();
        assert((pkt_msg.rob_dep()).size() <= element->maxRobDep);
        for (int i = 0; i < (pkt_msg.rob_dep()).size(); i++) {
            element->robDep[element->numRobDep] = pkt_msg.rob_dep(i);
            element->numRobDep += 1;
        }

        // Repeated field
        element->clearRegDep();
        assert((pkt_msg.reg_dep()).size() <= TheISA::MaxInstSrcRegs);
        for (int i = 0; i < (pkt_msg.reg_dep()).size(); i++) {
            // There is a possibility that an instruction has both, a register
            // and order dependency on an instruction. 
In such a case, the // register dependency is omitted bool duplicate = false; for (int j = 0; j < element->numRobDep; j++) { duplicate |= (pkt_msg.reg_dep(i) == element->robDep[j]); } if (!duplicate) { element->regDep[element->numRegDep] = pkt_msg.reg_dep(i); element->numRegDep += 1; } } // Optional fields if (pkt_msg.has_p_addr()) element->physAddr = pkt_msg.p_addr(); else element->physAddr = 0; if (pkt_msg.has_v_addr()) element->virtAddr = pkt_msg.v_addr(); else element->virtAddr = 0; if (pkt_msg.has_asid()) element->asid = pkt_msg.asid(); else element->asid = 0; if (pkt_msg.has_size()) element->size = pkt_msg.size(); else element->size = 0; if (pkt_msg.has_flags()) element->flags = pkt_msg.flags(); else element->flags = 0; if (pkt_msg.has_pc()) element->pc = pkt_msg.pc(); else element->pc = 0; // ROB occupancy number ++microOpCount; if (pkt_msg.has_weight()) { microOpCount += pkt_msg.weight(); } element->robNum = microOpCount; return true; } // We have reached the end of the file return false; } bool TraceCPU::ElasticDataGen::GraphNode::removeRegDep(NodeSeqNum reg_dep) { for (auto& own_reg_dep : regDep) { if (own_reg_dep == reg_dep) { // If register dependency is found, make it zero and return true own_reg_dep = 0; assert(numRegDep > 0); --numRegDep; DPRINTFR(TraceCPUData, "\tFor %lli: Marking register dependency %lli " "done.\n", seqNum, reg_dep); return true; } } // Return false if the dependency is not found return false; } bool TraceCPU::ElasticDataGen::GraphNode::removeRobDep(NodeSeqNum rob_dep) { for (auto& own_rob_dep : robDep) { if (own_rob_dep == rob_dep) { // If the rob dependency is found, make it zero and return true own_rob_dep = 0; assert(numRobDep > 0); --numRobDep; DPRINTFR(TraceCPUData, "\tFor %lli: Marking ROB dependency %lli " "done.\n", seqNum, rob_dep); return true; } } return false; } void TraceCPU::ElasticDataGen::GraphNode::clearRegDep() { for (auto& own_reg_dep : regDep) { own_reg_dep = 0; } numRegDep = 0; } void TraceCPU::ElasticDataGen::GraphNode::clearRobDep() { for (auto& own_rob_dep : robDep) { own_rob_dep = 0; } numRobDep = 0; } bool TraceCPU::ElasticDataGen::GraphNode::removeDepOnInst(NodeSeqNum done_seq_num) { // If it is an rob dependency then remove it if (!removeRobDep(done_seq_num)) { // If it is not an rob dependency then it must be a register dependency // If the register dependency is not found, it violates an assumption // and must be caught by assert. 
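        // (M5_VAR_USED marks regdep_found as used so that builds which
        // compile out the assert do not warn about an unused variable.)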
bool regdep_found M5_VAR_USED = removeRegDep(done_seq_num); assert(regdep_found); } // Return true if the node is dependency free return (numRobDep == 0 && numRegDep == 0); } void TraceCPU::ElasticDataGen::GraphNode::writeElementAsTrace() const { DPRINTFR(TraceCPUData, "%lli", seqNum); DPRINTFR(TraceCPUData, ",%s", typeToStr()); if (isLoad() || isStore()) { DPRINTFR(TraceCPUData, ",%i", physAddr); DPRINTFR(TraceCPUData, ",%i", size); DPRINTFR(TraceCPUData, ",%i", flags); } DPRINTFR(TraceCPUData, ",%lli", compDelay); int i = 0; DPRINTFR(TraceCPUData, "robDep:"); while (robDep[i] != 0) { DPRINTFR(TraceCPUData, ",%lli", robDep[i]); i++; } i = 0; DPRINTFR(TraceCPUData, "regDep:"); while (regDep[i] != 0) { DPRINTFR(TraceCPUData, ",%lli", regDep[i]); i++; } auto child_itr = dependents.begin(); DPRINTFR(TraceCPUData, "dependents:"); while (child_itr != dependents.end()) { DPRINTFR(TraceCPUData, ":%lli", (*child_itr)->seqNum); child_itr++; } DPRINTFR(TraceCPUData, "\n"); } std::string TraceCPU::ElasticDataGen::GraphNode::typeToStr() const { return Record::RecordType_Name(type); } TraceCPU::FixedRetryGen::InputStream::InputStream(const std::string& filename) : trace(filename) { // Create a protobuf message for the header and read it from the stream ProtoMessage::PacketHeader header_msg; if (!trace.read(header_msg)) { panic("Failed to read packet header from %s\n", filename); if (header_msg.tick_freq() != SimClock::Frequency) { panic("Trace %s was recorded with a different tick frequency %d\n", header_msg.tick_freq()); } } } void TraceCPU::FixedRetryGen::InputStream::reset() { trace.reset(); } bool TraceCPU::FixedRetryGen::InputStream::read(TraceElement* element) { ProtoMessage::Packet pkt_msg; if (trace.read(pkt_msg)) { element->cmd = pkt_msg.cmd(); element->addr = pkt_msg.addr(); element->blocksize = pkt_msg.size(); element->tick = pkt_msg.tick(); element->flags = pkt_msg.has_flags() ? pkt_msg.flags() : 0; element->pc = pkt_msg.has_pc() ? pkt_msg.pc() : 0; return true; } // We have reached the end of the file return false; }