diff options
Diffstat (limited to 'src/cpu/trace/trace_cpu.hh')
-rw-r--r-- | src/cpu/trace/trace_cpu.hh | 1101 |
1 files changed, 1101 insertions, 0 deletions
/*
 * Copyright (c) 2013 - 2015 ARM Limited
 * All rights reserved
 *
 * The license below extends only to copyright in the software and shall
 * not be construed as granting a license to any other intellectual
 * property including but not limited to intellectual property relating
 * to a hardware implementation of the functionality of the software
 * licensed hereunder. You may use the software subject to the license
 * terms below provided that you ensure that this notice is replicated
 * unmodified and in its entirety in all distributions of the software,
 * modified or unmodified, in source code or in binary form.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Radhika Jagtap
 *          Andreas Hansson
 *          Thomas Grass
 */

#ifndef __CPU_TRACE_TRACE_CPU_HH__
#define __CPU_TRACE_TRACE_CPU_HH__

#include <array>
#include <cstdint>
#include <queue>
#include <set>
#include <unordered_map>

#include "arch/registers.hh"
#include "base/statistics.hh"
#include "cpu/base.hh"
#include "debug/TraceCPUData.hh"
#include "debug/TraceCPUInst.hh"
#include "params/TraceCPU.hh"
#include "proto/inst_dep_record.pb.h"
#include "proto/packet.pb.h"
#include "proto/protoio.hh"
#include "sim/sim_events.hh"

/**
 * The trace cpu replays traces generated using the elastic trace probe
 * attached to the O3 CPU model. The elastic trace is an execution trace with
 * register data dependencies and ordering dependencies annotated to it. The
 * trace cpu also replays a fixed timestamp fetch trace that is also generated
 * by the elastic trace probe. This trace cpu model aims at achieving faster
 * simulation compared to the detailed cpu model and good correlation when the
 * same trace is used for playback on different memory sub-systems.
 *
 * The TraceCPU inherits from BaseCPU so some virtual methods need to be
 * defined. It has two port subclasses inherited from MasterPort for
 * instruction and data ports. It issues the memory requests deducing the
 * timing from the trace and without performing real execution of micro-ops.
 * As soon as the last dependency for an instruction is complete, its
 * computational delay, also provided in the input trace is added. The
 * dependency-free nodes are maintained in a list, called 'ReadyList', ordered
 * by ready time. Instructions which depend on load stall until the responses
 * for read requests are received thus achieving elastic replay. If the
 * dependency is not found when adding a new node, it is assumed complete.
 * Thus, if this node is found to be completely dependency-free its issue time
 * is calculated and it is added to the ready list immediately. This is
 * encapsulated in the subclass ElasticDataGen.
 *
 * If ready nodes are issued in an unconstrained way there can be more nodes
 * outstanding which results in divergence in timing compared to the O3CPU.
 * Therefore, the Trace CPU also models hardware resources. A sub-class to
 * model hardware resources contains the maximum sizes of load buffer, store
 * buffer and ROB. If resources are not available, the node is not issued. Such
 * nodes that are pending issue are held in the 'depFreeQueue' structure.
 *
 * Modeling the ROB size in the Trace CPU as a resource limitation is arguably
 * the most important parameter of all resources. The ROB occupancy is
 * estimated using the newly added field 'robNum'. We need to use ROB number as
 * sequence number is at times much higher due to squashing and trace replay is
 * focused on correct path modeling.
 *
 * A map called 'inFlightNodes' is added to track nodes that are not only in
 * the readyList but also load nodes that are executed (and thus removed from
 * readyList) but are not complete. ReadyList handles what and when to execute
 * next node while the inFlightNodes is used for resource modelling. The oldest
 * ROB number is updated when any node occupies the ROB or when an entry in the
 * ROB is released. The ROB occupancy is equal to the difference in the ROB
 * number of the newly dependency-free node and the oldest ROB number in
 * flight.
 *
 * If no node depends on a non load/store node then there is no reason to
 * track it in the dependency graph. We filter out such nodes but count them
 * and add a weight field to the subsequent node that we do include in the
 * trace. The weight field is used to model ROB occupancy during replay.
 *
 * The depFreeQueue is chosen to be FIFO so that child nodes which are in
 * program order get pushed into it in that order and thus issued in program
 * order, like in the O3CPU. This is also why the dependents is made a
 * sequential container, std::set to std::vector. We only check head of the
 * depFreeQueue as nodes are issued in order and blocking on head models that
 * better than looping the entire queue. An alternative choice would be to
 * inspect top N pending nodes where N is the issue-width. This is left for
 * future as the timing correlation looks good as it is.
 *
 * At the start of an execution event, first we attempt to issue such pending
 * nodes by checking if appropriate resources have become available. If yes, we
 * compute the execute tick with respect to the time then. Then we proceed to
 * complete nodes from the readyList.
 *
 * When a read response is received, sometimes a dependency on it that was
 * supposed to be released when it was issued is still not released. This
 * occurs because the dependent gets added to the graph after the read was
 * sent. So the check is made less strict and the dependency is marked complete
 * on read response instead of insisting that it should have been removed on
 * read sent.
 *
 * There is a check for requests spanning two cache lines as this condition
 * triggers an assert fail in the L1 cache. If it does then truncate the size
 * to access only until the end of that line and ignore the remainder.
 * Strictly-ordered requests are skipped and the dependencies on such requests
 * are handled by simply marking them complete immediately.
 *
 * The simulated seconds can be calculated as the difference between the
 * final_tick stat and the tickOffset stat. A CountedExitEvent that contains a
 * static int belonging to the Trace CPU class as a down counter is used to
 * implement multi Trace CPU simulation exit.
 */

class TraceCPU : public BaseCPU
{

  public:
    TraceCPU(TraceCPUParams *params);
    ~TraceCPU();

    void init();

    /**
     * This is a pure virtual function in BaseCPU. As we don't know how many
     * insts are in the trace but only know how many micro-ops are we
     * cannot count this stat.
     *
     * @return 0
     */
    Counter totalInsts() const
    {
        return 0;
    }

    /**
     * Return totalOps as the number of committed micro-ops plus the
     * speculatively issued loads that are modelled in the TraceCPU replay.
     *
     * @return number of micro-ops i.e. nodes in the elastic data generator
     */
    Counter totalOps() const
    {
        return dcacheGen.getMicroOpCount();
    }

    /* Pure virtual function in BaseCPU. Do nothing. */
    void wakeup(ThreadID tid = 0)
    {
        return;
    }

    /*
     * When resuming from checkpoint in FS mode, the TraceCPU takes over from
     * the old cpu. This function overrides the takeOverFrom() function in the
     * BaseCPU. It unbinds the ports of the old CPU and binds the ports of the
     * TraceCPU.
     */
    void takeOverFrom(BaseCPU *oldCPU);

    /**
     * When instruction cache port receives a retry, schedule event
     * icacheNextEvent.
     */
    void icacheRetryRecvd();

    /**
     * When data cache port receives a retry, schedule event
     * dcacheNextEvent.
     */
    void dcacheRetryRecvd();

    /**
     * When data cache port receives a response, this calls the dcache
     * generator method handle to complete the load writeback.
     *
     * @param pkt Pointer to packet received
     */
    void dcacheRecvTimingResp(PacketPtr pkt);

    /**
     * Schedule event dcacheNextEvent at the given tick
     *
     * @param when Tick at which to schedule event
     */
    void schedDcacheNextEvent(Tick when);

  protected:

    /**
     * IcachePort class that interfaces with L1 Instruction Cache.
     */
    class IcachePort : public MasterPort
    {
      public:
        /** Default constructor. */
        IcachePort(TraceCPU* _cpu)
            : MasterPort(_cpu->name() + ".icache_port", _cpu),
              owner(_cpu)
        { }

      public:
        /**
         * Receive the timing response and simply delete the packet since
         * instruction fetch requests are issued as per the timing in the trace
         * and responses are ignored.
         *
         * @param pkt Pointer to packet received
         * @return true
         */
        bool recvTimingResp(PacketPtr pkt);

        /**
         * Required functionally but do nothing.
         *
         * @param pkt Pointer to packet received
         */
        void recvTimingSnoopReq(PacketPtr pkt) { }

        /**
         * Handle a retry signalled by the cache if instruction read failed in
         * the first attempt.
         */
        void recvReqRetry();

      private:
        TraceCPU* owner;
    };

    /**
     * DcachePort class that interfaces with L1 Data Cache.
     */
    class DcachePort : public MasterPort
    {

      public:
        /** Default constructor. */
        DcachePort(TraceCPU* _cpu)
            : MasterPort(_cpu->name() + ".dcache_port", _cpu),
              owner(_cpu)
        { }

      public:

        /**
         * Receive the timing response and call dcacheRecvTimingResp() method
         * of the dcacheGen to handle completing the load
         *
         * @param pkt Pointer to packet received
         * @return true
         */
        bool recvTimingResp(PacketPtr pkt);

        /**
         * Required functionally but do nothing.
         *
         * @param pkt Pointer to packet received
         */
        void recvTimingSnoopReq(PacketPtr pkt)
        { }

        /**
         * Required functionally but do nothing.
         *
         * @param pkt Pointer to packet received
         */
        void recvFunctionalSnoop(PacketPtr pkt)
        { }

        /**
         * Handle a retry signalled by the cache if data access failed in the
         * first attempt.
         */
        void recvReqRetry();

        /**
         * Required functionally.
         *
         * @return true since we have to snoop
         */
        bool isSnooping() const { return true; }

      private:
        TraceCPU* owner;
    };

    /** Port to connect to L1 instruction cache. */
    IcachePort icachePort;

    /** Port to connect to L1 data cache. */
    DcachePort dcachePort;

    /** Master id for instruction read requests. */
    const MasterID instMasterID;

    /** Master id for data read and write requests. */
    const MasterID dataMasterID;

    /** File names for input instruction and data traces. */
    std::string instTraceFile, dataTraceFile;

    /**
     * Generator to read protobuf trace containing memory requests at fixed
     * timestamps, perform flow control and issue memory requests. If L1 cache
     * port sends packet successfully, determine the tick to send the next
     * packet else wait for retry from cache.
     */
    class FixedRetryGen
    {

      private:

        /**
         * This struct stores a line in the trace file.
         */
        struct TraceElement {

            /** Specifies if the request is to be a read or a write */
            MemCmd cmd;

            /** The address for the request */
            Addr addr;

            /** The size of the access for the request */
            Addr blocksize;

            /** The time at which the request should be sent */
            Tick tick;

            /** Potential request flags to use */
            Request::FlagsType flags;

            /** Instruction PC */
            Addr pc;

            /**
             * Check validity of this element.
             *
             * @return if this element is valid
             */
            bool isValid() const {
                return cmd != MemCmd::InvalidCmd;
            }

            /**
             * Make this element invalid.
             */
            void clear() {
                cmd = MemCmd::InvalidCmd;
            }
        };

        /**
         * The InputStream encapsulates a trace file and the
         * internal buffers and populates TraceElements based on
         * the input.
         */
        class InputStream
        {

          private:

            // Input file stream for the protobuf trace
            ProtoInputStream trace;

          public:

            /**
             * Create a trace input stream for a given file name.
             *
             * @param filename Path to the file to read from
             */
            InputStream(const std::string& filename);

            /**
             * Reset the stream such that it can be played once
             * again.
             */
            void reset();

            /**
             * Attempt to read a trace element from the stream,
             * and also notify the caller if the end of the file
             * was reached.
             *
             * @param element Trace element to populate
             * @return True if an element could be read successfully
             */
            bool read(TraceElement* element);
        };

      public:
        /* Constructor */
        FixedRetryGen(TraceCPU& _owner, const std::string& _name,
                      MasterPort& _port, MasterID master_id,
                      const std::string& trace_file)
            : owner(_owner),
              port(_port),
              masterID(master_id),
              trace(trace_file),
              genName(owner.name() + ".fixedretry" + _name),
              retryPkt(nullptr),
              delta(0),
              traceComplete(false)
        {
        }

        /**
         * Called from TraceCPU init(). Reads the first message from the
         * input trace file and returns the send tick.
         *
         * @return Tick when first packet must be sent
         */
        Tick init();

        /**
         * This tries to send current or retry packet and returns true if
         * successfull. It calls nextExecute() to read next message.
         *
         * @return bool true if packet is sent successfully
         */
        bool tryNext();

        /** Returns name of the FixedRetryGen instance. */
        const std::string& name() const { return genName; }

        /**
         * Creates a new request assigning the request parameters passed by the
         * arguments. Calls the port's sendTimingReq() and returns true if
         * the packet was sent successfully. It is called by tryNext()
         *
         * @param addr address of request
         * @param size size of request
         * @param cmd if it is a read or write request
         * @param flags associated request flags
         * @param pc instruction PC that generated the request
         *
         * @return true if packet was sent successfully
         */
        bool send(Addr addr, unsigned size, const MemCmd& cmd,
                  Request::FlagsType flags, Addr pc);

        /** Exit the FixedRetryGen. */
        void exit();

        /**
         * Reads a line of the trace file. Returns the tick
         * when the next request should be generated. If the end
         * of the file has been reached, it returns false.
         *
         * @return bool false if end of file has been reached
         */
        bool nextExecute();

        /**
         * Returns the traceComplete variable which is set when end of the
         * input trace file is reached.
         *
         * @return bool true if traceComplete is set, false otherwise.
         */
        bool isTraceComplete() { return traceComplete; }

        int64_t tickDelta() { return delta; }

        void regStats();

      private:

        /** Reference of the TraceCPU. */
        TraceCPU& owner;

        /** Reference of the port to be used to issue memory requests. */
        MasterPort& port;

        /** MasterID used for the requests being sent. */
        const MasterID masterID;

        /** Input stream used for reading the input trace file. */
        InputStream trace;

        /** String to store the name of the FixedRetryGen. */
        std::string genName;

        /** PacketPtr used to store the packet to retry. */
        PacketPtr retryPkt;

        /**
         * Stores the difference in the send ticks of the current and last
         * packets. Keeping this signed to check overflow to a negative value
         * which will be caught by assert(delta > 0)
         */
        int64_t delta;

        /**
         * Set to true when end of trace is reached.
         */
        bool traceComplete;

        /** Store an element read from the trace to send as the next packet. */
        TraceElement currElement;

        /** Stats for instruction accesses replayed. */
        Stats::Scalar numSendAttempted;
        Stats::Scalar numSendSucceeded;
        Stats::Scalar numSendFailed;
        Stats::Scalar numRetrySucceeded;
        /** Last simulated tick by the FixedRetryGen */
        Stats::Scalar instLastTick;

    };

    /**
     * The elastic data memory request generator to read protobuf trace
     * containing execution trace annotated with data and ordering
     * dependencies. It deduces the time at which to send a load/store request
     * by tracking the dependencies. It attempts to send a memory request for a
     * load/store without performing real execution of micro-ops. If L1 cache
     * port sends packet successfully, the generator checks which instructions
     * became dependency free as a result of this and schedules an event
     * accordingly. If it fails to send the packet, it waits for a retry from
     * the cache.
     */
    class ElasticDataGen
    {

      private:

        /** Node sequence number type. */
        typedef uint64_t NodeSeqNum;

        /** Node ROB number type. */
        typedef uint64_t NodeRobNum;

        /**
         * The struct GraphNode stores an instruction in the trace file. The
         * format of the trace file favours constructing a dependency graph of
         * the execution and this struct is used to encapsulate the request
         * data as well as pointers to its dependent GraphNodes.
         */
        class GraphNode {

          public:
            /**
             * The maximum no. of ROB dependencies. There can be at most 2
             * order dependencies which could exist for a store. For a load
             * and comp node there can be at most one order dependency.
             */
            static const uint8_t maxRobDep = 2;

            /** Typedef for the array containing the ROB dependencies */
            typedef std::array<NodeSeqNum, maxRobDep> RobDepArray;

            /** Typedef for the array containing the register dependencies */
            typedef std::array<NodeSeqNum, TheISA::MaxInstSrcRegs> RegDepArray;

            /** Instruction sequence number */
            NodeSeqNum seqNum;

            /** ROB occupancy number */
            NodeRobNum robNum;

            /** If instruction is a load */
            bool isLoad;

            /** If instruction is a store */
            bool isStore;

            /** The address for the request if any */
            Addr addr;

            /** Size of request if any */
            uint32_t size;

            /** Request flags if any */
            Request::Flags flags;

            /** Instruction PC */
            Addr pc;

            /** Array of order dependencies. */
            RobDepArray robDep;

            /** Number of order dependencies */
            uint8_t numRobDep;

            /** Computational delay */
            uint64_t compDelay;

            /**
             * Array of register dependencies (incoming) if any. Maximum number
             * of source registers used to set maximum size of the array
             */
            RegDepArray regDep;

            /** Number of register dependencies */
            uint8_t numRegDep;

            /**
             * A vector of nodes dependent (outgoing) on this node. A
             * sequential container is chosen because when dependents become
             * free, they attempt to issue in program order.
             */
            std::vector<GraphNode *> dependents;

            /** Initialize register dependency array to all zeroes */
            void clearRegDep();

            /** Initialize register dependency array to all zeroes */
            void clearRobDep();

            /** Remove completed instruction from register dependency array */
            bool removeRegDep(NodeSeqNum reg_dep);

            /** Remove completed instruction from order dependency array */
            bool removeRobDep(NodeSeqNum rob_dep);

            /** Check for all dependencies on completed inst */
            bool removeDepOnInst(NodeSeqNum done_seq_num);

            /** Return true if node has a request which is strictly ordered */
            bool isStrictlyOrdered() const {
                return (flags.isSet(Request::STRICT_ORDER));
            }
            /**
             * Write out element in trace-compatible format using debug flag
             * TraceCPUData.
             */
            void writeElementAsTrace() const;
        };

        /** Struct to store a ready-to-execute node and its execution tick. */
        struct ReadyNode
        {
            /** The sequence number of the ready node */
            NodeSeqNum seqNum;

            /** The tick at which the ready node must be executed */
            Tick execTick;
        };

        /**
         * The HardwareResource class models structures that hold the in-flight
         * nodes. When a node becomes dependency free, first check if resources
         * are available to issue it.
         */
        class HardwareResource
        {
          public:
            /**
             * Constructor that initializes the sizes of the structures.
             *
             * @param max_rob size of the Reorder Buffer
             * @param max_stores size of Store Buffer
             * @param max_loads size of Load Buffer
             */
            HardwareResource(uint16_t max_rob, uint16_t max_stores,
                             uint16_t max_loads);

            /**
             * Occupy appropriate structures for an issued node.
             *
             * @param node_ptr pointer to the issued node
             */
            void occupy(const GraphNode* new_node);

            /**
             * Release appropriate structures for a completed node.
             *
             * @param node_ptr pointer to the completed node
             */
            void release(const GraphNode* done_node);

            /** Release store buffer entry for a completed store */
            void releaseStoreBuffer();

            /**
             * Check if structures required to issue a node are free.
             *
             * @param node_ptr pointer to the node ready to issue
             * @return true if resources are available
             */
            bool isAvailable(const GraphNode* new_node) const;

            /**
             * Check if there are any outstanding requests, i.e. requests for
             * which we are yet to receive a response.
             *
             * @return true if there is at least one read or write request
             *      outstanding
             */
            bool awaitingResponse() const;

            /** Print resource occupancy for debugging */
            void printOccupancy();

          private:
            /**
             * The size of the ROB used to throttle the max. number of in-flight
             * nodes.
             */
            const uint16_t sizeROB;

            /**
             * The size of store buffer. This is used to throttle the max. number
             * of in-flight stores.
             */
            const uint16_t sizeStoreBuffer;

            /**
             * The size of load buffer. This is used to throttle the max. number
             * of in-flight loads.
             */
            const uint16_t sizeLoadBuffer;

            /**
             * A map from the sequence number to the ROB number of the in-
             * flight nodes. This includes all nodes that are in the readyList
             * plus the loads for which a request has been sent which are not
             * present in the readyList. But such loads are not yet complete
             * and thus occupy resources. We need to query the oldest in-flight
             * node and since a map container keeps all its keys sorted using
             * the less than criterion, the first element is the in-flight node
             * with the least sequence number, i.e. the oldest in-flight node.
             */
            std::map<NodeSeqNum, NodeRobNum> inFlightNodes;

            /** The ROB number of the oldest in-flight node */
            NodeRobNum oldestInFlightRobNum;

            /** Number of ready loads for which request may or may not be sent */
            uint16_t numInFlightLoads;

            /** Number of ready stores for which request may or may not be sent */
            uint16_t numInFlightStores;
        };

        /**
         * The InputStream encapsulates a trace file and the
         * internal buffers and populates GraphNodes based on
         * the input.
         */
        class InputStream
        {

          private:

            /** Input file stream for the protobuf trace */
            ProtoInputStream trace;

            /** Count of committed ops read from trace plus the filtered ops */
            uint64_t microOpCount;

            /**
             * The window size that is read from the header of the protobuf
             * trace and used to process the dependency trace
             */
            uint32_t windowSize;
          public:

            /**
             * Create a trace input stream for a given file name.
             *
             * @param filename Path to the file to read from
             */
            InputStream(const std::string& filename);

            /**
             * Reset the stream such that it can be played once
             * again.
             */
            void reset();

            /**
             * Attempt to read a trace element from the stream,
             * and also notify the caller if the end of the file
             * was reached.
             *
             * @param element Trace element to populate
             * @return True if an element could be read successfully
             */
            bool read(GraphNode* element);

            /** Get window size from trace */
            uint32_t getWindowSize() const { return windowSize; }

            /** Get number of micro-ops modelled in the TraceCPU replay */
            uint64_t getMicroOpCount() const { return microOpCount; }
        };

      public:
        /* Constructor */
        ElasticDataGen(TraceCPU& _owner, const std::string& _name,
                       MasterPort& _port, MasterID master_id,
                       const std::string& trace_file, uint16_t max_rob,
                       uint16_t max_stores, uint16_t max_loads)
            : owner(_owner),
              port(_port),
              masterID(master_id),
              trace(trace_file),
              genName(owner.name() + ".elastic" + _name),
              retryPkt(nullptr),
              traceComplete(false),
              nextRead(false),
              execComplete(false),
              windowSize(trace.getWindowSize()),
              hwResource(max_rob, max_stores, max_loads)
        {
            DPRINTF(TraceCPUData, "Window size in the trace is %d.\n",
                    windowSize);
        }

        /**
         * Called from TraceCPU init(). Reads the first message from the
         * input trace file and returns the send tick.
         *
         * @return Tick when first packet must be sent
         */
        Tick init();

        /** Returns name of the ElasticDataGen instance. */
        const std::string& name() const { return genName; }

        /** Exit the ElasticDataGen. */
        void exit();

        /**
         * Reads a line of the trace file. Returns the tick when the next
         * request should be generated. If the end of the file has been
         * reached, it returns false.
         *
         * @return bool false if end of file has been reached else true
         */
        bool readNextWindow();

        /**
         * Iterate over the dependencies of a new node and add the new node
         * to the list of dependents of the parent node.
         *
         * @param new_node new node to add to the graph
         * @param dep_array the dependency array of type rob or register,
         *      that is to be iterated, and may get modified
         * @param num_dep the number of dependencies set in the array
         *      which may get modified during iteration
         */
        template<typename T> void addDepsOnParent(GraphNode *new_node,
                                                  T& dep_array,
                                                  uint8_t& num_dep);

        /**
         * This is the main execute function which consumes nodes from the
         * sorted readyList. First attempt to issue the pending dependency-free
         * nodes held in the depFreeQueue. Insert the ready-to-issue nodes into
         * the readyList. Then iterate through the readyList and when a node
         * has its execute tick equal to curTick(), execute it. If the node is
         * a load or a store call executeMemReq() and if it is neither, simply
         * mark it complete.
         */
        void execute();

        /**
         * Creates a new request for a load or store assigning the request
         * parameters. Calls the port's sendTimingReq() and returns a packet
         * if the send failed so that it can be saved for a retry.
         *
         * @param node_ptr pointer to the load or store node to be executed
         *
         * @return packet pointer if the request failed and nullptr if it was
         *      sent successfully
         */
        PacketPtr executeMemReq(GraphNode* node_ptr);

        /**
         * Add a ready node to the readyList. When inserting, ensure the nodes
         * are sorted in ascending order of their execute ticks.
         *
         * @param seq_num seq. num of ready node
         * @param exec_tick the execute tick of the ready node
         */
        void addToSortedReadyList(NodeSeqNum seq_num, Tick exec_tick);

        /** Print readyList for debugging using debug flag TraceCPUData. */
        void printReadyList();

        /**
         * When a load writeback is received, that is when the load completes,
         * release the dependents on it. This is called from the dcache port
         * recvTimingResp().
         */
        void completeMemAccess(PacketPtr pkt);

        /**
         * Returns the execComplete variable which is set when the last
         * node is executed.
         *
         * @return bool true if execComplete is set, false otherwise.
         */
        bool isExecComplete() const { return execComplete; }

        /**
         * Attempts to issue a node once the node's source dependencies are
         * complete. If resources are available then add it to the readyList,
         * otherwise the node is not issued and is stored in depFreeQueue
         * until resources become available.
         *
         * @param node_ptr pointer to node to be issued
         * @param first true if this is the first attempt to issue this node
         * @return true if node was added to readyList
         */
        bool checkAndIssue(const GraphNode* node_ptr, bool first = true);

        /** Get number of micro-ops modelled in the TraceCPU replay */
        uint64_t getMicroOpCount() const { return trace.getMicroOpCount(); }

        void regStats();

      private:

        /** Reference of the TraceCPU. */
        TraceCPU& owner;

        /** Reference of the port to be used to issue memory requests. */
        MasterPort& port;

        /** MasterID used for the requests being sent. */
        const MasterID masterID;

        /** Input stream used for reading the input trace file. */
        InputStream trace;

        /** String to store the name of the FixedRetryGen. */
        std::string genName;

        /** PacketPtr used to store the packet to retry. */
        PacketPtr retryPkt;

        /** Set to true when end of trace is reached. */
        bool traceComplete;

        /** Set to true when the next window of instructions need to be read */
        bool nextRead;

        /** Set true when execution of trace is complete */
        bool execComplete;

        /**
         * Window size within which to check for dependencies. Its value is
         * made equal to the window size used to generate the trace which is
         * recorded in the trace header. The dependency graph must be
         * populated enough such that when a node completes, its potential
         * child node must be found and the dependency removed before the
         * completed node itself is removed. Thus as soon as the graph shrinks
         * to become smaller than this window, we read in the next window.
         */
        const uint32_t windowSize;

        /**
         * Hardware resources required to contain in-flight nodes and to
         * throttle issuing of new nodes when resources are not available.
         */
        HardwareResource hwResource;

        /** Store the depGraph of GraphNodes */
        std::unordered_map<NodeSeqNum, GraphNode*> depGraph;

        /**
         * Queue of dependency-free nodes that are pending issue because
         * resources are not available. This is chosen to be FIFO so that
         * dependent nodes which become free in program order get pushed
         * into the queue in that order. Thus nodes are more likely to
         * issue in program order.
         */
        std::queue<const GraphNode*> depFreeQueue;

        /** List of nodes that are ready to execute */
        std::list<ReadyNode> readyList;

        /** Stats for data memory accesses replayed. */
        Stats::Scalar maxDependents;
        Stats::Scalar maxReadyListSize;
        Stats::Scalar numSendAttempted;
        Stats::Scalar numSendSucceeded;
        Stats::Scalar numSendFailed;
        Stats::Scalar numRetrySucceeded;
        Stats::Scalar numSplitReqs;
        Stats::Scalar numSOLoads;
        Stats::Scalar numSOStores;
        /** Tick when ElasticDataGen completes execution */
        Stats::Scalar dataLastTick;
    };

    /** Instance of FixedRetryGen to replay instruction read requests. */
    FixedRetryGen icacheGen;

    /** Instance of ElasticDataGen to replay data read and write requests. */
    ElasticDataGen dcacheGen;

    /**
     * This is the control flow that uses the functionality of the icacheGen to
     * replay the trace. It calls tryNext(). If it returns true then next event
     * is scheduled at curTick() plus delta. If it returns false then delta is
     * ignored and control is brought back via recvRetry().
     */
    void schedIcacheNext();

    /**
     * This is the control flow that uses the functionality of the dcacheGen to
     * replay the trace. It calls execute(). It checks if execution is complete
     * and schedules an event to exit simulation accordingly.
     */
    void schedDcacheNext();

    /** Event for the control flow method schedIcacheNext() */
    EventWrapper<TraceCPU, &TraceCPU::schedIcacheNext> icacheNextEvent;

    /** Event for the control flow method schedDcacheNext() */
    EventWrapper<TraceCPU, &TraceCPU::schedDcacheNext> dcacheNextEvent;

    /** This is called when either generator finishes executing from the trace */
    void checkAndSchedExitEvent();

    /** Set to true when one of the generators finishes replaying its trace. */
    bool oneTraceComplete;

    /**
     * This stores the tick of the first instruction fetch request
     * which is later used for dumping the tickOffset stat.
     */
    Tick firstFetchTick;

    /**
     * Number of Trace CPUs in the system used as a shared variable and passed
     * to the CountedExitEvent event used for counting down exit events. It is
     * incremented in the constructor call so that the total is arrived at
     * automatically.
     */
    static int numTraceCPUs;

    /**
     * A CountedExitEvent which when serviced decrements the counter. A sim
     * exit event is scheduled when the counter equals zero, that is all
     * instances of Trace CPU have had their execCompleteEvent serviced.
     */
    CountedExitEvent *execCompleteEvent;

    Stats::Scalar numSchedDcacheEvent;
    Stats::Scalar numSchedIcacheEvent;

    /** Stat for number of simulated micro-ops. */
    Stats::Scalar numOps;
    /** Stat for the CPI. This is really cycles per micro-op and not inst. */
    Stats::Formula cpi;

    /**
     * The first execution tick is dumped as a stat so that the simulated
     * seconds for a trace replay can be calculated as a difference between the
     * final_tick stat and the tickOffset stat
     */
    Stats::Scalar tickOffset;

  public:

    /** Used to get a reference to the icache port. */
    MasterPort &getInstPort() { return icachePort; }

    /** Used to get a reference to the dcache port. */
    MasterPort &getDataPort() { return dcachePort; }

    void regStats();
};
#endif // __CPU_TRACE_TRACE_CPU_HH__