diff options
Diffstat (limited to 'src/cpu/o3')
41 files changed, 1028 insertions, 344 deletions
diff --git a/src/cpu/o3/2bit_local_pred.cc b/src/cpu/o3/2bit_local_pred.cc index 3d6a78bed..77a45ea26 100644 --- a/src/cpu/o3/2bit_local_pred.cc +++ b/src/cpu/o3/2bit_local_pred.cc @@ -33,9 +33,9 @@ #include "base/trace.hh" #include "cpu/o3/2bit_local_pred.hh" -DefaultBP::DefaultBP(unsigned _localPredictorSize, - unsigned _localCtrBits, - unsigned _instShiftAmt) +LocalBP::LocalBP(unsigned _localPredictorSize, + unsigned _localCtrBits, + unsigned _instShiftAmt) : localPredictorSize(_localPredictorSize), localCtrBits(_localCtrBits), instShiftAmt(_instShiftAmt) @@ -71,7 +71,7 @@ DefaultBP::DefaultBP(unsigned _localPredictorSize, } void -DefaultBP::reset() +LocalBP::reset() { for (int i = 0; i < localPredictorSets; ++i) { localCtrs[i].reset(); @@ -79,21 +79,21 @@ DefaultBP::reset() } bool -DefaultBP::lookup(Addr &branch_addr) +LocalBP::lookup(Addr &branch_addr, void * &bp_history) { bool taken; - uint8_t local_prediction; + uint8_t counter_val; unsigned local_predictor_idx = getLocalIndex(branch_addr); DPRINTF(Fetch, "Branch predictor: Looking up index %#x\n", local_predictor_idx); - local_prediction = localCtrs[local_predictor_idx].read(); + counter_val = localCtrs[local_predictor_idx].read(); DPRINTF(Fetch, "Branch predictor: prediction is %i.\n", - (int)local_prediction); + (int)counter_val); - taken = getPrediction(local_prediction); + taken = getPrediction(counter_val); #if 0 // Speculative update. @@ -110,8 +110,9 @@ DefaultBP::lookup(Addr &branch_addr) } void -DefaultBP::update(Addr &branch_addr, bool taken) +LocalBP::update(Addr &branch_addr, bool taken, void *bp_history) { + assert(bp_history == NULL); unsigned local_predictor_idx; // Update the local predictor. 
@@ -131,7 +132,7 @@ DefaultBP::update(Addr &branch_addr, bool taken) inline bool -DefaultBP::getPrediction(uint8_t &count) +LocalBP::getPrediction(uint8_t &count) { // Get the MSB of the count return (count >> (localCtrBits - 1)); @@ -139,7 +140,7 @@ DefaultBP::getPrediction(uint8_t &count) inline unsigned -DefaultBP::getLocalIndex(Addr &branch_addr) +LocalBP::getLocalIndex(Addr &branch_addr) { return (branch_addr >> instShiftAmt) & indexMask; } diff --git a/src/cpu/o3/2bit_local_pred.hh b/src/cpu/o3/2bit_local_pred.hh index 6e02a49be..0a2a71d3e 100644 --- a/src/cpu/o3/2bit_local_pred.hh +++ b/src/cpu/o3/2bit_local_pred.hh @@ -37,7 +37,14 @@ #include <vector> -class DefaultBP +/** + * Implements a local predictor that uses the PC to index into a table of + * counters. Note that any time a pointer to the bp_history is given, it + * should be NULL using this predictor because it does not have any branch + * predictor state that needs to be recorded or updated; the update can be + * determined solely by the branch being taken or not taken. + */ +class LocalBP { public: /** @@ -46,28 +53,31 @@ class DefaultBP * @param localCtrBits Number of bits per counter. * @param instShiftAmt Offset amount for instructions to ignore alignment. */ - DefaultBP(unsigned localPredictorSize, unsigned localCtrBits, - unsigned instShiftAmt); + LocalBP(unsigned localPredictorSize, unsigned localCtrBits, + unsigned instShiftAmt); /** * Looks up the given address in the branch predictor and returns * a true/false value as to whether it is taken. * @param branch_addr The address of the branch to look up. + * @param bp_history Pointer to any bp history state. * @return Whether or not the branch is taken. */ - bool lookup(Addr &branch_addr); + bool lookup(Addr &branch_addr, void * &bp_history); /** * Updates the branch predictor with the actual result of a branch. * @param branch_addr The address of the branch to update. * @param taken Whether or not the branch was taken. 
*/ - void update(Addr &branch_addr, bool taken); + void update(Addr &branch_addr, bool taken, void *bp_history); + + void squash(void *bp_history) + { assert(bp_history == NULL); } void reset(); private: - /** * Returns the taken/not taken prediction given the value of the * counter. diff --git a/src/cpu/o3/alpha_cpu.hh b/src/cpu/o3/alpha_cpu.hh index 4d889866a..2e5c856a8 100644 --- a/src/cpu/o3/alpha_cpu.hh +++ b/src/cpu/o3/alpha_cpu.hh @@ -43,6 +43,14 @@ namespace Kernel { class TranslatingPort; +/** + * AlphaFullCPU class. Derives from the FullO3CPU class, and + * implements all ISA and implementation specific functions of the + * CPU. This is the CPU class that is used for the SimObjects, and is + * what is given to the DynInsts. Most of its state exists in the + * FullO3CPU; the state it has is mainly for ISA specific + * functionality. + */ template <class Impl> class AlphaFullCPU : public FullO3CPU<Impl> { @@ -62,83 +70,120 @@ class AlphaFullCPU : public FullO3CPU<Impl> /** Constructs an AlphaFullCPU with the given parameters. */ AlphaFullCPU(Params *params); + /** + * Derived ExecContext class for use with the AlphaFullCPU. It + * provides the interface for any external objects to access a + * single thread's state and some general CPU state. Any time + * external objects try to update state through this interface, + * the CPU will create an event to squash all in-flight + * instructions in order to ensure state is maintained correctly. + */ class AlphaXC : public ExecContext { public: + /** Pointer to the CPU. */ AlphaFullCPU<Impl> *cpu; + /** Pointer to the thread state that this XC corresponds to. */ O3ThreadState<Impl> *thread; + /** Returns a pointer to this CPU. */ virtual BaseCPU *getCpuPtr() { return cpu; } + /** Sets this CPU's ID. */ virtual void setCpuId(int id) { cpu->cpu_id = id; } + /** Reads this CPU's ID. 
*/ virtual int readCpuId() { return cpu->cpu_id; } virtual TranslatingPort *getMemPort() { return /*thread->port*/ NULL; } #if FULL_SYSTEM + /** Returns a pointer to the system. */ virtual System *getSystemPtr() { return cpu->system; } + /** Returns a pointer to physical memory. */ virtual PhysicalMemory *getPhysMemPtr() { return cpu->physmem; } + /** Returns a pointer to the ITB. */ virtual AlphaITB *getITBPtr() { return cpu->itb; } - virtual AlphaDTB * getDTBPtr() { return cpu->dtb; } + /** Returns a pointer to the DTB. */ + virtual AlphaDTB *getDTBPtr() { return cpu->dtb; } + /** Returns a pointer to this thread's kernel statistics. */ virtual Kernel::Statistics *getKernelStats() { return thread->kernelStats; } #else + /** Returns a pointer to this thread's process. */ virtual Process *getProcessPtr() { return thread->process; } #endif - + /** Returns this thread's status. */ virtual Status status() const { return thread->status(); } + /** Sets this thread's status. */ virtual void setStatus(Status new_status) { thread->setStatus(new_status); } - /// Set the status to Active. Optional delay indicates number of - /// cycles to wait before beginning execution. + /** Set the status to Active. Optional delay indicates number of + * cycles to wait before beginning execution. */ virtual void activate(int delay = 1); - /// Set the status to Suspended. + /** Set the status to Suspended. */ virtual void suspend(); - /// Set the status to Unallocated. + /** Set the status to Unallocated. */ virtual void deallocate(); - /// Set the status to Halted. + /** Set the status to Halted. */ virtual void halt(); #if FULL_SYSTEM + /** Dumps the function profiling information. + * @todo: Implement. + */ virtual void dumpFuncProfile(); #endif - + /** Takes over execution of a thread from another CPU. */ virtual void takeOverFrom(ExecContext *old_context); + /** Registers statistics associated with this XC. */ virtual void regStats(const std::string &name); + /** Serializes state. 
*/ virtual void serialize(std::ostream &os); + /** Unserializes state. */ virtual void unserialize(Checkpoint *cp, const std::string &section); #if FULL_SYSTEM + /** Returns pointer to the quiesce event. */ virtual EndQuiesceEvent *getQuiesceEvent(); + /** Reads the last tick that this thread was activated on. */ virtual Tick readLastActivate(); + /** Reads the last tick that this thread was suspended on. */ virtual Tick readLastSuspend(); + /** Clears the function profiling information. */ virtual void profileClear(); + /** Samples the function profiling information. */ virtual void profileSample(); #endif - + /** Returns this thread's ID number. */ virtual int getThreadNum() { return thread->tid; } + /** Returns the instruction this thread is currently committing. + * Only used when an instruction faults. + */ virtual TheISA::MachInst getInst(); + /** Copies the architectural registers from another XC into this XC. */ virtual void copyArchRegs(ExecContext *xc); + /** Resets all architectural registers to 0. */ virtual void clearArchRegs(); + /** Reads an integer register. */ virtual uint64_t readIntReg(int reg_idx); virtual FloatReg readFloatReg(int reg_idx, int width); @@ -149,6 +194,7 @@ class AlphaFullCPU : public FullO3CPU<Impl> virtual FloatRegBits readFloatRegBits(int reg_idx); + /** Sets an integer register to a value. */ virtual void setIntReg(int reg_idx, uint64_t val); virtual void setFloatReg(int reg_idx, FloatReg val, int width); @@ -159,14 +205,18 @@ class AlphaFullCPU : public FullO3CPU<Impl> virtual void setFloatRegBits(int reg_idx, FloatRegBits val); + /** Reads this thread's PC. */ virtual uint64_t readPC() { return cpu->readPC(thread->tid); } + /** Sets this thread's PC. */ virtual void setPC(uint64_t val); + /** Reads this thread's next PC. */ virtual uint64_t readNextPC() { return cpu->readNextPC(thread->tid); } + /** Sets this thread's next PC. 
*/ virtual void setNextPC(uint64_t val); virtual uint64_t readNextNPC() @@ -178,43 +228,60 @@ class AlphaFullCPU : public FullO3CPU<Impl> virtual void setNextNPC(uint64_t val) { panic("Alpha has no NextNPC!"); } + /** Reads a miscellaneous register. */ virtual MiscReg readMiscReg(int misc_reg) { return cpu->readMiscReg(misc_reg, thread->tid); } + /** Reads a misc. register, including any side-effects the + * read might have as defined by the architecture. */ virtual MiscReg readMiscRegWithEffect(int misc_reg, Fault &fault) { return cpu->readMiscRegWithEffect(misc_reg, fault, thread->tid); } + /** Sets a misc. register. */ virtual Fault setMiscReg(int misc_reg, const MiscReg &val); + /** Sets a misc. register, including any side-effects the + * write might have as defined by the architecture. */ virtual Fault setMiscRegWithEffect(int misc_reg, const MiscReg &val); + /** Returns the number of consecutive store conditional failures. */ // @todo: Figure out where these store cond failures should go. virtual unsigned readStCondFailures() { return thread->storeCondFailures; } + /** Sets the number of consecutive store conditional failures. */ virtual void setStCondFailures(unsigned sc_failures) { thread->storeCondFailures = sc_failures; } #if FULL_SYSTEM + /** Returns if the thread is currently in PAL mode, based on + * the PC's value. */ virtual bool inPalMode() { return TheISA::PcPAL(cpu->readPC(thread->tid)); } #endif - // Only really makes sense for old CPU model. Lots of code // outside the CPU still checks this function, so it will // always return false to keep everything working. + /** Checks if the thread is misspeculating. Because it is + * very difficult to determine if the thread is + * misspeculating, this is set as false. */ virtual bool misspeculating() { return false; } #if !FULL_SYSTEM + /** Gets a syscall argument by index. */ virtual IntReg getSyscallArg(int i); + /** Sets a syscall argument. 
*/ virtual void setSyscallArg(int i, IntReg val); + /** Sets the syscall return value. */ virtual void setSyscallReturn(SyscallReturn return_value); + /** Executes a syscall in SE mode. */ virtual void syscall(int64_t callnum) { return cpu->syscall(callnum, thread->tid); } + /** Reads the funcExeInst counter. */ virtual Counter readFuncExeInst() { return thread->funcExeInst; } #endif virtual void changeRegFileContext(TheISA::RegFile::ContextParam param, @@ -274,19 +341,32 @@ class AlphaFullCPU : public FullO3CPU<Impl> } #endif + /** Reads a miscellaneous register. */ MiscReg readMiscReg(int misc_reg, unsigned tid); + /** Reads a misc. register, including any side effects the read + * might have as defined by the architecture. + */ MiscReg readMiscRegWithEffect(int misc_reg, Fault &fault, unsigned tid); + /** Sets a miscellaneous register. */ Fault setMiscReg(int misc_reg, const MiscReg &val, unsigned tid); + /** Sets a misc. register, including any side effects the write + * might have as defined by the architecture. + */ Fault setMiscRegWithEffect(int misc_reg, const MiscReg &val, unsigned tid); + /** Initiates a squash of all in-flight instructions for a given + * thread. The source of the squash is an external update of + * state through the XC. + */ void squashFromXC(unsigned tid); #if FULL_SYSTEM + /** Posts an interrupt. */ void post_interrupt(int int_num, int index); - + /** Reads the interrupt flag. */ int readIntrFlag(); /** Sets the interrupt flags. */ void setIntrFlag(int val); @@ -312,7 +392,7 @@ class AlphaFullCPU : public FullO3CPU<Impl> /** Executes a syscall. * @todo: Determine if this needs to be virtual. */ - void syscall(int64_t callnum, int thread_num); + void syscall(int64_t callnum, int tid); /** Gets a syscall argument. */ IntReg getSyscallArg(int i, int tid); @@ -438,6 +518,7 @@ class AlphaFullCPU : public FullO3CPU<Impl> Addr lockAddr; + /** Temporary fix for the lock flag, works in the UP case. 
*/ bool lockFlag; }; diff --git a/src/cpu/o3/alpha_cpu_builder.cc b/src/cpu/o3/alpha_cpu_builder.cc index c91e2c7c9..1592261de 100644 --- a/src/cpu/o3/alpha_cpu_builder.cc +++ b/src/cpu/o3/alpha_cpu_builder.cc @@ -107,6 +107,7 @@ Param<unsigned> squashWidth; Param<Tick> trapLatency; Param<Tick> fetchTrapLatency; +Param<std::string> predType; Param<unsigned> localPredictorSize; Param<unsigned> localCtrBits; Param<unsigned> localHistoryTableSize; @@ -229,6 +230,7 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivAlphaFullCPU) INIT_PARAM_DFLT(trapLatency, "Number of cycles before the trap is handled", 6), INIT_PARAM_DFLT(fetchTrapLatency, "Number of cycles before the fetch trap is handled", 12), + INIT_PARAM(predType, "Type of branch predictor ('local', 'tournament')"), INIT_PARAM(localPredictorSize, "Size of local predictor"), INIT_PARAM(localCtrBits, "Bits per counter"), INIT_PARAM(localHistoryTableSize, "Size of local history table"), @@ -359,6 +361,7 @@ CREATE_SIM_OBJECT(DerivAlphaFullCPU) params->trapLatency = trapLatency; params->fetchTrapLatency = fetchTrapLatency; + params->predType = predType; params->localPredictorSize = localPredictorSize; params->localCtrBits = localCtrBits; params->localHistoryTableSize = localHistoryTableSize; diff --git a/src/cpu/o3/alpha_cpu_impl.hh b/src/cpu/o3/alpha_cpu_impl.hh index 6893f8c64..ad4401f7e 100644 --- a/src/cpu/o3/alpha_cpu_impl.hh +++ b/src/cpu/o3/alpha_cpu_impl.hh @@ -60,10 +60,12 @@ AlphaFullCPU<Impl>::AlphaFullCPU(Params *params) { DPRINTF(FullCPU, "AlphaFullCPU: Creating AlphaFullCPU object.\n"); + // Setup any thread state. this->thread.resize(this->numThreads); for (int i = 0; i < this->numThreads; ++i) { #if FULL_SYSTEM + // SMT is not supported in FS mode yet. 
assert(this->numThreads == 1); this->thread[i] = new Thread(this, 0, params->mem); this->thread[i]->setStatus(ExecContext::Suspended); @@ -86,29 +88,34 @@ AlphaFullCPU<Impl>::AlphaFullCPU(Params *params) } #endif // !FULL_SYSTEM - this->thread[i]->numInst = 0; - ExecContext *xc_proxy; - AlphaXC *alpha_xc_proxy = new AlphaXC; + // Setup the XC that will serve as the interface to the threads/CPU. + AlphaXC *alpha_xc = new AlphaXC; + // If we're using a checker, then the XC should be the + // CheckerExecContext. if (params->checker) { - xc_proxy = new CheckerExecContext<AlphaXC>(alpha_xc_proxy, this->checker); + xc_proxy = new CheckerExecContext<AlphaXC>( + alpha_xc, this->checker); } else { - xc_proxy = alpha_xc_proxy; + xc_proxy = alpha_xc; } - alpha_xc_proxy->cpu = this; - alpha_xc_proxy->thread = this->thread[i]; + alpha_xc->cpu = this; + alpha_xc->thread = this->thread[i]; #if FULL_SYSTEM + // Setup quiesce event. this->thread[i]->quiesceEvent = new EndQuiesceEvent(xc_proxy); this->thread[i]->lastActivate = 0; this->thread[i]->lastSuspend = 0; #endif + // Give the thread the XC. this->thread[i]->xcProxy = xc_proxy; + // Add the XC to the CPU's list of XC's. 
this->execContexts.push_back(xc_proxy); } @@ -170,6 +177,7 @@ AlphaFullCPU<Impl>::AlphaXC::takeOverFrom(ExecContext *old_context) setStatus(old_context->status()); copyArchRegs(old_context); setCpuId(old_context->readCpuId()); + #if !FULL_SYSTEM thread->funcExeInst = old_context->readFuncExeInst(); #else @@ -391,7 +399,6 @@ template <class Impl> uint64_t AlphaFullCPU<Impl>::AlphaXC::readIntReg(int reg_idx) { - DPRINTF(Fault, "Reading int register through the XC!\n"); return cpu->readArchIntReg(reg_idx, thread->tid); } @@ -399,7 +406,6 @@ template <class Impl> FloatReg AlphaFullCPU<Impl>::AlphaXC::readFloatReg(int reg_idx, int width) { - DPRINTF(Fault, "Reading float register through the XC!\n"); switch(width) { case 32: return cpu->readArchFloatRegSingle(reg_idx, thread->tid); @@ -415,7 +421,6 @@ template <class Impl> FloatReg AlphaFullCPU<Impl>::AlphaXC::readFloatReg(int reg_idx) { - DPRINTF(Fault, "Reading float register through the XC!\n"); return cpu->readArchFloatRegSingle(reg_idx, thread->tid); } @@ -431,7 +436,6 @@ template <class Impl> FloatRegBits AlphaFullCPU<Impl>::AlphaXC::readFloatRegBits(int reg_idx) { - DPRINTF(Fault, "Reading floatint register through the XC!\n"); return cpu->readArchFloatRegInt(reg_idx, thread->tid); } @@ -439,9 +443,9 @@ template <class Impl> void AlphaFullCPU<Impl>::AlphaXC::setIntReg(int reg_idx, uint64_t val) { - DPRINTF(Fault, "Setting int register through the XC!\n"); cpu->setArchIntReg(reg_idx, val, thread->tid); + // Squash if we're not already in a state update mode. 
if (!thread->trapPending && !thread->inSyscall) { cpu->squashFromXC(thread->tid); } @@ -451,7 +455,6 @@ template <class Impl> void AlphaFullCPU<Impl>::AlphaXC::setFloatReg(int reg_idx, FloatReg val, int width) { - DPRINTF(Fault, "Setting float register through the XC!\n"); switch(width) { case 32: cpu->setArchFloatRegSingle(reg_idx, val, thread->tid); @@ -461,6 +464,7 @@ AlphaFullCPU<Impl>::AlphaXC::setFloatReg(int reg_idx, FloatReg val, int width) break; } + // Squash if we're not already in a state update mode. if (!thread->trapPending && !thread->inSyscall) { cpu->squashFromXC(thread->tid); } @@ -470,7 +474,6 @@ template <class Impl> void AlphaFullCPU<Impl>::AlphaXC::setFloatReg(int reg_idx, FloatReg val) { - DPRINTF(Fault, "Setting float register through the XC!\n"); cpu->setArchFloatRegSingle(reg_idx, val, thread->tid); if (!thread->trapPending && !thread->inSyscall) { @@ -486,6 +489,7 @@ AlphaFullCPU<Impl>::AlphaXC::setFloatRegBits(int reg_idx, FloatRegBits val, DPRINTF(Fault, "Setting floatint register through the XC!\n"); cpu->setArchFloatRegInt(reg_idx, val, thread->tid); + // Squash if we're not already in a state update mode. if (!thread->trapPending && !thread->inSyscall) { cpu->squashFromXC(thread->tid); } @@ -495,9 +499,9 @@ template <class Impl> void AlphaFullCPU<Impl>::AlphaXC::setFloatRegBits(int reg_idx, FloatRegBits val) { - DPRINTF(Fault, "Setting floatint register through the XC!\n"); cpu->setArchFloatRegInt(reg_idx, val, thread->tid); + // Squash if we're not already in a state update mode. if (!thread->trapPending && !thread->inSyscall) { cpu->squashFromXC(thread->tid); } @@ -509,6 +513,7 @@ AlphaFullCPU<Impl>::AlphaXC::setPC(uint64_t val) { cpu->setPC(val, thread->tid); + // Squash if we're not already in a state update mode. 
if (!thread->trapPending && !thread->inSyscall) { cpu->squashFromXC(thread->tid); } @@ -520,6 +525,7 @@ AlphaFullCPU<Impl>::AlphaXC::setNextPC(uint64_t val) { cpu->setNextPC(val, thread->tid); + // Squash if we're not already in a state update mode. if (!thread->trapPending && !thread->inSyscall) { cpu->squashFromXC(thread->tid); } @@ -529,10 +535,9 @@ template <class Impl> Fault AlphaFullCPU<Impl>::AlphaXC::setMiscReg(int misc_reg, const MiscReg &val) { - DPRINTF(Fault, "Setting misc register through the XC!\n"); - Fault ret_fault = cpu->setMiscReg(misc_reg, val, thread->tid); + // Squash if we're not already in a state update mode. if (!thread->trapPending && !thread->inSyscall) { cpu->squashFromXC(thread->tid); } @@ -542,12 +547,12 @@ AlphaFullCPU<Impl>::AlphaXC::setMiscReg(int misc_reg, const MiscReg &val) template <class Impl> Fault -AlphaFullCPU<Impl>::AlphaXC::setMiscRegWithEffect(int misc_reg, const MiscReg &val) +AlphaFullCPU<Impl>::AlphaXC::setMiscRegWithEffect(int misc_reg, + const MiscReg &val) { - DPRINTF(Fault, "Setting misc register through the XC!\n"); - Fault ret_fault = cpu->setMiscRegWithEffect(misc_reg, val, thread->tid); + // Squash if we're not already in a state update mode. if (!thread->trapPending && !thread->inSyscall) { cpu->squashFromXC(thread->tid); } @@ -628,7 +633,6 @@ AlphaFullCPU<Impl>::post_interrupt(int int_num, int index) if (this->thread[0]->status() == ExecContext::Suspended) { DPRINTF(IPI,"Suspended Processor awoke\n"); -// xcProxies[0]->activate(); this->execContexts[0]->activate(); } } @@ -691,6 +695,7 @@ template <class Impl> void AlphaFullCPU<Impl>::trap(Fault fault, unsigned tid) { + // Pass the thread's XC into the invoke method. fault->invoke(this->execContexts[tid]); } @@ -741,6 +746,7 @@ AlphaFullCPU<Impl>::processInterrupts() if (ipl && ipl > this->readMiscReg(IPR_IPLR, 0)) { this->setMiscReg(IPR_ISR, summary, 0); this->setMiscReg(IPR_INTID, ipl, 0); + // Checker needs to know these two registers were updated. 
if (this->checker) { this->checker->cpuXCBase()->setMiscReg(IPR_ISR, summary); this->checker->cpuXCBase()->setMiscReg(IPR_INTID, ipl); diff --git a/src/cpu/o3/alpha_dyn_inst.hh b/src/cpu/o3/alpha_dyn_inst.hh index af2858802..143ffe7e4 100644 --- a/src/cpu/o3/alpha_dyn_inst.hh +++ b/src/cpu/o3/alpha_dyn_inst.hh @@ -93,23 +93,31 @@ class AlphaDynInst : public BaseDynInst<Impl> void initVars(); public: + /** Reads a miscellaneous register. */ MiscReg readMiscReg(int misc_reg) { return this->cpu->readMiscReg(misc_reg, this->threadNumber); } + /** Reads a misc. register, including any side-effects the read + * might have as defined by the architecture. + */ MiscReg readMiscRegWithEffect(int misc_reg, Fault &fault) { return this->cpu->readMiscRegWithEffect(misc_reg, fault, this->threadNumber); } + /** Sets a misc. register. */ Fault setMiscReg(int misc_reg, const MiscReg &val) { this->instResult.integer = val; return this->cpu->setMiscReg(misc_reg, val, this->threadNumber); } + /** Sets a misc. register, including any side-effects the write + * might have as defined by the architecture. + */ Fault setMiscRegWithEffect(int misc_reg, const MiscReg &val) { return this->cpu->setMiscRegWithEffect(misc_reg, val, diff --git a/src/cpu/o3/alpha_dyn_inst_impl.hh b/src/cpu/o3/alpha_dyn_inst_impl.hh index 06755eb76..3a0727b45 100644 --- a/src/cpu/o3/alpha_dyn_inst_impl.hh +++ b/src/cpu/o3/alpha_dyn_inst_impl.hh @@ -66,9 +66,10 @@ template <class Impl> Fault AlphaDynInst<Impl>::execute() { - // @todo: Pretty convoluted way to avoid squashing from happening when using - // the XC during an instruction's execution (specifically for instructions - // that have sideeffects that use the XC). Fix this. + // @todo: Pretty convoluted way to avoid squashing from happening + // when using the XC during an instruction's execution + // (specifically for instructions that have side-effects that use + // the XC). Fix this. 
bool in_syscall = this->thread->inSyscall; this->thread->inSyscall = true; @@ -83,9 +84,10 @@ template <class Impl> Fault AlphaDynInst<Impl>::initiateAcc() { - // @todo: Pretty convoluted way to avoid squashing from happening when using - // the XC during an instruction's execution (specifically for instructions - // that have sideeffects that use the XC). Fix this. + // @todo: Pretty convoluted way to avoid squashing from happening + // when using the XC during an instruction's execution + // (specifically for instructions that have side-effects that use + // the XC). Fix this. bool in_syscall = this->thread->inSyscall; this->thread->inSyscall = true; @@ -118,9 +120,11 @@ template <class Impl> Fault AlphaDynInst<Impl>::hwrei() { + // Can only do a hwrei when in pal mode. if (!this->cpu->inPalMode(this->readPC())) return new AlphaISA::UnimplementedOpcodeFault; + // Set the next PC based on the value of the EXC_ADDR IPR. this->setNextPC(this->cpu->readMiscReg(AlphaISA::IPR_EXC_ADDR, this->threadNumber)); diff --git a/src/cpu/o3/alpha_params.hh b/src/cpu/o3/alpha_params.hh index 8c6779495..e48abd9ed 100644 --- a/src/cpu/o3/alpha_params.hh +++ b/src/cpu/o3/alpha_params.hh @@ -126,8 +126,9 @@ class AlphaSimpleParams : public BaseFullCPU::Params Tick fetchTrapLatency; // - // Branch predictor (BP & BTB) + // Branch predictor (BP, BTB, RAS) // + std::string predType; unsigned localPredictorSize; unsigned localCtrBits; unsigned localHistoryTableSize; diff --git a/src/cpu/o3/bpred_unit.cc b/src/cpu/o3/bpred_unit.cc index d0af5af92..b33543bdc 100644 --- a/src/cpu/o3/bpred_unit.cc +++ b/src/cpu/o3/bpred_unit.cc @@ -34,6 +34,6 @@ #include "cpu/ozone/ozone_impl.hh" //#include "cpu/ozone/simple_impl.hh" -template class TwobitBPredUnit<AlphaSimpleImpl>; -template class TwobitBPredUnit<OzoneImpl>; -//template class TwobitBPredUnit<SimpleImpl>; +template class BPredUnit<AlphaSimpleImpl>; +template class BPredUnit<OzoneImpl>; +//template class BPredUnit<SimpleImpl>; diff --git 
a/src/cpu/o3/bpred_unit.hh b/src/cpu/o3/bpred_unit.hh index 39a88a3ce..2c0a39565 100644 --- a/src/cpu/o3/bpred_unit.hh +++ b/src/cpu/o3/bpred_unit.hh @@ -48,16 +48,25 @@ * and the BTB. */ template<class Impl> -class TwobitBPredUnit +class BPredUnit { - public: + private: typedef typename Impl::Params Params; typedef typename Impl::DynInstPtr DynInstPtr; + enum PredType { + Local, + Tournament + }; + + PredType predictor; + + public: + /** * @param params The params object, that has the size of the BP and BTB. */ - TwobitBPredUnit(Params *params); + BPredUnit(Params *params); /** * Registers statistics. @@ -78,6 +87,9 @@ class TwobitBPredUnit */ bool predict(DynInstPtr &inst, Addr &PC, unsigned tid); + // @todo: Rename this function. + void BPUncond(void * &bp_history); + /** * Tells the branch predictor to commit any updates until the given * sequence number. @@ -107,12 +119,19 @@ class TwobitBPredUnit bool actually_taken, unsigned tid); /** + * @param bp_history Pointer to the history object. The predictor + * will need to update any state and delete the object. + */ + void BPSquash(void *bp_history); + + /** * Looks up a given PC in the BP to see if it is taken or not taken. * @param inst_PC The PC to look up. + * @param bp_history Pointer that will be set to an object that + * has the branch predictor state associated with the lookup. * @return Whether the branch is taken or not taken. */ - bool BPLookup(Addr &inst_PC) - { return BP.lookup(inst_PC); } + bool BPLookup(Addr &inst_PC, void * &bp_history); /** * Looks up a given PC in the BTB to see if a matching entry exists. @@ -134,10 +153,11 @@ class TwobitBPredUnit * Updates the BP with taken/not taken information. * @param inst_PC The branch's PC that will be updated. * @param taken Whether the branch was taken or not taken. + * @param bp_history Pointer to the branch predictor state that is + * associated with the branch lookup that is being updated. 
* @todo Make this update flexible enough to handle a global predictor. */ - void BPUpdate(Addr &inst_PC, bool taken) - { BP.update(inst_PC, taken); } + void BPUpdate(Addr &inst_PC, bool taken, void *bp_history); /** * Updates the BTB with the target of a branch. @@ -147,18 +167,20 @@ class TwobitBPredUnit void BTBUpdate(Addr &inst_PC, Addr &target_PC) { BTB.update(inst_PC, target_PC,0); } + void dump(); + private: struct PredictorHistory { /** - * Makes a predictor history struct that contains a sequence number, - * the PC of its instruction, and whether or not it was predicted - * taken. + * Makes a predictor history struct that contains any + * information needed to update the predictor, BTB, and RAS. */ PredictorHistory(const InstSeqNum &seq_num, const Addr &inst_PC, - const bool pred_taken, const unsigned _tid) - : seqNum(seq_num), PC(inst_PC), RASTarget(0), globalHistory(0), + const bool pred_taken, void *bp_history, + const unsigned _tid) + : seqNum(seq_num), PC(inst_PC), RASTarget(0), RASIndex(0), tid(_tid), predTaken(pred_taken), usedRAS(0), - wasCall(0) + wasCall(0), bpHistory(bp_history) { } /** The sequence number for the predictor history entry. */ @@ -170,9 +192,6 @@ class TwobitBPredUnit /** The RAS target (only valid if a return). */ Addr RASTarget; - /** The global history at the time this entry was created. */ - unsigned globalHistory; - /** The RAS index of the instruction (only valid if a call). */ unsigned RASIndex; @@ -187,6 +206,12 @@ class TwobitBPredUnit /** Whether or not the instruction was a call. */ bool wasCall; + + /** Pointer to the history object passed back from the branch + * predictor. It is used to update or restore state of the + * branch predictor. + */ + void *bpHistory; }; typedef std::list<PredictorHistory> History; @@ -198,8 +223,11 @@ class TwobitBPredUnit */ History predHist[Impl::MaxThreads]; - /** The branch predictor. */ - DefaultBP BP; + /** The local branch predictor. 
*/ + LocalBP *localBP; + + /** The tournament branch predictor. */ + TournamentBP *tournamentBP; /** The BTB. */ DefaultBTB BTB; diff --git a/src/cpu/o3/bpred_unit_impl.hh b/src/cpu/o3/bpred_unit_impl.hh index cde9f28ab..0da02145b 100644 --- a/src/cpu/o3/bpred_unit_impl.hh +++ b/src/cpu/o3/bpred_unit_impl.hh @@ -38,21 +38,40 @@ using namespace std; template<class Impl> -TwobitBPredUnit<Impl>::TwobitBPredUnit(Params *params) - : BP(params->localPredictorSize, - params->localCtrBits, - params->instShiftAmt), - BTB(params->BTBEntries, +BPredUnit<Impl>::BPredUnit(Params *params) + : BTB(params->BTBEntries, params->BTBTagSize, params->instShiftAmt) { + // Setup the selected predictor. + if (params->predType == "local") { + localBP = new LocalBP(params->localPredictorSize, + params->localCtrBits, + params->instShiftAmt); + predictor = Local; + } else if (params->predType == "tournament") { + tournamentBP = new TournamentBP(params->localPredictorSize, + params->localCtrBits, + params->localHistoryTableSize, + params->localHistoryBits, + params->globalPredictorSize, + params->globalHistoryBits, + params->globalCtrBits, + params->choicePredictorSize, + params->choiceCtrBits, + params->instShiftAmt); + predictor = Tournament; + } else { + fatal("Invalid BP selected!"); + } + for (int i=0; i < Impl::MaxThreads; i++) RAS[i].init(params->RASSize); } template <class Impl> void -TwobitBPredUnit<Impl>::regStats() +BPredUnit<Impl>::regStats() { lookups .name(name() + ".BPredUnit.lookups") @@ -98,17 +117,20 @@ TwobitBPredUnit<Impl>::regStats() template <class Impl> void -TwobitBPredUnit<Impl>::switchOut() +BPredUnit<Impl>::switchOut() { + // Clear any state upon switch out. for (int i = 0; i < Impl::MaxThreads; ++i) { - predHist[i].clear(); + squash(0, i); } } template <class Impl> void -TwobitBPredUnit<Impl>::takeOverFrom() +BPredUnit<Impl>::takeOverFrom() { + // Can reset all predictor state, but it's not necessarily better + // than leaving it be. 
/* for (int i = 0; i < Impl::MaxThreads; ++i) RAS[i].reset(); @@ -120,11 +142,10 @@ TwobitBPredUnit<Impl>::takeOverFrom() template <class Impl> bool -TwobitBPredUnit<Impl>::predict(DynInstPtr &inst, Addr &PC, unsigned tid) +BPredUnit<Impl>::predict(DynInstPtr &inst, Addr &PC, unsigned tid) { // See if branch predictor predicts taken. // If so, get its target addr either from the BTB or the RAS. - // Once that's done, speculatively update the predictor? // Save off record of branch stuff so the RAS can be fixed // up once it's done. @@ -135,20 +156,25 @@ TwobitBPredUnit<Impl>::predict(DynInstPtr &inst, Addr &PC, unsigned tid) ++lookups; + void *bp_history = NULL; + if (inst->isUncondCtrl()) { DPRINTF(Fetch, "BranchPred: [tid:%i] Unconditional control.\n", tid); pred_taken = true; + // Tell the BP there was an unconditional branch. + BPUncond(bp_history); } else { ++condPredicted; - pred_taken = BPLookup(PC); + pred_taken = BPLookup(PC, bp_history); DPRINTF(Fetch, "BranchPred: [tid:%i]: Branch predictor predicted %i " "for PC %#x\n", tid, pred_taken, inst->readPC()); } - PredictorHistory predict_record(inst->seqNum, PC, pred_taken, tid); + PredictorHistory predict_record(inst->seqNum, PC, pred_taken, + bp_history, tid); // Now lookup in the BTB or RAS. if (pred_taken) { @@ -189,7 +215,7 @@ TwobitBPredUnit<Impl>::predict(DynInstPtr &inst, Addr &PC, unsigned tid) if (BTB.valid(PC, tid)) { ++BTBHits; - //If it's anything else, use the BTB to get the target addr. + // If it's not a return, use the BTB to get the target addr. 
target = BTB.lookup(PC, tid); DPRINTF(Fetch, "BranchPred: [tid:%i]: Instruction %#x predicted" @@ -223,7 +249,7 @@ TwobitBPredUnit<Impl>::predict(DynInstPtr &inst, Addr &PC, unsigned tid) template <class Impl> void -TwobitBPredUnit<Impl>::update(const InstSeqNum &done_sn, unsigned tid) +BPredUnit<Impl>::update(const InstSeqNum &done_sn, unsigned tid) { DPRINTF(Fetch, "BranchPred: [tid:%i]: Commiting branches until sequence" "number %lli.\n", tid, done_sn); @@ -231,8 +257,9 @@ TwobitBPredUnit<Impl>::update(const InstSeqNum &done_sn, unsigned tid) while (!predHist[tid].empty() && predHist[tid].back().seqNum <= done_sn) { // Update the branch predictor with the correct results. - BP.update(predHist[tid].back().PC, - predHist[tid].back().predTaken); + BPUpdate(predHist[tid].back().PC, + predHist[tid].back().predTaken, + predHist[tid].back().bpHistory); predHist[tid].pop_back(); } @@ -240,13 +267,13 @@ TwobitBPredUnit<Impl>::update(const InstSeqNum &done_sn, unsigned tid) template <class Impl> void -TwobitBPredUnit<Impl>::squash(const InstSeqNum &squashed_sn, unsigned tid) +BPredUnit<Impl>::squash(const InstSeqNum &squashed_sn, unsigned tid) { History &pred_hist = predHist[tid]; while (!pred_hist.empty() && pred_hist.front().seqNum > squashed_sn) { - if (pred_hist.front().usedRAS) { + if (pred_hist.front().usedRAS) { DPRINTF(Fetch, "BranchPred: [tid:%i]: Restoring top of RAS to: %i," " target: %#x.\n", tid, @@ -257,12 +284,15 @@ TwobitBPredUnit<Impl>::squash(const InstSeqNum &squashed_sn, unsigned tid) pred_hist.front().RASTarget); } else if (pred_hist.front().wasCall) { - DPRINTF(Fetch, "BranchPred: [tid:%i]: Removing speculative entry added " - "to the RAS.\n",tid); + DPRINTF(Fetch, "BranchPred: [tid:%i]: Removing speculative entry " + "added to the RAS.\n",tid); RAS[tid].pop(); } + // This call should delete the bpHistory. 
+ BPSquash(pred_hist.front().bpHistory); + pred_hist.pop_front(); } @@ -270,10 +300,10 @@ TwobitBPredUnit<Impl>::squash(const InstSeqNum &squashed_sn, unsigned tid) template <class Impl> void -TwobitBPredUnit<Impl>::squash(const InstSeqNum &squashed_sn, - const Addr &corr_target, - const bool actually_taken, - unsigned tid) +BPredUnit<Impl>::squash(const InstSeqNum &squashed_sn, + const Addr &corr_target, + const bool actually_taken, + unsigned tid) { // Now that we know that a branch was mispredicted, we need to undo // all the branches that have been seen up until this branch and @@ -287,40 +317,96 @@ TwobitBPredUnit<Impl>::squash(const InstSeqNum &squashed_sn, "setting target to %#x.\n", tid, squashed_sn, corr_target); - while (!pred_hist.empty() && - pred_hist.front().seqNum > squashed_sn) { - if (pred_hist.front().usedRAS) { - DPRINTF(Fetch, "BranchPred: [tid:%i]: Restoring top of RAS to: %i, " - "target: %#x.\n", - tid, - pred_hist.front().RASIndex, - pred_hist.front().RASTarget); - - RAS[tid].restore(pred_hist.front().RASIndex, - pred_hist.front().RASTarget); - } else if (pred_hist.front().wasCall) { - DPRINTF(Fetch, "BranchPred: [tid:%i]: Removing speculative entry" - " added to the RAS.\n", tid); - - RAS[tid].pop(); - } - - pred_hist.pop_front(); - } + squash(squashed_sn, tid); // If there's a squash due to a syscall, there may not be an entry // corresponding to the squash. In that case, don't bother trying to // fix up the entry. 
if (!pred_hist.empty()) { - pred_hist.front().predTaken = actually_taken; - + assert(pred_hist.front().seqNum == squashed_sn); if (pred_hist.front().usedRAS) { ++RASIncorrect; } - BP.update(pred_hist.front().PC, actually_taken); + BPUpdate(pred_hist.front().PC, actually_taken, + pred_hist.front().bpHistory); BTB.update(pred_hist.front().PC, corr_target, tid); pred_hist.pop_front(); } } + +template <class Impl> +void +BPredUnit<Impl>::BPUncond(void * &bp_history) +{ + // Only the tournament predictor cares about unconditional branches. + if (predictor == Tournament) { + tournamentBP->uncondBr(bp_history); + } +} + +template <class Impl> +void +BPredUnit<Impl>::BPSquash(void *bp_history) +{ + if (predictor == Local) { + localBP->squash(bp_history); + } else if (predictor == Tournament) { + tournamentBP->squash(bp_history); + } else { + panic("Predictor type is unexpected value!"); + } +} + +template <class Impl> +bool +BPredUnit<Impl>::BPLookup(Addr &inst_PC, void * &bp_history) +{ + if (predictor == Local) { + return localBP->lookup(inst_PC, bp_history); + } else if (predictor == Tournament) { + return tournamentBP->lookup(inst_PC, bp_history); + } else { + panic("Predictor type is unexpected value!"); + } +} + +template <class Impl> +void +BPredUnit<Impl>::BPUpdate(Addr &inst_PC, bool taken, void *bp_history) +{ + if (predictor == Local) { + localBP->update(inst_PC, taken, bp_history); + } else if (predictor == Tournament) { + tournamentBP->update(inst_PC, taken, bp_history); + } else { + panic("Predictor type is unexpected value!"); + } +} + +template <class Impl> +void +BPredUnit<Impl>::dump() +{ + typename History::iterator pred_hist_it; + + for (int i = 0; i < Impl::MaxThreads; ++i) { + if (!predHist[i].empty()) { + pred_hist_it = predHist[i].begin(); + + cprintf("predHist[%i].size(): %i\n", i, predHist[i].size()); + + while (pred_hist_it != predHist[i].end()) { + cprintf("[sn:%lli], PC:%#x, tid:%i, predTaken:%i, " + "bpHistory:%#x\n", + (*pred_hist_it).seqNum, 
(*pred_hist_it).PC, + (*pred_hist_it).tid, (*pred_hist_it).predTaken, + (*pred_hist_it).bpHistory); + pred_hist_it++; + } + + cprintf("\n"); + } + } +} diff --git a/src/cpu/o3/comm.hh b/src/cpu/o3/comm.hh index 84ad5f6b1..bf1bd08e8 100644 --- a/src/cpu/o3/comm.hh +++ b/src/cpu/o3/comm.hh @@ -43,6 +43,7 @@ // typedef yet are not templated on the Impl. For now it will be defined here. typedef short int PhysRegIndex; +/** Struct that defines the information passed from fetch to decode. */ template<class Impl> struct DefaultFetchDefaultDecode { typedef typename Impl::DynInstPtr DynInstPtr; @@ -55,6 +56,7 @@ struct DefaultFetchDefaultDecode { bool clearFetchFault; }; +/** Struct that defines the information passed from decode to rename. */ template<class Impl> struct DefaultDecodeDefaultRename { typedef typename Impl::DynInstPtr DynInstPtr; @@ -64,6 +66,7 @@ struct DefaultDecodeDefaultRename { DynInstPtr insts[Impl::MaxWidth]; }; +/** Struct that defines the information passed from rename to IEW. */ template<class Impl> struct DefaultRenameDefaultIEW { typedef typename Impl::DynInstPtr DynInstPtr; @@ -73,6 +76,7 @@ struct DefaultRenameDefaultIEW { DynInstPtr insts[Impl::MaxWidth]; }; +/** Struct that defines the information passed from IEW to commit. */ template<class Impl> struct DefaultIEWDefaultCommit { typedef typename Impl::DynInstPtr DynInstPtr; @@ -100,6 +104,7 @@ struct IssueStruct { DynInstPtr insts[Impl::MaxWidth]; }; +/** Struct that defines all backwards communication. */ template<class Impl> struct TimeBufStruct { struct decodeComm { @@ -121,13 +126,7 @@ struct TimeBufStruct { decodeComm decodeInfo[Impl::MaxThreads]; - // Rename can't actually tell anything to squash or send a new PC back - // because it doesn't do anything along those lines. But maybe leave - // these fields in here to keep the stages mostly orthagonal. 
struct renameComm { - bool squash; - - uint64_t nextPC; }; renameComm renameInfo[Impl::MaxThreads]; diff --git a/src/cpu/o3/commit.hh b/src/cpu/o3/commit.hh index ae2aa2996..eef96b5fd 100644 --- a/src/cpu/o3/commit.hh +++ b/src/cpu/o3/commit.hh @@ -85,6 +85,9 @@ class DefaultCommit typedef O3ThreadState<Impl> Thread; + /** Event class used to schedule a squash due to a trap (fault or + * interrupt) to happen on a specific cycle. + */ class TrapEvent : public Event { private: DefaultCommit<Impl> *commit; @@ -162,7 +165,7 @@ class DefaultCommit Fetch *fetchStage; - /** Sets the poitner to the IEW stage. */ + /** Sets the pointer to the IEW stage. */ void setIEWStage(IEW *iew_stage); /** The pointer to the IEW stage. Used solely to ensure that @@ -183,10 +186,13 @@ class DefaultCommit /** Initializes stage by sending back the number of free entries. */ void initStage(); + /** Initializes the switching out of commit. */ void switchOut(); + /** Completes the switch out of commit. */ void doSwitchOut(); + /** Takes over from another CPU's thread. */ void takeOverFrom(); /** Ticks the commit stage, which tries to commit instructions. */ @@ -200,11 +206,18 @@ class DefaultCommit /** Returns the number of free ROB entries for a specific thread. */ unsigned numROBFreeEntries(unsigned tid); + /** Generates an event to schedule a squash due to a trap. */ + void generateTrapEvent(unsigned tid); + + /** Records that commit needs to initiate a squash due to an + * external state update through the XC. + */ void generateXCEvent(unsigned tid); private: /** Updates the overall status of commit with the nextStatus, and - * tell the CPU if commit is active/inactive. */ + * tell the CPU if commit is active/inactive. + */ void updateStatus(); /** Sets the next status based on threads' statuses, which becomes the @@ -223,10 +236,13 @@ class DefaultCommit */ bool changedROBEntries(); + /** Squashes all in flight instructions. 
*/ void squashAll(unsigned tid); + /** Handles squashing due to a trap. */ void squashFromTrap(unsigned tid); + /** Handles squashing due to an XC write. */ void squashFromXC(unsigned tid); /** Commits as many instructions as possible. */ @@ -237,8 +253,6 @@ class DefaultCommit */ bool commitHead(DynInstPtr &head_inst, unsigned inst_num); - void generateTrapEvent(unsigned tid); - /** Gets instructions from rename and inserts them into the ROB. */ void getInsts(); @@ -260,12 +274,16 @@ class DefaultCommit */ uint64_t readPC() { return PC[0]; } + /** Returns the PC of a specific thread. */ uint64_t readPC(unsigned tid) { return PC[tid]; } + /** Sets the PC of a specific thread. */ void setPC(uint64_t val, unsigned tid) { PC[tid] = val; } + /** Reads the PC of a specific thread. */ uint64_t readNextPC(unsigned tid) { return nextPC[tid]; } + /** Sets the next PC of a specific thread. */ void setNextPC(uint64_t val, unsigned tid) { nextPC[tid] = val; } private: @@ -302,6 +320,7 @@ class DefaultCommit /** Pointer to FullCPU. */ FullCPU *cpu; + /** Vector of all of the threads. */ std::vector<Thread *> thread; Fault fetchFault; @@ -360,17 +379,27 @@ class DefaultCommit /** Number of Active Threads */ unsigned numThreads; + /** Is a switch out pending. */ bool switchPending; + + /** Is commit switched out. */ bool switchedOut; + /** The latency to handle a trap. Used when scheduling trap + * squash event. + */ Tick trapLatency; Tick fetchTrapLatency; Tick fetchFaultTick; + /** The commit PC of each thread. Refers to the instruction that + * is currently being processed/committed. + */ Addr PC[Impl::MaxThreads]; + /** The next PC of each thread. */ Addr nextPC[Impl::MaxThreads]; /** The sequence number of the youngest valid instruction in the ROB. */ @@ -382,6 +411,7 @@ class DefaultCommit /** Rename map interface. */ RenameMap *renameMap[Impl::MaxThreads]; + /** Updates commit stats based on this instruction. 
*/ void updateComInstStats(DynInstPtr &inst); /** Stat for the total number of committed instructions. */ @@ -415,7 +445,9 @@ class DefaultCommit /** Total number of committed branches. */ Stats::Vector<> statComBranches; + /** Number of cycles where the commit bandwidth limit is reached. */ Stats::Scalar<> commitEligibleSamples; + /** Number of instructions not committed due to bandwidth limits. */ Stats::Vector<> commitEligible; }; diff --git a/src/cpu/o3/commit_impl.hh b/src/cpu/o3/commit_impl.hh index 9efe30d24..f8a252b87 100644 --- a/src/cpu/o3/commit_impl.hh +++ b/src/cpu/o3/commit_impl.hh @@ -692,7 +692,7 @@ DefaultCommit<Impl>::commit() while (threads != (*activeThreads).end()) { unsigned tid = *threads++; - +/* if (fromFetch->fetchFault && commitStatus[0] != TrapPending) { // Record the fault. Wait until it's empty in the ROB. // Then handle the trap. Ignore it if there's already a @@ -714,7 +714,7 @@ DefaultCommit<Impl>::commit() commitStatus[0] = Running; } } - +*/ // Not sure which one takes priority. I think if we have // both, that's a bad sign. if (trapSquash[tid] == true) { @@ -926,7 +926,7 @@ DefaultCommit<Impl>::commitInsts() numCommittedDist.sample(num_committed); if (num_committed == commitWidth) { - commitEligible[0]++; + commitEligibleSamples++; } } @@ -948,6 +948,7 @@ DefaultCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num) head_inst->reachedCommit = true; if (head_inst->isNonSpeculative() || + head_inst->isStoreConditional() || head_inst->isMemBarrier() || head_inst->isWriteBarrier()) { diff --git a/src/cpu/o3/cpu.hh b/src/cpu/o3/cpu.hh index 51fcb1adb..c2c5289bf 100644 --- a/src/cpu/o3/cpu.hh +++ b/src/cpu/o3/cpu.hh @@ -72,6 +72,11 @@ class BaseFullCPU : public BaseCPU int cpu_id; }; +/** + * FullO3CPU class, has each of the stages (fetch through commit) + * within it, as well as all of the time buffers between stages. The + * tick() function for the CPU is defined here. 
+ */ template <class Impl> class FullO3CPU : public BaseFullCPU { @@ -202,17 +207,13 @@ class FullO3CPU : public BaseFullCPU */ virtual void syscall(int tid) { panic("Unimplemented!"); } - /** Check if there are any system calls pending. */ - void checkSyscalls(); - - /** Switches out this CPU. - */ + /** Switches out this CPU. */ void switchOut(Sampler *sampler); + /** Signals to this CPU that a stage has completed switching out. */ void signalSwitched(); - /** Takes over from another CPU. - */ + /** Takes over from another CPU. */ void takeOverFrom(BaseCPU *oldCPU); /** Get the current instruction sequence number, and increment it. */ @@ -244,9 +245,7 @@ class FullO3CPU : public BaseFullCPU #endif - // - // New accessors for new decoder. - // + /** Register accessors. Index refers to the physical register index. */ uint64_t readIntReg(int reg_idx); FloatReg readFloatReg(int reg_idx); @@ -275,6 +274,11 @@ class FullO3CPU : public BaseFullCPU uint64_t readArchFloatRegInt(int reg_idx, unsigned tid); + /** Architectural register accessors. Looks up in the commit + * rename table to obtain the true physical index of the + * architected register first, then accesses that physical + * register. + */ void setArchIntReg(int reg_idx, uint64_t val, unsigned tid); void setArchFloatRegSingle(int reg_idx, float val, unsigned tid); @@ -283,13 +287,17 @@ class FullO3CPU : public BaseFullCPU void setArchFloatRegInt(int reg_idx, uint64_t val, unsigned tid); + /** Reads the commit PC of a specific thread. */ uint64_t readPC(unsigned tid); - void setPC(Addr new_PC,unsigned tid); + /** Sets the commit PC of a specific thread. */ + void setPC(Addr new_PC, unsigned tid); + /** Reads the next PC of a specific thread. */ uint64_t readNextPC(unsigned tid); - void setNextPC(uint64_t val,unsigned tid); + /** Sets the next PC of a specific thread. */ + void setNextPC(uint64_t val, unsigned tid); /** Function to add instruction onto the head of the list of the * instructions. 
Used when new instructions are fetched. @@ -313,21 +321,15 @@ class FullO3CPU : public BaseFullCPU /** Remove all instructions younger than the given sequence number. */ void removeInstsUntil(const InstSeqNum &seq_num,unsigned tid); + /** Removes the instruction pointed to by the iterator. */ inline void squashInstIt(const ListIt &instIt, const unsigned &tid); + /** Cleans up all instructions on the remove list. */ void cleanUpRemovedInsts(); - /** Remove all instructions from the list. */ -// void removeAllInsts(); - + /** Debug function to print all instructions on the list. */ void dumpInsts(); - /** Basically a wrapper function so that instructions executed at - * commit can tell the instruction queue that they have - * completed. Eventually this hack should be removed. - */ -// void wakeDependents(DynInstPtr &inst); - public: /** List of all the instructions in flight. */ std::list<DynInstPtr> instList; @@ -338,6 +340,9 @@ class FullO3CPU : public BaseFullCPU std::queue<ListIt> removeList; #ifdef DEBUG + /** Debug structure to keep track of the sequence numbers still in + * flight. + */ std::set<InstSeqNum> snList; #endif @@ -424,14 +429,22 @@ class FullO3CPU : public BaseFullCPU /** The IEW stage's instruction queue. */ TimeBuffer<IEWStruct> iewQueue; - public: + private: + /** The activity recorder; used to tell if the CPU has any + * activity remaining or if it can go to idle and deschedule + * itself. + */ ActivityRecorder activityRec; + public: + /** Records that there was time buffer activity this cycle. */ void activityThisCycle() { activityRec.activity(); } + /** Changes a stage's status to active within the activity recorder. */ void activateStage(const StageIdx idx) { activityRec.activateStage(idx); } + /** Changes a stage's status to inactive within the activity recorder. 
*/ void deactivateStage(const StageIdx idx) { activityRec.deactivateStage(idx); } @@ -442,7 +455,7 @@ class FullO3CPU : public BaseFullCPU int getFreeTid(); public: - /** Temporary function to get pointer to exec context. */ + /** Returns a pointer to a thread's exec context. */ ExecContext *xcBase(unsigned tid) { return thread[tid]->getXCProxy(); @@ -451,6 +464,10 @@ class FullO3CPU : public BaseFullCPU /** The global sequence number counter. */ InstSeqNum globalSeqNum; + /** Pointer to the checker, which can dynamically verify + * instruction results at run time. This can be set to NULL if it + * is not being used. + */ Checker<DynInstPtr> *checker; #if FULL_SYSTEM @@ -466,11 +483,13 @@ class FullO3CPU : public BaseFullCPU /** Pointer to memory. */ MemObject *mem; + /** Pointer to the sampler */ Sampler *sampler; + /** Counter of how many stages have completed switching out. */ int switchCount; - // List of all ExecContexts. + /** Pointers to all of the threads in the CPU. */ std::vector<Thread *> thread; #if 0 diff --git a/src/cpu/o3/cpu_policy.hh b/src/cpu/o3/cpu_policy.hh index 4ea4daee6..32a0adcf1 100644 --- a/src/cpu/o3/cpu_policy.hh +++ b/src/cpu/o3/cpu_policy.hh @@ -50,24 +50,50 @@ #include "cpu/o3/comm.hh" +/** + * Struct that defines the key classes to be used by the CPU. All + * classes use the typedefs defined here to determine what are the + * classes of the other stages and communication buffers. In order to + * change a structure such as the IQ, simply change the typedef here + * to use the desired class instead, and recompile. In order to + * create a different CPU to be used simultaneously with this one, see + * the alpha_impl.hh file for instructions. + */ template<class Impl> struct SimpleCPUPolicy { - typedef TwobitBPredUnit<Impl> BPredUnit; + /** Typedef for the branch prediction unit (which includes the BP, + * RAS, and BTB). + */ + typedef BPredUnit<Impl> BPredUnit; + /** Typedef for the register file. 
Most classes assume a unified + * physical register file. + */ typedef PhysRegFile<Impl> RegFile; + /** Typedef for the freelist of registers. */ typedef SimpleFreeList FreeList; + /** Typedef for the rename map. */ typedef SimpleRenameMap RenameMap; + /** Typedef for the ROB. */ typedef ROB<Impl> ROB; + /** Typedef for the instruction queue/scheduler. */ typedef InstructionQueue<Impl> IQ; + /** Typedef for the memory dependence unit. */ typedef MemDepUnit<StoreSet, Impl> MemDepUnit; + /** Typedef for the LSQ. */ typedef LSQ<Impl> LSQ; + /** Typedef for the thread-specific LSQ units. */ typedef LSQUnit<Impl> LSQUnit; - + /** Typedef for fetch. */ typedef DefaultFetch<Impl> Fetch; + /** Typedef for decode. */ typedef DefaultDecode<Impl> Decode; + /** Typedef for rename. */ typedef DefaultRename<Impl> Rename; + /** Typedef for Issue/Execute/Writeback. */ typedef DefaultIEW<Impl> IEW; + /** Typedef for commit. */ typedef DefaultCommit<Impl> Commit; /** The struct for communication between fetch and decode. */ diff --git a/src/cpu/o3/decode.hh b/src/cpu/o3/decode.hh index 8abe1d480..ff88358d6 100644 --- a/src/cpu/o3/decode.hh +++ b/src/cpu/o3/decode.hh @@ -109,9 +109,12 @@ class DefaultDecode /** Sets pointer to list of active threads. */ void setActiveThreads(std::list<unsigned> *at_ptr); + /** Switches out the decode stage. */ void switchOut(); + /** Takes over from another CPU's thread. */ void takeOverFrom(); + /** Ticks decode, processing all input signals and decoding as many * instructions as possible. */ diff --git a/src/cpu/o3/decode_impl.hh b/src/cpu/o3/decode_impl.hh index b03daff11..64b04bc3d 100644 --- a/src/cpu/o3/decode_impl.hh +++ b/src/cpu/o3/decode_impl.hh @@ -43,6 +43,7 @@ DefaultDecode<Impl>::DefaultDecode(Params *params) { _status = Inactive; + // Setup status, make sure stall signals are clear. 
for (int i = 0; i < numThreads; ++i) { decodeStatus[i] = Idle; @@ -167,6 +168,7 @@ template <class Impl> void DefaultDecode<Impl>::switchOut() { + // Decode can immediately switch out. cpu->signalSwitched(); } @@ -176,6 +178,7 @@ DefaultDecode<Impl>::takeOverFrom() { _status = Inactive; + // Be sure to reset state and clear out any old instructions. for (int i = 0; i < numThreads; ++i) { decodeStatus[i] = Idle; @@ -224,22 +227,22 @@ DefaultDecode<Impl>::block(unsigned tid) { DPRINTF(Decode, "[tid:%u]: Blocking.\n", tid); - // If the decode status is blocked or unblocking then decode has not yet - // signalled fetch to unblock. In that case, there is no need to tell - // fetch to block. - if (decodeStatus[tid] != Blocked && - decodeStatus[tid] != Unblocking) { - toFetch->decodeBlock[tid] = true; - wroteToTimeBuffer = true; - } - // Add the current inputs to the skid buffer so they can be // reprocessed when this stage unblocks. skidInsert(tid); + // If the decode status is blocked or unblocking then decode has not yet + // signalled fetch to unblock. In that case, there is no need to tell + // fetch to block. if (decodeStatus[tid] != Blocked) { // Set the status to Blocked. decodeStatus[tid] = Blocked; + + if (decodeStatus[tid] != Unblocking) { + toFetch->decodeBlock[tid] = true; + wroteToTimeBuffer = true; + } + return true; } @@ -272,13 +275,16 @@ DefaultDecode<Impl>::squash(DynInstPtr &inst, unsigned tid) DPRINTF(Decode, "[tid:%i]: Squashing due to incorrect branch prediction " "detected at decode.\n", tid); + // Send back mispredict information. 
toFetch->decodeInfo[tid].branchMispredict = true; toFetch->decodeInfo[tid].doneSeqNum = inst->seqNum; toFetch->decodeInfo[tid].predIncorrect = true; toFetch->decodeInfo[tid].squash = true; toFetch->decodeInfo[tid].nextPC = inst->readNextPC(); - toFetch->decodeInfo[tid].branchTaken = true; + toFetch->decodeInfo[tid].branchTaken = + inst->readNextPC() != (inst->readPC() + sizeof(TheISA::MachInst)); + // Might have to tell fetch to unblock. if (decodeStatus[tid] == Blocked || decodeStatus[tid] == Unblocking) { toFetch->decodeUnblock[tid] = 1; @@ -294,11 +300,12 @@ DefaultDecode<Impl>::squash(DynInstPtr &inst, unsigned tid) } } + // Clear the instruction list and skid buffer in case they have any + // insts in them. while (!insts[tid].empty()) { insts[tid].pop(); } - // Clear the skid buffer in case it has any data in it. while (!skidBuffer[tid].empty()) { skidBuffer[tid].pop(); } @@ -343,11 +350,12 @@ DefaultDecode<Impl>::squash(unsigned tid) } } + // Clear the instruction list and skid buffer in case they have any + // insts in them. while (!insts[tid].empty()) { insts[tid].pop(); } - // Clear the skid buffer in case it has any data in it. while (!skidBuffer[tid].empty()) { skidBuffer[tid].pop(); } @@ -723,6 +731,7 @@ DefaultDecode<Impl>::decodeInsts(unsigned tid) // Might want to set some sort of boolean and just do // a check at the end squash(inst, inst->threadNumber); + inst->setPredTarg(inst->branchTarget()); break; } diff --git a/src/cpu/o3/dep_graph.hh b/src/cpu/o3/dep_graph.hh index f8ae38da4..b6c5f1ab1 100644 --- a/src/cpu/o3/dep_graph.hh +++ b/src/cpu/o3/dep_graph.hh @@ -4,6 +4,7 @@ #include "cpu/o3/comm.hh" +/** Node in a linked list. */ template <class DynInstPtr> class DependencyEntry { @@ -18,32 +19,50 @@ class DependencyEntry DependencyEntry<DynInstPtr> *next; }; +/** Array of linked list that maintains the dependencies between + * producing instructions and consuming instructions. 
Each linked + * list represents a single physical register, having the future + * producer of the register's value, and all consumers waiting on that + * value on the list. The head node of each linked list represents + * the producing instruction of that register. Instructions are put + * on the list upon reaching the IQ, and are removed from the list + * either when the producer completes, or the instruction is squashed. +*/ template <class DynInstPtr> class DependencyGraph { public: typedef DependencyEntry<DynInstPtr> DepEntry; + /** Default construction. Must call resize() prior to use. */ DependencyGraph() : numEntries(0), memAllocCounter(0), nodesTraversed(0), nodesRemoved(0) { } + /** Resize the dependency graph to have num_entries registers. */ void resize(int num_entries); + /** Clears all of the linked lists. */ void reset(); + /** Inserts an instruction to be dependent on the given index. */ void insert(PhysRegIndex idx, DynInstPtr &new_inst); + /** Sets the producing instruction of a given register. */ void setInst(PhysRegIndex idx, DynInstPtr &new_inst) { dependGraph[idx].inst = new_inst; } + /** Clears the producing instruction. */ void clearInst(PhysRegIndex idx) { dependGraph[idx].inst = NULL; } + /** Removes an instruction from a single linked list. */ void remove(PhysRegIndex idx, DynInstPtr &inst_to_remove); + /** Removes and returns the newest dependent of a specific register. */ DynInstPtr pop(PhysRegIndex idx); + /** Checks if there are any dependents on a specific register. */ bool empty(PhysRegIndex idx) { return !dependGraph[idx].next; } /** Debugging function to dump out the dependency graph. @@ -59,13 +78,16 @@ class DependencyGraph */ DepEntry *dependGraph; + /** Number of linked lists; identical to the number of registers. */ int numEntries; // Debug variable, remove when done testing. unsigned memAllocCounter; public: + // Debug variable, remove when done testing. uint64_t nodesTraversed; + // Debug variable, remove when done testing. 
uint64_t nodesRemoved; }; diff --git a/src/cpu/o3/fetch.hh b/src/cpu/o3/fetch.hh index 3c4fc7d93..23328c534 100644 --- a/src/cpu/o3/fetch.hh +++ b/src/cpu/o3/fetch.hh @@ -46,7 +46,7 @@ class Sampler; * width is specified by the parameters; each cycle it tries to fetch * that many instructions. It supports using a branch predictor to * predict direction and targets. - * It supports the idling functionalitiy of the CPU by indicating to + * It supports the idling functionality of the CPU by indicating to * the CPU when it is active and inactive. */ template <class Impl> @@ -172,14 +172,19 @@ class DefaultFetch /** Processes cache completion event. */ void processCacheCompletion(PacketPtr pkt); + /** Begins the switch out of the fetch stage. */ void switchOut(); + /** Completes the switch out of the fetch stage. */ void doSwitchOut(); + /** Takes over from another CPU's thread. */ void takeOverFrom(); + /** Checks if the fetch stage is switched out. */ bool isSwitchedOut() { return switchedOut; } + /** Tells fetch to wake up from a quiesce instruction. */ void wakeFromQuiesce(); private: @@ -312,8 +317,10 @@ class DefaultFetch /** BPredUnit. */ BPredUnit branchPred; + /** Per-thread fetch PC. */ Addr PC[Impl::MaxThreads]; + /** Per-thread next PC. */ Addr nextPC[Impl::MaxThreads]; /** Memory packet used to access cache. */ @@ -380,8 +387,12 @@ class DefaultFetch /** Thread ID being fetched. */ int threadFetched; + /** Checks if there is an interrupt pending. If there is, fetch + * must stop once it is not fetching PAL instructions. + */ bool interruptPending; + /** Records if fetch is switched out. */ bool switchedOut; #if !FULL_SYSTEM @@ -405,17 +416,23 @@ class DefaultFetch * the pipeline. */ Stats::Scalar<> fetchIdleCycles; + /** Total number of cycles spent blocked. */ Stats::Scalar<> fetchBlockedCycles; - + /** Total number of cycles spent in any other state. */ Stats::Scalar<> fetchMiscStallCycles; /** Stat for total number of fetched cache lines. 
*/ Stats::Scalar<> fetchedCacheLines; - + /** Total number of outstanding icache accesses that were dropped + * due to a squash. + */ Stats::Scalar<> fetchIcacheSquashes; /** Distribution of number of instructions fetched each cycle. */ Stats::Distribution<> fetchNisnDist; + /** Rate of how often fetch was idle. */ Stats::Formula idleRate; + /** Number of branch fetches per cycle. */ Stats::Formula branchRate; + /** Number of instruction fetched per cycle. */ Stats::Formula fetchRate; }; diff --git a/src/cpu/o3/fetch_impl.hh b/src/cpu/o3/fetch_impl.hh index 5d3164dbf..69c43a6a2 100644 --- a/src/cpu/o3/fetch_impl.hh +++ b/src/cpu/o3/fetch_impl.hh @@ -188,59 +188,59 @@ void DefaultFetch<Impl>::regStats() { icacheStallCycles - .name(name() + ".FETCH:icacheStallCycles") + .name(name() + ".icacheStallCycles") .desc("Number of cycles fetch is stalled on an Icache miss") .prereq(icacheStallCycles); fetchedInsts - .name(name() + ".FETCH:Insts") + .name(name() + ".Insts") .desc("Number of instructions fetch has processed") .prereq(fetchedInsts); fetchedBranches - .name(name() + ".FETCH:Branches") + .name(name() + ".Branches") .desc("Number of branches that fetch encountered") .prereq(fetchedBranches); predictedBranches - .name(name() + ".FETCH:predictedBranches") + .name(name() + ".predictedBranches") .desc("Number of branches that fetch has predicted taken") .prereq(predictedBranches); fetchCycles - .name(name() + ".FETCH:Cycles") + .name(name() + ".Cycles") .desc("Number of cycles fetch has run and was not squashing or" " blocked") .prereq(fetchCycles); fetchSquashCycles - .name(name() + ".FETCH:SquashCycles") + .name(name() + ".SquashCycles") .desc("Number of cycles fetch has spent squashing") .prereq(fetchSquashCycles); fetchIdleCycles - .name(name() + ".FETCH:IdleCycles") + .name(name() + ".IdleCycles") .desc("Number of cycles fetch was idle") .prereq(fetchIdleCycles); fetchBlockedCycles - .name(name() + ".FETCH:BlockedCycles") + .name(name() + ".BlockedCycles") 
.desc("Number of cycles fetch has spent blocked") .prereq(fetchBlockedCycles); fetchedCacheLines - .name(name() + ".FETCH:CacheLines") + .name(name() + ".CacheLines") .desc("Number of cache lines fetched") .prereq(fetchedCacheLines); fetchMiscStallCycles - .name(name() + ".FETCH:MiscStallCycles") + .name(name() + ".MiscStallCycles") .desc("Number of cycles fetch has spent waiting on interrupts, or " "bad addresses, or out of MSHRs") .prereq(fetchMiscStallCycles); fetchIcacheSquashes - .name(name() + ".FETCH:IcacheSquashes") + .name(name() + ".IcacheSquashes") .desc("Number of outstanding Icache misses that were squashed") .prereq(fetchIcacheSquashes); @@ -248,24 +248,24 @@ DefaultFetch<Impl>::regStats() .init(/* base value */ 0, /* last value */ fetchWidth, /* bucket size */ 1) - .name(name() + ".FETCH:rateDist") + .name(name() + ".rateDist") .desc("Number of instructions fetched each cycle (Total)") .flags(Stats::pdf); idleRate - .name(name() + ".FETCH:idleRate") + .name(name() + ".idleRate") .desc("Percent of cycles fetch was idle") .prereq(idleRate); idleRate = fetchIdleCycles * 100 / cpu->numCycles; branchRate - .name(name() + ".FETCH:branchRate") + .name(name() + ".branchRate") .desc("Number of branch fetches per cycle") .flags(Stats::total); - branchRate = predictedBranches / cpu->numCycles; + branchRate = fetchedBranches / cpu->numCycles; fetchRate - .name(name() + ".FETCH:rate") + .name(name() + ".rate") .desc("Number of inst fetches per cycle") .flags(Stats::total); fetchRate = fetchedInsts / cpu->numCycles; @@ -337,6 +337,7 @@ template<class Impl> void DefaultFetch<Impl>::initStage() { + // Setup PC and nextPC with initial state. for (int tid = 0; tid < numThreads; tid++) { PC[tid] = cpu->readPC(tid); nextPC[tid] = cpu->readNextPC(tid); @@ -353,8 +354,6 @@ DefaultFetch<Impl>::processCacheCompletion(PacketPtr pkt) // Only change the status if it's still waiting on the icache access // to return. 
- // Can keep track of how many cache accesses go unused due to - // misspeculation here. if (fetchStatus[tid] != IcacheWaitResponse || pkt != memPkt[tid] || isSwitchedOut()) { @@ -391,6 +390,7 @@ template <class Impl> void DefaultFetch<Impl>::switchOut() { + // Fetch is ready to switch out at any time. switchedOut = true; cpu->signalSwitched(); } @@ -399,6 +399,7 @@ template <class Impl> void DefaultFetch<Impl>::doSwitchOut() { + // Branch predictor needs to have its state cleared. branchPred.switchOut(); } @@ -429,6 +430,7 @@ DefaultFetch<Impl>::wakeFromQuiesce() { DPRINTF(Fetch, "Waking up from quiesce\n"); // Hopefully this is safe + // @todo: Allow other threads to wake from quiesce. fetchStatus[0] = Running; } @@ -1213,7 +1215,7 @@ DefaultFetch<Impl>::lsqCount() if (fetchStatus[high_pri] == Running || fetchStatus[high_pri] == IcacheAccessComplete || - fetchStatus[high_pri] == Idle) + fetchStatus[high_pri] == Idle) return high_pri; else PQ.pop(); diff --git a/src/cpu/o3/fu_pool.cc b/src/cpu/o3/fu_pool.cc index fb2b5c00d..b28b5d37f 100644 --- a/src/cpu/o3/fu_pool.cc +++ b/src/cpu/o3/fu_pool.cc @@ -183,6 +183,8 @@ FUPool::getUnit(OpClass capability) } } + assert(fu_idx < numFU); + unitBusy[fu_idx] = true; return fu_idx; diff --git a/src/cpu/o3/fu_pool.hh b/src/cpu/o3/fu_pool.hh index f590c4149..1d4c76690 100644 --- a/src/cpu/o3/fu_pool.hh +++ b/src/cpu/o3/fu_pool.hh @@ -155,7 +155,10 @@ class FUPool : public SimObject return maxIssueLatencies[capability]; } + /** Switches out functional unit pool. */ void switchOut(); + + /** Takes over from another CPU's thread. */ void takeOverFrom(); }; diff --git a/src/cpu/o3/iew.hh b/src/cpu/o3/iew.hh index ae86536c9..7e79d5311 100644 --- a/src/cpu/o3/iew.hh +++ b/src/cpu/o3/iew.hh @@ -143,12 +143,16 @@ class DefaultIEW /** Sets pointer to the scoreboard. */ void setScoreboard(Scoreboard *sb_ptr); + /** Starts switch out of IEW stage. */ void switchOut(); + /** Completes switch out of IEW stage. 
*/ void doSwitchOut(); + /** Takes over from another CPU's thread. */ void takeOverFrom(); + /** Returns if IEW is switched out. */ bool isSwitchedOut() { return switchedOut; } /** Sets page table pointer within LSQ. */ @@ -270,6 +274,7 @@ class DefaultIEW void tick(); private: + /** Updates execution stats based on the instruction. */ void updateExeInstStats(DynInstPtr &inst); /** Pointer to main time buffer used for backwards communication. */ @@ -412,6 +417,7 @@ class DefaultIEW /** Maximum size of the skid buffer. */ unsigned skidBufferMax; + /** Is this stage switched out. */ bool switchedOut; /** Stat for total number of idle cycles. */ @@ -453,9 +459,13 @@ class DefaultIEW /** Stat for total number of mispredicted branches detected at execute. */ Stats::Formula branchMispredicts; + /** Number of executed software prefetches. */ Stats::Vector<> exeSwp; + /** Number of executed nops. */ Stats::Vector<> exeNop; + /** Number of executed memory references. */ Stats::Vector<> exeRefs; + /** Number of executed branches. */ Stats::Vector<> exeBranches; // Stats::Vector<> issued_ops; @@ -465,19 +475,30 @@ class DefaultIEW Stats::Vector<> dist_unissued; Stats::Vector2d<> stat_issued_inst_type; */ + /** Number of instructions issued per cycle. */ Stats::Formula issueRate; + /** Number of executed store instructions. */ Stats::Formula iewExecStoreInsts; // Stats::Formula issue_op_rate; // Stats::Formula fu_busy_rate; - + /** Number of instructions sent to commit. */ Stats::Vector<> iewInstsToCommit; + /** Number of instructions that writeback. */ Stats::Vector<> writebackCount; + /** Number of instructions that wake consumers. */ Stats::Vector<> producerInst; + /** Number of instructions that wake up from producers. */ Stats::Vector<> consumerInst; + /** Number of instructions that were delayed in writing back due + * to resource contention. + */ Stats::Vector<> wbPenalized; + /** Number of instructions per cycle written back. 
*/ Stats::Formula wbRate; + /** Average number of woken instructions per writeback. */ Stats::Formula wbFanout; + /** Number of instructions per cycle delayed in writing back. */ Stats::Formula wbPenalizedRate; }; diff --git a/src/cpu/o3/iew_impl.hh b/src/cpu/o3/iew_impl.hh index 6b61b1b0e..23f101517 100644 --- a/src/cpu/o3/iew_impl.hh +++ b/src/cpu/o3/iew_impl.hh @@ -383,6 +383,7 @@ template <class Impl> void DefaultIEW<Impl>::switchOut() { + // IEW is ready to switch out at any time. cpu->signalSwitched(); } @@ -390,6 +391,7 @@ template <class Impl> void DefaultIEW<Impl>::doSwitchOut() { + // Clear any state. switchedOut = true; instQueue.switchOut(); @@ -408,6 +410,7 @@ template <class Impl> void DefaultIEW<Impl>::takeOverFrom() { + // Reset all state. _status = Active; exeStatus = Running; wbStatus = Idle; @@ -521,6 +524,7 @@ DefaultIEW<Impl>::squashDueToMemBlocked(DynInstPtr &inst, unsigned tid) toCommit->squashedSeqNum[tid] = inst->seqNum; toCommit->nextPC[tid] = inst->readPC(); + // Must include the broadcasted SN in the squash. toCommit->includeSquashInst[tid] = true; ldstQueue.setLoadBlockedHandled(tid); @@ -1054,6 +1058,7 @@ DefaultIEW<Impl>::dispatchInsts(unsigned tid) // Store conditionals need to be set as "canCommit()" // so that commit can process them when they reach the // head of commit. + // @todo: This is somewhat specific to Alpha. inst->setCanCommit(); instQueue.insertNonSpec(inst); add_to_iq = false; @@ -1313,6 +1318,7 @@ DefaultIEW<Impl>::executeInsts() } } + // Update and record activity if we processed any instructions. 
if (inst_num) { if (exeStatus == Idle) { exeStatus = Running; @@ -1363,8 +1369,10 @@ DefaultIEW<Impl>::writebackInsts() scoreboard->setReg(inst->renamedDestRegIdx(i)); } - producerInst[tid]++; - consumerInst[tid]+= dependents; + if (dependents) { + producerInst[tid]++; + consumerInst[tid]+= dependents; + } writebackCount[tid]++; } } @@ -1435,6 +1443,7 @@ DefaultIEW<Impl>::tick() DPRINTF(IEW,"Processing [tid:%i]\n",tid); + // Update structures based on instructions committed. if (fromCommit->commitInfo[tid].doneSeqNum != 0 && !fromCommit->commitInfo[tid].squash && !fromCommit->commitInfo[tid].robSquashing) { diff --git a/src/cpu/o3/inst_queue.hh b/src/cpu/o3/inst_queue.hh index 245601ccf..60a713020 100644 --- a/src/cpu/o3/inst_queue.hh +++ b/src/cpu/o3/inst_queue.hh @@ -94,6 +94,9 @@ class InstructionQueue /** Pointer back to the instruction queue. */ InstructionQueue<Impl> *iqPtr; + /** Should the FU be added to the list to be freed upon + * completing this event. + */ bool freeFU; public: @@ -118,6 +121,7 @@ class InstructionQueue /** Registers statistics. */ void regStats(); + /** Resets all instruction queue state. */ void resetState(); /** Sets CPU pointer. */ @@ -135,10 +139,13 @@ class InstructionQueue /** Sets the global time buffer. */ void setTimeBuffer(TimeBuffer<TimeStruct> *tb_ptr); + /** Switches out the instruction queue. */ void switchOut(); + /** Takes over execution from another CPU's thread. */ void takeOverFrom(); + /** Returns if the IQ is switched out. */ bool isSwitchedOut() { return switchedOut; } /** Number of entries needed for given amount of threads. */ @@ -173,6 +180,9 @@ class InstructionQueue */ void insertBarrier(DynInstPtr &barr_inst); + /** Returns the oldest scheduled instruction, and removes it from + * the list of instructions waiting to execute. + */ DynInstPtr getInstToExecute(); /** @@ -276,13 +286,15 @@ class InstructionQueue /** List of all the instructions in the IQ (some of which may be issued). 
*/ std::list<DynInstPtr> instList[Impl::MaxThreads]; + /** List of instructions that are ready to be executed. */ std::list<DynInstPtr> instsToExecute; /** - * Struct for comparing entries to be added to the priority queue. This - * gives reverse ordering to the instructions in terms of sequence - * numbers: the instructions with smaller sequence numbers (and hence - * are older) will be at the top of the priority queue. + * Struct for comparing entries to be added to the priority queue. + * This gives reverse ordering to the instructions in terms of + * sequence numbers: the instructions with smaller sequence + * numbers (and hence are older) will be at the top of the + * priority queue. */ struct pqCompare { bool operator() (const DynInstPtr &lhs, const DynInstPtr &rhs) const @@ -395,6 +407,7 @@ class InstructionQueue */ unsigned commitToIEWDelay; + /** Is the IQ switched out. */ bool switchedOut; /** The sequence number of the squashed instruction. */ @@ -462,19 +475,28 @@ class InstructionQueue */ Stats::Scalar<> iqSquashedNonSpecRemoved; + /** Distribution of number of instructions in the queue. */ Stats::VectorDistribution<> queueResDist; + /** Distribution of the number of instructions issued. */ Stats::Distribution<> numIssuedDist; + /** Distribution of the cycles it takes to issue an instruction. */ Stats::VectorDistribution<> issueDelayDist; + /** Number of times an instruction could not be issued because a + * FU was busy. + */ Stats::Vector<> statFuBusy; // Stats::Vector<> dist_unissued; + /** Stat for total number issued for each instruction type. */ Stats::Vector2d<> statIssuedInstType; + /** Number of instructions issued per cycle. */ Stats::Formula issueRate; // Stats::Formula issue_stores; // Stats::Formula issue_op_rate; - Stats::Vector<> fuBusy; //cumulative fu busy - + /** Number of times the FU was busy. */ + Stats::Vector<> fuBusy; + /** Number of times the FU was busy per instruction issued. 
*/ Stats::Formula fuBusyRate; }; diff --git a/src/cpu/o3/inst_queue_impl.hh b/src/cpu/o3/inst_queue_impl.hh index 34af8c641..2f03c6814 100644 --- a/src/cpu/o3/inst_queue_impl.hh +++ b/src/cpu/o3/inst_queue_impl.hh @@ -152,8 +152,10 @@ template <class Impl> InstructionQueue<Impl>::~InstructionQueue() { dependGraph.reset(); +#ifdef DEBUG cprintf("Nodes traversed: %i, removed: %i\n", dependGraph.nodesTraversed, dependGraph.nodesRemoved); +#endif } template <class Impl> @@ -670,14 +672,8 @@ InstructionQueue<Impl>::processFUCompletion(DynInstPtr &inst, int fu_idx) // @todo: Ensure that these FU Completions happen at the beginning // of a cycle, otherwise they could add too many instructions to // the queue. - // @todo: This could break if there's multiple multi-cycle ops - // finishing on this cycle. Maybe implement something like - // instToCommit in iew_impl.hh. issueToExecuteQueue->access(0)->size++; instsToExecute.push_back(inst); -// int &size = issueToExecuteQueue->access(0)->size; - -// issueToExecuteQueue->access(0)->insts[size++] = inst; } // @todo: Figure out a better way to remove the squashed items from the @@ -743,9 +739,10 @@ InstructionQueue<Impl>::scheduleReadyInsts() } } + // If we have an instruction that doesn't require a FU, or a + // valid FU, then schedule for execution. if (idx == -2 || idx != -1) { if (op_latency == 1) { -// i2e_info->insts[exec_queue_slot++] = issuing_inst; i2e_info->size++; instsToExecute.push_back(issuing_inst); @@ -763,14 +760,10 @@ InstructionQueue<Impl>::scheduleReadyInsts() // @todo: Enforce that issue_latency == 1 or op_latency if (issue_latency > 1) { + // If FU isn't pipelined, then it must be freed + // upon the execution completing. execution->setFreeFU(); } else { - // @todo: Not sure I'm accounting for the - // multi-cycle op in a pipelined FU properly, or - // the number of instructions issued in one cycle. 
-// i2e_info->insts[exec_queue_slot++] = issuing_inst; -// i2e_info->size++; - // Add the FU onto the list of FU's to be freed next cycle. fuPool->freeUnitNextCycle(idx); } @@ -815,6 +808,7 @@ InstructionQueue<Impl>::scheduleReadyInsts() numIssuedDist.sample(total_issued); iqInstsIssued+= total_issued; + // If we issued any instructions, tell the CPU we had activity. if (total_issued) { cpu->activityThisCycle(); } else { @@ -1365,4 +1359,45 @@ InstructionQueue<Impl>::dumpInsts() ++num; } } + + cprintf("Insts to Execute list:\n"); + + int num = 0; + int valid_num = 0; + ListIt inst_list_it = instsToExecute.begin(); + + while (inst_list_it != instsToExecute.end()) + { + cprintf("Instruction:%i\n", + num); + if (!(*inst_list_it)->isSquashed()) { + if (!(*inst_list_it)->isIssued()) { + ++valid_num; + cprintf("Count:%i\n", valid_num); + } else if ((*inst_list_it)->isMemRef() && + !(*inst_list_it)->memOpDone) { + // Loads that have not been marked as executed + // still count towards the total instructions. + ++valid_num; + cprintf("Count:%i\n", valid_num); + } + } + + cprintf("PC:%#x\n[sn:%lli]\n[tid:%i]\n" + "Issued:%i\nSquashed:%i\n", + (*inst_list_it)->readPC(), + (*inst_list_it)->seqNum, + (*inst_list_it)->threadNumber, + (*inst_list_it)->isIssued(), + (*inst_list_it)->isSquashed()); + + if ((*inst_list_it)->isMemRef()) { + cprintf("MemOpDone:%i\n", (*inst_list_it)->memOpDone); + } + + cprintf("\n"); + + inst_list_it++; + ++num; + } } diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh index 51eb23cd7..d65510c30 100644 --- a/src/cpu/o3/lsq.hh +++ b/src/cpu/o3/lsq.hh @@ -49,6 +49,7 @@ class LSQ { typedef typename Impl::CPUPol::IEW IEW; typedef typename Impl::CPUPol::LSQUnit LSQUnit; + /** SMT policy. */ enum LSQPolicy { Dynamic, Partitioned, @@ -69,8 +70,9 @@ class LSQ { void setIEW(IEW *iew_ptr); /** Sets the page table pointer. */ // void setPageTable(PageTable *pt_ptr); - + /** Switches out the LSQ. 
*/ void switchOut(); + /** Takes over execution from another CPU's thread. */ void takeOverFrom(); /** Number of entries needed for the given amount of threads.*/ @@ -95,9 +97,6 @@ class LSQ { /** Executes a load. */ Fault executeLoad(DynInstPtr &inst); - Fault executeLoad(int lq_idx, unsigned tid) - { return thread[tid].executeLoad(lq_idx); } - /** Executes a store. */ Fault executeStore(DynInstPtr &inst); diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh index b339cea2c..393d8947d 100644 --- a/src/cpu/o3/lsq_unit.hh +++ b/src/cpu/o3/lsq_unit.hh @@ -87,10 +87,13 @@ class LSQUnit { /** Sets the page table pointer. */ // void setPageTable(PageTable *pt_ptr); + /** Switches out LSQ unit. */ void switchOut(); + /** Takes over from another CPU's thread. */ void takeOverFrom(); + /** Returns if the LSQ is switched out. */ bool isSwitchedOut() { return switchedOut; } /** Ticks the LSQ unit, which in this case only resets the number of @@ -159,12 +162,15 @@ class LSQUnit { bool loadBlocked() { return isLoadBlocked; } + /** Clears the signal that a load became blocked. */ void clearLoadBlocked() { isLoadBlocked = false; } + /** Returns if the blocked load was handled. */ bool isLoadBlockedHandled() { return loadBlockedHandled; } + /** Records the blocked load as being handled. */ void setLoadBlockedHandled() { loadBlockedHandled = true; } @@ -339,6 +345,7 @@ class LSQUnit { /** The number of used cache ports in this cycle. */ int usedPorts; + /** Is the LSQ switched out. */ bool switchedOut; //list<InstSeqNum> mshrSeqNums; @@ -358,8 +365,10 @@ class LSQUnit { /** Whether or not a load is blocked due to the memory system. */ bool isLoadBlocked; + /** Has the blocked load been handled. */ bool loadBlockedHandled; + /** The sequence number of the blocked load. */ InstSeqNum blockedLoadSeqNum; /** The oldest load that caused a memory ordering violation. 
*/ diff --git a/src/cpu/o3/lsq_unit_impl.hh b/src/cpu/o3/lsq_unit_impl.hh index 3f6af3d2c..1ad561dc0 100644 --- a/src/cpu/o3/lsq_unit_impl.hh +++ b/src/cpu/o3/lsq_unit_impl.hh @@ -78,8 +78,12 @@ LSQUnit<Impl>::completeStoreDataAccess(DynInstPtr &inst) //lsqPtr->removeMSHR(lsqPtr->storeQueue[storeIdx].inst->seqNum); - if (lsqPtr->isSwitchedOut()) + if (lsqPtr->isSwitchedOut()) { + if (wbEvent) + delete wbEvent; + return; + } lsqPtr->cpu->wakeCPU(); @@ -500,7 +504,6 @@ LSQUnit<Impl>::commitLoad() DPRINTF(LSQUnit, "Committing head load instruction, PC %#x\n", loadQueue[loadHead]->readPC()); - loadQueue[loadHead] = NULL; incrLdIdx(loadHead); diff --git a/src/cpu/o3/mem_dep_unit.hh b/src/cpu/o3/mem_dep_unit.hh index 4d763fca2..e399f0133 100644 --- a/src/cpu/o3/mem_dep_unit.hh +++ b/src/cpu/o3/mem_dep_unit.hh @@ -86,8 +86,10 @@ class MemDepUnit { /** Registers statistics. */ void regStats(); + /** Switches out the memory dependence predictor. */ void switchOut(); + /** Takes over from another CPU's thread. */ void takeOverFrom(); /** Sets the pointer to the IQ. */ @@ -157,10 +159,12 @@ class MemDepUnit { : inst(new_inst), regsReady(false), memDepReady(false), completed(false), squashed(false) { +#ifdef DEBUG ++memdep_count; DPRINTF(MemDepUnit, "Memory dependency entry created. " "memdep_count=%i\n", memdep_count); +#endif } /** Frees any pointers. */ @@ -169,11 +173,12 @@ class MemDepUnit { for (int i = 0; i < dependInsts.size(); ++i) { dependInsts[i] = NULL; } - +#ifdef DEBUG --memdep_count; DPRINTF(MemDepUnit, "Memory dependency entry deleted. " "memdep_count=%i\n", memdep_count); +#endif } /** Returns the name of the memory dependence entry. */ @@ -198,9 +203,11 @@ class MemDepUnit { bool squashed; /** For debugging. */ +#ifdef DEBUG static int memdep_count; static int memdep_insert; static int memdep_erase; +#endif }; /** Finds the memory dependence entry in the hash map. 
*/ @@ -229,9 +236,13 @@ class MemDepUnit { */ MemDepPred depPred; + /** Is there an outstanding load barrier that loads must wait on. */ bool loadBarrier; + /** The sequence number of the load barrier. */ InstSeqNum loadBarrierSN; + /** Is there an outstanding store barrier that stores must wait on. */ bool storeBarrier; + /** The sequence number of the store barrier. */ InstSeqNum storeBarrierSN; /** Pointer to the IQ. */ diff --git a/src/cpu/o3/mem_dep_unit_impl.hh b/src/cpu/o3/mem_dep_unit_impl.hh index a4fed4b0d..50ad1e2c8 100644 --- a/src/cpu/o3/mem_dep_unit_impl.hh +++ b/src/cpu/o3/mem_dep_unit_impl.hh @@ -107,6 +107,7 @@ template <class MemDepPred, class Impl> void MemDepUnit<MemDepPred, Impl>::switchOut() { + // Clear any state. for (int i = 0; i < Impl::MaxThreads; ++i) { instList[i].clear(); } @@ -118,6 +119,7 @@ template <class MemDepPred, class Impl> void MemDepUnit<MemDepPred, Impl>::takeOverFrom() { + // Be sure to reset all state. loadBarrier = storeBarrier = false; loadBarrierSN = storeBarrierSN = 0; depPred.clear(); @@ -148,7 +150,7 @@ MemDepUnit<MemDepPred, Impl>::insert(DynInstPtr &inst) inst_entry->listIt = --(instList[tid].end()); // Check any barriers and the dependence predictor for any - // producing stores. + // producing memrefs/stores. InstSeqNum producing_store; if (inst->isLoad() && loadBarrier) { producing_store = loadBarrierSN; @@ -255,6 +257,7 @@ void MemDepUnit<MemDepPred, Impl>::insertBarrier(DynInstPtr &barr_inst) { InstSeqNum barr_sn = barr_inst->seqNum; + // Memory barriers block loads and stores, write barriers only stores. if (barr_inst->isMemBarrier()) { loadBarrier = true; loadBarrierSN = barr_sn; @@ -332,6 +335,7 @@ MemDepUnit<MemDepPred, Impl>::replay(DynInstPtr &inst) DynInstPtr temp_inst; bool found_inst = false; + // For now this replay function replays all waiting memory ops. 
while (!instsToReplay.empty()) { temp_inst = instsToReplay.front(); diff --git a/src/cpu/o3/rename.hh b/src/cpu/o3/rename.hh index 626d7cc75..42fdf6bf5 100644 --- a/src/cpu/o3/rename.hh +++ b/src/cpu/o3/rename.hh @@ -157,10 +157,13 @@ class DefaultRename /** Sets pointer to the scoreboard. */ void setScoreboard(Scoreboard *_scoreboard); + /** Switches out the rename stage. */ void switchOut(); + /** Completes the switch out. */ void doSwitchOut(); + /** Takes over from another CPU's thread. */ void takeOverFrom(); /** Squashes all instructions in a thread. */ @@ -245,8 +248,10 @@ class DefaultRename /** Checks if any stages are telling rename to block. */ bool checkStall(unsigned tid); + /** Gets the number of free entries for a specific thread. */ void readFreeEntries(unsigned tid); + /** Checks the signals and updates the status. */ bool checkSignalsAndUpdate(unsigned tid); /** Either serializes on the next instruction available in the InstQueue, @@ -456,8 +461,11 @@ class DefaultRename Stats::Scalar<> renameCommittedMaps; /** Stat for total number of mappings that were undone due to a squash. */ Stats::Scalar<> renameUndoneMaps; + /** Number of serialize instructions handled. */ Stats::Scalar<> renamedSerializing; + /** Number of instructions marked as temporarily serializing. */ Stats::Scalar<> renamedTempSerializing; + /** Number of instructions inserted into skid buffers. */ Stats::Scalar<> renameSkidInsts; }; diff --git a/src/cpu/o3/rename_impl.hh b/src/cpu/o3/rename_impl.hh index 9cbe1a770..8e70c90f7 100644 --- a/src/cpu/o3/rename_impl.hh +++ b/src/cpu/o3/rename_impl.hh @@ -260,6 +260,7 @@ template <class Impl> void DefaultRename<Impl>::switchOut() { + // Rename is ready to switch out at any time. cpu->signalSwitched(); } @@ -267,6 +268,7 @@ template <class Impl> void DefaultRename<Impl>::doSwitchOut() { + // Clear any state, fix up the rename map. 
for (int i = 0; i < numThreads; i++) { typename list<RenameHistory>::iterator hb_it = historyBuffer[i].begin(); diff --git a/src/cpu/o3/rename_map.hh b/src/cpu/o3/rename_map.hh index 0edb80684..c4c90c99a 100644 --- a/src/cpu/o3/rename_map.hh +++ b/src/cpu/o3/rename_map.hh @@ -64,12 +64,13 @@ class SimpleRenameMap typedef std::pair<PhysRegIndex, PhysRegIndex> RenameInfo; public: - //Constructor - SimpleRenameMap() {}; + /** Default constructor. init() must be called prior to use. */ + SimpleRenameMap() {}; /** Destructor. */ ~SimpleRenameMap(); + /** Initializes rename map with given parameters. */ void init(unsigned _numLogicalIntRegs, unsigned _numPhysicalIntRegs, PhysRegIndex &_int_reg_start, @@ -86,6 +87,7 @@ class SimpleRenameMap int id, bool bindRegs); + /** Sets the free list used with this rename map. */ void setFreeList(SimpleFreeList *fl_ptr); //Tell rename map to get a free physical register for a given @@ -151,7 +153,6 @@ class SimpleRenameMap { } }; - //Change this to private private: /** Integer rename map. */ std::vector<RenameEntry> intRenameMap; diff --git a/src/cpu/o3/rob.hh b/src/cpu/o3/rob.hh index 3786a0355..6d1402531 100644 --- a/src/cpu/o3/rob.hh +++ b/src/cpu/o3/rob.hh @@ -97,8 +97,10 @@ class ROB */ void setActiveThreads(std::list<unsigned>* at_ptr); + /** Switches out the ROB. */ void switchOut(); + /** Takes over another CPU's thread. */ void takeOverFrom(); /** Function to insert an instruction into the ROB. Note that whatever @@ -300,6 +302,7 @@ class ROB /** Number of instructions in the ROB. */ int numInstsInROB; + /** Dummy instruction returned if there are no insts left. 
*/ DynInstPtr dummyInst; private: diff --git a/src/cpu/o3/store_set.cc b/src/cpu/o3/store_set.cc index 720d5da53..0023cee36 100644 --- a/src/cpu/o3/store_set.cc +++ b/src/cpu/o3/store_set.cc @@ -28,6 +28,7 @@ * Authors: Kevin Lim */ +#include "base/intmath.hh" #include "base/trace.hh" #include "cpu/o3/store_set.hh" @@ -38,6 +39,10 @@ StoreSet::StoreSet(int _SSIT_size, int _LFST_size) DPRINTF(StoreSet, "StoreSet: SSIT size: %i, LFST size: %i.\n", SSITSize, LFSTSize); + if (!isPowerOf2(SSITSize)) { + fatal("Invalid SSIT size!\n"); + } + SSIT.resize(SSITSize); validSSIT.resize(SSITSize); @@ -45,6 +50,10 @@ StoreSet::StoreSet(int _SSIT_size, int _LFST_size) for (int i = 0; i < SSITSize; ++i) validSSIT[i] = false; + if (!isPowerOf2(LFSTSize)) { + fatal("Invalid LFST size!\n"); + } + LFST.resize(LFSTSize); validLFST.resize(LFSTSize); @@ -320,3 +329,19 @@ StoreSet::clear() storeList.clear(); } + +void +StoreSet::dump() +{ + cprintf("storeList.size(): %i\n", storeList.size()); + SeqNumMapIt store_list_it = storeList.begin(); + + int num = 0; + + while (store_list_it != storeList.end()) { + cprintf("%i: [sn:%lli] SSID:%i\n", + num, (*store_list_it).first, (*store_list_it).second); + num++; + store_list_it++; + } +} diff --git a/src/cpu/o3/store_set.hh b/src/cpu/o3/store_set.hh index 64255c51a..f5a44a1ac 100644 --- a/src/cpu/o3/store_set.hh +++ b/src/cpu/o3/store_set.hh @@ -46,58 +46,98 @@ struct ltseqnum { } }; +/** + * Implements a store set predictor for determining if memory + * instructions are dependent upon each other. See paper "Memory + * Dependence Prediction using Store Sets" by Chrysos and Emer. SSID + * stands for Store Set ID, SSIT stands for Store Set ID Table, and + * LFST is Last Fetched Store Table. + */ class StoreSet { public: typedef unsigned SSID; public: + /** Default constructor. init() must be called prior to use. */ StoreSet() { }; + /** Creates store set predictor with given table sizes. 
*/ StoreSet(int SSIT_size, int LFST_size); + /** Default destructor. */ ~StoreSet(); + /** Initializes the store set predictor with the given table sizes. */ void init(int SSIT_size, int LFST_size); + /** Records a memory ordering violation between the younger load + * and the older store. */ void violation(Addr store_PC, Addr load_PC); + /** Inserts a load into the store set predictor. This does nothing but + * is included in case other predictors require a similar function. + */ void insertLoad(Addr load_PC, InstSeqNum load_seq_num); + /** Inserts a store into the store set predictor. Updates the + * LFST if the store has a valid SSID. */ void insertStore(Addr store_PC, InstSeqNum store_seq_num, unsigned tid); + /** Checks if the instruction with the given PC is dependent upon + * any store. @return Returns the sequence number of the store + * instruction this PC is dependent upon. Returns 0 if none. + */ InstSeqNum checkInst(Addr PC); + /** Records this PC/sequence number as issued. */ void issued(Addr issued_PC, InstSeqNum issued_seq_num, bool is_store); + /** Squashes for a specific thread until the given sequence number. */ void squash(InstSeqNum squashed_num, unsigned tid); + /** Resets all tables. */ void clear(); + /** Debug function to dump the contents of the store list. */ + void dump(); + private: + /** Calculates the index into the SSIT based on the PC. */ inline int calcIndex(Addr PC) { return (PC >> offsetBits) & indexMask; } + /** Calculates a Store Set ID based on the PC. */ inline SSID calcSSID(Addr PC) { return ((PC ^ (PC >> 10)) % LFSTSize); } + /** The Store Set ID Table. */ std::vector<SSID> SSIT; + /** Bit vector to tell if the SSIT has a valid entry. */ std::vector<bool> validSSIT; + /** Last Fetched Store Table. */ std::vector<InstSeqNum> LFST; + /** Bit vector to tell if the LFST has a valid entry. */ std::vector<bool> validLFST; + /** Map of stores that have been inserted into the store set, but + * not yet issued or squashed. 
+ */ std::map<InstSeqNum, int, ltseqnum> storeList; typedef std::map<InstSeqNum, int, ltseqnum>::iterator SeqNumMapIt; + /** Store Set ID Table size, in entries. */ int SSITSize; + /** Last Fetched Store Table size, in entries. */ int LFSTSize; + /** Mask to obtain the index. */ int indexMask; // HACK: Hardcoded for now. diff --git a/src/cpu/o3/thread_state.hh b/src/cpu/o3/thread_state.hh index 9101eafb9..dfb1530d0 100644 --- a/src/cpu/o3/thread_state.hh +++ b/src/cpu/o3/thread_state.hh @@ -58,16 +58,26 @@ struct O3ThreadState : public ThreadState { typedef ExecContext::Status Status; typedef typename Impl::FullCPU FullCPU; + /** Current status of the thread. */ Status _status; - // Current instruction + /** Current instruction the thread is committing. Only set and + * used for DTB faults currently. + */ TheISA::MachInst inst; + private: + /** Pointer to the CPU. */ FullCPU *cpu; public: - + /** Whether or not the thread is currently in syscall mode, and + * thus able to be externally updated without squashing. + */ bool inSyscall; + /** Whether or not the thread is currently waiting on a trap, and + * thus able to be externally updated without squashing. + */ bool trapPending; #if FULL_SYSTEM @@ -88,23 +98,34 @@ struct O3ThreadState : public ThreadState { { } #endif + /** Pointer to the ExecContext of this thread. @todo: Don't call + this a proxy.*/ ExecContext *xcProxy; + /** Returns a pointer to the XC of this thread. */ ExecContext *getXCProxy() { return xcProxy; } + /** Returns the status of this thread. */ Status status() const { return _status; } + /** Sets the status of this thread. */ void setStatus(Status new_status) { _status = new_status; } - bool misspeculating() { return false; } - + /** Sets the current instruction being committed. */ void setInst(TheISA::MachInst _inst) { inst = _inst; } + /** Reads the number of instructions functionally executed and + * committed. 
+ */ Counter readFuncExeInst() { return funcExeInst; } + /** Sets the total number of instructions functionally executed + * and committed. + */ void setFuncExeInst(Counter new_val) { funcExeInst = new_val; } #if !FULL_SYSTEM + /** Handles the syscall. */ void syscall(int64_t callnum) { process->syscall(callnum, xcProxy); } #endif }; diff --git a/src/cpu/o3/tournament_pred.cc b/src/cpu/o3/tournament_pred.cc index 361ef4770..7cf78dcb1 100644 --- a/src/cpu/o3/tournament_pred.cc +++ b/src/cpu/o3/tournament_pred.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2005 The Regents of The University of Michigan + * Copyright (c) 2004-2006 The Regents of The University of Michigan * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -28,6 +28,7 @@ * Authors: Kevin Lim */ +#include "base/intmath.hh" #include "cpu/o3/tournament_pred.hh" TournamentBP::TournamentBP(unsigned _localPredictorSize, @@ -51,7 +52,9 @@ TournamentBP::TournamentBP(unsigned _localPredictorSize, choiceCtrBits(_choiceCtrBits), instShiftAmt(_instShiftAmt) { - //Should do checks here to make sure sizes are correct (powers of 2) + if (!isPowerOf2(localPredictorSize)) { + fatal("Invalid local predictor size!\n"); + } //Setup the array of counters for the local predictor localCtrs.resize(localPredictorSize); @@ -59,6 +62,10 @@ TournamentBP::TournamentBP(unsigned _localPredictorSize, for (int i = 0; i < localPredictorSize; ++i) localCtrs[i].setBits(localCtrBits); + if (!isPowerOf2(localHistoryTableSize)) { + fatal("Invalid local history table size!\n"); + } + //Setup the history table for the local table localHistoryTable.resize(localHistoryTableSize); @@ -68,6 +75,10 @@ TournamentBP::TournamentBP(unsigned _localPredictorSize, // Setup the local history mask localHistoryMask = (1 << localHistoryBits) - 1; + if (!isPowerOf2(globalPredictorSize)) { + fatal("Invalid global predictor size!\n"); + } + //Setup the array of counters for the global predictor 
globalCtrs.resize(globalPredictorSize); @@ -79,12 +90,17 @@ TournamentBP::TournamentBP(unsigned _localPredictorSize, // Setup the global history mask globalHistoryMask = (1 << globalHistoryBits) - 1; + if (!isPowerOf2(choicePredictorSize)) { + fatal("Invalid choice predictor size!\n"); + } + //Setup the array of counters for the choice predictor choiceCtrs.resize(choicePredictorSize); for (int i = 0; i < choicePredictorSize; ++i) choiceCtrs[i].setBits(choiceCtrBits); + // @todo: Allow for different thresholds between the predictors. threshold = (1 << (localCtrBits - 1)) - 1; threshold = threshold / 2; } @@ -93,165 +109,185 @@ inline unsigned TournamentBP::calcLocHistIdx(Addr &branch_addr) { + // Get low order bits after removing instruction offset. return (branch_addr >> instShiftAmt) & (localHistoryTableSize - 1); } inline void -TournamentBP::updateHistoriesTaken(unsigned local_history_idx) +TournamentBP::updateGlobalHistTaken() { globalHistory = (globalHistory << 1) | 1; globalHistory = globalHistory & globalHistoryMask; - - localHistoryTable[local_history_idx] = - (localHistoryTable[local_history_idx] << 1) | 1; } inline void -TournamentBP::updateHistoriesNotTaken(unsigned local_history_idx) +TournamentBP::updateGlobalHistNotTaken() { globalHistory = (globalHistory << 1); globalHistory = globalHistory & globalHistoryMask; +} +inline +void +TournamentBP::updateLocalHistTaken(unsigned local_history_idx) +{ + localHistoryTable[local_history_idx] = + (localHistoryTable[local_history_idx] << 1) | 1; +} + +inline +void +TournamentBP::updateLocalHistNotTaken(unsigned local_history_idx) +{ localHistoryTable[local_history_idx] = (localHistoryTable[local_history_idx] << 1); } bool -TournamentBP::lookup(Addr &branch_addr) +TournamentBP::lookup(Addr &branch_addr, void * &bp_history) { - uint8_t local_prediction; + bool local_prediction; unsigned local_history_idx; unsigned local_predictor_idx; - uint8_t global_prediction; - uint8_t choice_prediction; + bool 
global_prediction; + bool choice_prediction; //Lookup in the local predictor to get its branch prediction local_history_idx = calcLocHistIdx(branch_addr); local_predictor_idx = localHistoryTable[local_history_idx] & localHistoryMask; - local_prediction = localCtrs[local_predictor_idx].read(); + local_prediction = localCtrs[local_predictor_idx].read() > threshold; //Lookup in the global predictor to get its branch prediction - global_prediction = globalCtrs[globalHistory].read(); + global_prediction = globalCtrs[globalHistory].read() > threshold; //Lookup in the choice predictor to see which one to use - choice_prediction = choiceCtrs[globalHistory].read(); - - //@todo Put a threshold value in for the three predictors that can - // be set through the constructor (so this isn't hard coded). - //Also should put some of this code into functions. - if (choice_prediction > threshold) { - if (global_prediction > threshold) { - updateHistoriesTaken(local_history_idx); - - assert(globalHistory < globalPredictorSize && - local_history_idx < localPredictorSize); - - globalCtrs[globalHistory].increment(); - localCtrs[local_history_idx].increment(); - + choice_prediction = choiceCtrs[globalHistory].read() > threshold; + + // Create BPHistory and pass it back to be recorded. + BPHistory *history = new BPHistory; + history->globalHistory = globalHistory; + history->localPredTaken = local_prediction; + history->globalPredTaken = global_prediction; + history->globalUsed = choice_prediction; + bp_history = (void *)history; + + assert(globalHistory < globalPredictorSize && + local_history_idx < localPredictorSize); + + // Commented code is for doing speculative update of counters and + // all histories. 
+ if (choice_prediction) { + if (global_prediction) { +// updateHistoriesTaken(local_history_idx); +// globalCtrs[globalHistory].increment(); +// localCtrs[local_history_idx].increment(); + updateGlobalHistTaken(); return true; } else { - updateHistoriesNotTaken(local_history_idx); - - assert(globalHistory < globalPredictorSize && - local_history_idx < localPredictorSize); - - globalCtrs[globalHistory].decrement(); - localCtrs[local_history_idx].decrement(); - +// updateHistoriesNotTaken(local_history_idx); +// globalCtrs[globalHistory].decrement(); +// localCtrs[local_history_idx].decrement(); + updateGlobalHistNotTaken(); return false; } } else { - if (local_prediction > threshold) { - updateHistoriesTaken(local_history_idx); - - assert(globalHistory < globalPredictorSize && - local_history_idx < localPredictorSize); - - globalCtrs[globalHistory].increment(); - localCtrs[local_history_idx].increment(); - + if (local_prediction) { +// updateHistoriesTaken(local_history_idx); +// globalCtrs[globalHistory].increment(); +// localCtrs[local_history_idx].increment(); + updateGlobalHistTaken(); return true; } else { - updateHistoriesNotTaken(local_history_idx); - - assert(globalHistory < globalPredictorSize && - local_history_idx < localPredictorSize); - - globalCtrs[globalHistory].decrement(); - localCtrs[local_history_idx].decrement(); - +// updateHistoriesNotTaken(local_history_idx); +// globalCtrs[globalHistory].decrement(); +// localCtrs[local_history_idx].decrement(); + updateGlobalHistNotTaken(); return false; } } } -// Update the branch predictor if it predicted a branch wrong. void -TournamentBP::update(Addr &branch_addr, unsigned correct_gh, bool taken) +TournamentBP::uncondBr(void * &bp_history) { + // Create BPHistory and pass it back to be recorded. 
+ BPHistory *history = new BPHistory; + history->globalHistory = globalHistory; + history->localPredTaken = true; + history->globalPredTaken = true; + bp_history = static_cast<void *>(history); + + updateGlobalHistTaken(); +} - uint8_t local_prediction; +void +TournamentBP::update(Addr &branch_addr, bool taken, void *bp_history) +{ unsigned local_history_idx; unsigned local_predictor_idx; - bool local_pred_taken; + unsigned local_predictor_hist; - uint8_t global_prediction; - bool global_pred_taken; - - // Load the correct global history into the register. - globalHistory = correct_gh; - - // Get the local predictor's current prediction, remove the incorrect - // update, and update the local predictor + // Get the local predictor's current prediction local_history_idx = calcLocHistIdx(branch_addr); - local_predictor_idx = localHistoryTable[local_history_idx]; - local_predictor_idx = (local_predictor_idx >> 1) & localHistoryMask; - - local_prediction = localCtrs[local_predictor_idx].read(); - local_pred_taken = local_prediction > threshold; - - //Get the global predictor's current prediction, and update the - //global predictor - global_prediction = globalCtrs[globalHistory].read(); - global_pred_taken = global_prediction > threshold; - - //Update the choice predictor to tell it which one was correct - if (local_pred_taken != global_pred_taken) { - //If the local prediction matches the actual outcome, decerement - //the counter. Otherwise increment the counter. - if (local_pred_taken == taken) { - choiceCtrs[globalHistory].decrement(); - } else { - choiceCtrs[globalHistory].increment(); + local_predictor_hist = localHistoryTable[local_history_idx]; + local_predictor_idx = local_predictor_hist & localHistoryMask; + + // Update the choice predictor to tell it which one was correct if + // there was a prediction. 
+ if (bp_history) { + BPHistory *history = static_cast<BPHistory *>(bp_history); + if (history->localPredTaken != history->globalPredTaken) { + // If the local prediction matches the actual outcome, + // decerement the counter. Otherwise increment the + // counter. + if (history->localPredTaken == taken) { + choiceCtrs[globalHistory].decrement(); + } else if (history->globalPredTaken == taken){ + choiceCtrs[globalHistory].increment(); + } } + + // We're done with this history, now delete it. + delete history; } - if (taken) { - assert(globalHistory < globalPredictorSize && - local_predictor_idx < localPredictorSize); + assert(globalHistory < globalPredictorSize && + local_predictor_idx < localPredictorSize); + // Update the counters and local history with the proper + // resolution of the branch. Global history is updated + // speculatively and restored upon squash() calls, so it does not + // need to be updated. + if (taken) { localCtrs[local_predictor_idx].increment(); globalCtrs[globalHistory].increment(); - globalHistory = (globalHistory << 1) | 1; - globalHistory = globalHistory & globalHistoryMask; - - localHistoryTable[local_history_idx] |= 1; + updateLocalHistTaken(local_history_idx); } else { - assert(globalHistory < globalPredictorSize && - local_predictor_idx < localPredictorSize); - localCtrs[local_predictor_idx].decrement(); globalCtrs[globalHistory].decrement(); - globalHistory = (globalHistory << 1); - globalHistory = globalHistory & globalHistoryMask; - - localHistoryTable[local_history_idx] &= ~1; + updateLocalHistNotTaken(local_history_idx); } } + +void +TournamentBP::squash(void *bp_history) +{ + BPHistory *history = static_cast<BPHistory *>(bp_history); + + // Restore global history to state prior to this branch. + globalHistory = history->globalHistory; + + // Delete this BPHistory now that we're done with it. 
+ delete history; +} + +#ifdef DEBUG +int +TournamentBP::BPHistory::newCount = 0; +#endif diff --git a/src/cpu/o3/tournament_pred.hh b/src/cpu/o3/tournament_pred.hh index e16600090..92402adc6 100644 --- a/src/cpu/o3/tournament_pred.hh +++ b/src/cpu/o3/tournament_pred.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2005 The Regents of The University of Michigan + * Copyright (c) 2004-2006 The Regents of The University of Michigan * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -36,6 +36,15 @@ #include "cpu/o3/sat_counter.hh" #include <vector> +/** + * Implements a tournament branch predictor, hopefully identical to the one + * used in the 21264. It has a local predictor, which uses a local history + * table to index into a table of counters, and a global predictor, which + * uses a global history to index into a table of counters. A choice + * predictor chooses between the two. Only the global history register + * is speculatively updated, the rest are updated upon branches committing + * or misspeculating. + */ class TournamentBP { public: @@ -55,30 +64,95 @@ class TournamentBP /** * Looks up the given address in the branch predictor and returns - * a true/false value as to whether it is taken. + * a true/false value as to whether it is taken. Also creates a + * BPHistory object to store any state it will need on squash/update. * @param branch_addr The address of the branch to look up. + * @param bp_history Pointer that will be set to the BPHistory object. * @return Whether or not the branch is taken. */ - bool lookup(Addr &branch_addr); + bool lookup(Addr &branch_addr, void * &bp_history); + + /** + * Records that there was an unconditional branch, and modifies + * the bp history to point to an object that has the previous + * global history stored in it. + * @param bp_history Pointer that will be set to the BPHistory object. 
+ */ + void uncondBr(void * &bp_history); /** * Updates the branch predictor with the actual result of a branch. * @param branch_addr The address of the branch to update. * @param taken Whether or not the branch was taken. + * @param bp_history Pointer to the BPHistory object that was created + * when the branch was predicted. + */ + void update(Addr &branch_addr, bool taken, void *bp_history); + + /** + * Restores the global branch history on a squash. + * @param bp_history Pointer to the BPHistory object that has the + * previous global branch history in it. */ - void update(Addr &branch_addr, unsigned global_history, bool taken); + void squash(void *bp_history); + /** Returns the global history. */ inline unsigned readGlobalHist() { return globalHistory; } private: - + /** + * Returns if the branch should be taken or not, given a counter + * value. + * @param count The counter value. + */ inline bool getPrediction(uint8_t &count); + /** + * Returns the local history index, given a branch address. + * @param branch_addr The branch's PC address. + */ inline unsigned calcLocHistIdx(Addr &branch_addr); - inline void updateHistoriesTaken(unsigned local_history_idx); + /** Updates global history as taken. */ + inline void updateGlobalHistTaken(); - inline void updateHistoriesNotTaken(unsigned local_history_idx); + /** Updates global history as not taken. */ + inline void updateGlobalHistNotTaken(); + + /** + * Updates local histories as taken. + * @param local_history_idx The local history table entry that + * will be updated. + */ + inline void updateLocalHistTaken(unsigned local_history_idx); + + /** + * Updates local histories as not taken. + * @param local_history_idx The local history table entry that + * will be updated. + */ + inline void updateLocalHistNotTaken(unsigned local_history_idx); + + /** + * The branch history information that is created upon predicting + * a branch. 
It will be passed back upon updating and squashing, + * when the BP can use this information to update/restore its + * state properly. + */ + struct BPHistory { +#ifdef DEBUG + BPHistory() + { newCount++; } + ~BPHistory() + { newCount--; } + + static int newCount; +#endif + unsigned globalHistory; + bool localPredTaken; + bool globalPredTaken; + bool globalUsed; + }; /** Local counters. */ std::vector<SatCounter> localCtrs; @@ -103,7 +177,6 @@ class TournamentBP /** Mask to get the proper local history. */ unsigned localHistoryMask; - /** Array of counters that make up the global predictor. */ std::vector<SatCounter> globalCtrs; @@ -122,7 +195,6 @@ class TournamentBP /** Mask to get the proper global history. */ unsigned globalHistoryMask; - /** Array of counters that make up the choice predictor. */ std::vector<SatCounter> choiceCtrs; |