diff options
Diffstat (limited to 'cpu/beta_cpu')
39 files changed, 2646 insertions, 718 deletions
diff --git a/cpu/beta_cpu/2bit_local_pred.cc b/cpu/beta_cpu/2bit_local_pred.cc index 88c39a9b0..ef7f23d49 100644 --- a/cpu/beta_cpu/2bit_local_pred.cc +++ b/cpu/beta_cpu/2bit_local_pred.cc @@ -75,18 +75,34 @@ DefaultBP::getLocalIndex(Addr &branch_addr) bool DefaultBP::lookup(Addr &branch_addr) { + bool taken; uint8_t local_prediction; unsigned local_predictor_idx = getLocalIndex(branch_addr); DPRINTF(Fetch, "Branch predictor: Looking up index %#x\n", local_predictor_idx); + assert(local_predictor_idx < localPredictorSize); + local_prediction = localCtrs[local_predictor_idx].read(); DPRINTF(Fetch, "Branch predictor: prediction is %i.\n", (int)local_prediction); - return getPrediction(local_prediction); + taken = getPrediction(local_prediction); + +#if 0 + // Speculative update. + if (taken) { + DPRINTF(Fetch, "Branch predictor: Branch updated as taken.\n"); + localCtrs[local_predictor_idx].increment(); + } else { + DPRINTF(Fetch, "Branch predictor: Branch updated as not taken.\n"); + localCtrs[local_predictor_idx].decrement(); + } +#endif + + return taken; } void @@ -100,11 +116,17 @@ DefaultBP::update(Addr &branch_addr, bool taken) DPRINTF(Fetch, "Branch predictor: Looking up index %#x\n", local_predictor_idx); + assert(local_predictor_idx < localPredictorSize); + + // Increment or decrement twice to undo speculative update, then + // properly update if (taken) { DPRINTF(Fetch, "Branch predictor: Branch updated as taken.\n"); localCtrs[local_predictor_idx].increment(); +// localCtrs[local_predictor_idx].increment(); } else { DPRINTF(Fetch, "Branch predictor: Branch updated as not taken.\n"); localCtrs[local_predictor_idx].decrement(); +// localCtrs[local_predictor_idx].decrement(); } } diff --git a/cpu/beta_cpu/alpha_dyn_inst.hh b/cpu/beta_cpu/alpha_dyn_inst.hh index 4e1cebd11..c964762db 100644 --- a/cpu/beta_cpu/alpha_dyn_inst.hh +++ b/cpu/beta_cpu/alpha_dyn_inst.hh @@ -19,19 +19,19 @@ template <class Impl> class AlphaDynInst : public BaseDynInst<Impl> { public: - // Typedef for the CPU. + /** Typedef for the CPU. */ typedef typename Impl::FullCPU FullCPU; - //Typedef to get the ISA. + /** Typedef to get the ISA. */ typedef typename Impl::ISA ISA; - /// Binary machine instruction type. + /** Binary machine instruction type. */ typedef typename ISA::MachInst MachInst; - /// Memory address type. + /** Memory address type. */ typedef typename ISA::Addr Addr; - /// Logical register index type. + /** Logical register index type. */ typedef typename ISA::RegIndex RegIndex; - /// Integer register index type. + /** Integer register index type. */ typedef typename ISA::IntReg IntReg; enum { @@ -54,6 +54,7 @@ class AlphaDynInst : public BaseDynInst<Impl> return fault; } + public: uint64_t readUniq(); void setUniq(uint64_t val); diff --git a/cpu/beta_cpu/alpha_full_cpu.hh b/cpu/beta_cpu/alpha_full_cpu.hh index 0e094b122..e01eba3bf 100644 --- a/cpu/beta_cpu/alpha_full_cpu.hh +++ b/cpu/beta_cpu/alpha_full_cpu.hh @@ -29,6 +29,8 @@ class AlphaFullCPU : public FullBetaCPU<Impl> #endif public: + void regStats(); + #ifdef FULL_SYSTEM bool inPalMode(); @@ -66,14 +68,17 @@ class AlphaFullCPU : public FullBetaCPU<Impl> req->paddr = req->paddr | (Addr)req->asid << sizeof(Addr) * 8 - 16; return No_Fault; } + Fault translateInstReq(MemReqPtr &req) { return dummyTranslation(req); } + Fault translateDataReadReq(MemReqPtr &req) { return dummyTranslation(req); } + Fault translateDataWriteReq(MemReqPtr &req) { return dummyTranslation(req); @@ -81,73 +86,6 @@ class AlphaFullCPU : public FullBetaCPU<Impl> #endif - template <class T> - Fault read(MemReqPtr &req, T &data) - { -#if defined(TARGET_ALPHA) && defined(FULL_SYSTEM) - if (req->flags & LOCKED) { - MiscRegFile *cregs = &req->xc->regs.miscRegs; - cregs->lock_addr = req->paddr; - cregs->lock_flag = true; - } -#endif - - Fault error; - error = mem->read(req, data); - data = htoa(data); - return error; - } - - template <class T> - Fault write(MemReqPtr &req, T &data) - { -#if defined(TARGET_ALPHA) && defined(FULL_SYSTEM) - - MiscRegFile *cregs; - - // If this is a store conditional, act appropriately - if (req->flags & LOCKED) { - cregs = &xc->regs.miscRegs; - - if (req->flags & UNCACHEABLE) { - // Don't update result register (see stq_c in isa_desc) - req->result = 2; - req->xc->storeCondFailures = 0;//Needed? [RGD] - } else { - req->result = cregs->lock_flag; - if (!cregs->lock_flag || - ((cregs->lock_addr & ~0xf) != (req->paddr & ~0xf))) { - cregs->lock_flag = false; - if (((++req->xc->storeCondFailures) % 100000) == 0) { - std::cerr << "Warning: " - << req->xc->storeCondFailures - << " consecutive store conditional failures " - << "on cpu " << cpu_id - << std::endl; - } - return No_Fault; - } - else req->xc->storeCondFailures = 0; - } - } - - // Need to clear any locked flags on other proccessors for - // this address. Only do this for succsful Store Conditionals - // and all other stores (WH64?). Unsuccessful Store - // Conditionals would have returned above, and wouldn't fall - // through. - for (int i = 0; i < system->execContexts.size(); i++){ - cregs = &system->execContexts[i]->regs.miscRegs; - if ((cregs->lock_addr & ~0xf) == (req->paddr & ~0xf)) { - cregs->lock_flag = false; - } - } - -#endif - - return mem->write(req, (T)htoa(data)); - } - // Later on may want to remove this misc stuff from the regfile and // have it handled at this level. Might prove to be an issue when // trying to rename source/destination registers... @@ -240,6 +178,76 @@ class AlphaFullCPU : public FullBetaCPU<Impl> // Called by initCPU. Implement as I please. void initIPRs(RegFile *regs); #endif + + + template <class T> + Fault read(MemReqPtr &req, T &data) + { +#if defined(TARGET_ALPHA) && defined(FULL_SYSTEM) + if (req->flags & LOCKED) { + MiscRegFile *cregs = &req->xc->regs.miscRegs; + cregs->lock_addr = req->paddr; + cregs->lock_flag = true; + } +#endif + + Fault error; + error = mem->read(req, data); + data = htoa(data); + return error; + } + + + template <class T> + Fault write(MemReqPtr &req, T &data) + { +#if defined(TARGET_ALPHA) && defined(FULL_SYSTEM) + + MiscRegFile *cregs; + + // If this is a store conditional, act appropriately + if (req->flags & LOCKED) { + cregs = &xc->regs.miscRegs; + + if (req->flags & UNCACHEABLE) { + // Don't update result register (see stq_c in isa_desc) + req->result = 2; + req->xc->storeCondFailures = 0;//Needed? [RGD] + } else { + req->result = cregs->lock_flag; + if (!cregs->lock_flag || + ((cregs->lock_addr & ~0xf) != (req->paddr & ~0xf))) { + cregs->lock_flag = false; + if (((++req->xc->storeCondFailures) % 100000) == 0) { + std::cerr << "Warning: " + << req->xc->storeCondFailures + << " consecutive store conditional failures " + << "on cpu " << cpu_id + << std::endl; + } + return No_Fault; + } + else req->xc->storeCondFailures = 0; + } + } + + // Need to clear any locked flags on other proccessors for + // this address. Only do this for succsful Store Conditionals + // and all other stores (WH64?). Unsuccessful Store + // Conditionals would have returned above, and wouldn't fall + // through. + for (int i = 0; i < system->execContexts.size(); i++){ + cregs = &system->execContexts[i]->regs.miscRegs; + if ((cregs->lock_addr & ~0xf) == (req->paddr & ~0xf)) { + cregs->lock_flag = false; + } + } + +#endif + + return mem->write(req, (T)htoa(data)); + } + }; #endif // __ALPHA_FULL_CPU_HH__ diff --git a/cpu/beta_cpu/alpha_full_cpu_builder.cc b/cpu/beta_cpu/alpha_full_cpu_builder.cc index 5fe96d656..f37081232 100644 --- a/cpu/beta_cpu/alpha_full_cpu_builder.cc +++ b/cpu/beta_cpu/alpha_full_cpu_builder.cc @@ -81,17 +81,38 @@ Param<unsigned> issueWidth; Param<unsigned> executeWidth; Param<unsigned> executeIntWidth; Param<unsigned> executeFloatWidth; +Param<unsigned> executeBranchWidth; +Param<unsigned> executeMemoryWidth; Param<unsigned> iewToCommitDelay; Param<unsigned> renameToROBDelay; Param<unsigned> commitWidth; Param<unsigned> squashWidth; +#if 0 Param<unsigned> localPredictorSize; Param<unsigned> localPredictorCtrBits; +#endif +Param<unsigned> local_predictor_size; +Param<unsigned> local_ctr_bits; +Param<unsigned> local_history_table_size; +Param<unsigned> local_history_bits; +Param<unsigned> global_predictor_size; +Param<unsigned> global_ctr_bits; +Param<unsigned> global_history_bits; +Param<unsigned> choice_predictor_size; +Param<unsigned> choice_ctr_bits; + Param<unsigned> BTBEntries; Param<unsigned> BTBTagSize; +Param<unsigned> RASSize; + +Param<unsigned> LQEntries; +Param<unsigned> SQEntries; +Param<unsigned> LFSTSize; +Param<unsigned> SSITSize; + Param<unsigned> numPhysIntRegs; Param<unsigned> numPhysFloatRegs; Param<unsigned> numIQEntries; @@ -168,6 +189,8 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(BaseFullCPU) INIT_PARAM(executeWidth, "Execute width"), INIT_PARAM(executeIntWidth, "Integer execute width"), INIT_PARAM(executeFloatWidth, "Floating point execute width"), + INIT_PARAM(executeBranchWidth, "Branch execute width"), + INIT_PARAM(executeMemoryWidth, "Memory execute width"), INIT_PARAM(iewToCommitDelay, "Issue/Execute/Writeback to commit " "delay"), @@ -175,12 +198,30 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(BaseFullCPU) INIT_PARAM(commitWidth, "Commit width"), INIT_PARAM(squashWidth, "Squash width"), +#if 0 INIT_PARAM(localPredictorSize, "Size of the local predictor in entries. " "Must be a power of 2."), INIT_PARAM(localPredictorCtrBits, "Number of bits per counter for bpred"), +#endif + INIT_PARAM(local_predictor_size, "Size of local predictor"), + INIT_PARAM(local_ctr_bits, "Bits per counter"), + INIT_PARAM(local_history_table_size, "Size of local history table"), + INIT_PARAM(local_history_bits, "Bits for the local history"), + INIT_PARAM(global_predictor_size, "Size of global predictor"), + INIT_PARAM(global_ctr_bits, "Bits per counter"), + INIT_PARAM(global_history_bits, "Bits of history"), + INIT_PARAM(choice_predictor_size, "Size of choice predictor"), + INIT_PARAM(choice_ctr_bits, "Bits of choice counters"), + INIT_PARAM(BTBEntries, "Number of BTB entries"), INIT_PARAM(BTBTagSize, "Size of the BTB tags, in bits"), + INIT_PARAM(RASSize, "RAS size"), + + INIT_PARAM(LQEntries, "Number of load queue entries"), + INIT_PARAM(SQEntries, "Number of store queue entries"), + INIT_PARAM(LFSTSize, "Last fetched store table size"), + INIT_PARAM(SSITSize, "Store set ID table size"), INIT_PARAM(numPhysIntRegs, "Number of physical integer registers"), INIT_PARAM(numPhysFloatRegs, "Number of physical floating point " @@ -277,17 +318,37 @@ CREATE_SIM_OBJECT(BaseFullCPU) params.executeWidth = executeWidth; params.executeIntWidth = executeIntWidth; params.executeFloatWidth = executeFloatWidth; + params.executeBranchWidth = executeBranchWidth; + params.executeMemoryWidth = executeMemoryWidth; params.iewToCommitDelay = iewToCommitDelay; params.renameToROBDelay = renameToROBDelay; params.commitWidth = commitWidth; params.squashWidth = squashWidth; - +#if 0 params.localPredictorSize = localPredictorSize; params.localPredictorCtrBits = localPredictorCtrBits; +#endif + params.local_predictor_size = local_predictor_size; + params.local_ctr_bits = local_ctr_bits; + params.local_history_table_size = local_history_table_size; + params.local_history_bits = local_history_bits; + params.global_predictor_size = global_predictor_size; + params.global_ctr_bits = global_ctr_bits; + params.global_history_bits = global_history_bits; + params.choice_predictor_size = choice_predictor_size; + params.choice_ctr_bits = choice_ctr_bits; + params.BTBEntries = BTBEntries; params.BTBTagSize = BTBTagSize; + params.RASSize = RASSize; + + params.LQEntries = LQEntries; + params.SQEntries = SQEntries; + params.SSITSize = SSITSize; + params.LFSTSize = LFSTSize; + params.numPhysIntRegs = numPhysIntRegs; params.numPhysFloatRegs = numPhysFloatRegs; params.numIQEntries = numIQEntries; diff --git a/cpu/beta_cpu/alpha_full_cpu_impl.hh b/cpu/beta_cpu/alpha_full_cpu_impl.hh index 8bfc0777e..ee8f9f33b 100644 --- a/cpu/beta_cpu/alpha_full_cpu_impl.hh +++ b/cpu/beta_cpu/alpha_full_cpu_impl.hh @@ -27,6 +27,19 @@ AlphaFullCPU<Impl>::AlphaFullCPU(Params ¶ms) rob.setCPU(this); } +template <class Impl> +void +AlphaFullCPU<Impl>::regStats() +{ + // Register stats for everything that has stats. + fullCPURegStats(); + fetch.regStats(); + decode.regStats(); + rename.regStats(); + iew.regStats(); + commit.regStats(); +} + #ifndef FULL_SYSTEM template <class Impl> @@ -92,6 +105,14 @@ AlphaFullCPU<Impl>::squashStages() rob.squash(rob_head); commit.setSquashing(); + + // Now hack the time buffer to clear the sequence numbers in the places + // where the stages might read it.? + for (int i = 0; i < 5; ++i) + { + timeBuffer.access(-i)->commitInfo.doneSeqNum = 0; + } + } #endif // FULL_SYSTEM @@ -178,7 +199,7 @@ template <class Impl> uint64_t * AlphaFullCPU<Impl>::getIpr() { - return regs.ipr; + return regFile.getIpr(); } template <class Impl> @@ -564,7 +585,7 @@ AlphaFullCPU<Impl>::setIntrFlag(int val) regs.intrflag = val; } -// Maybe have this send back from IEW stage to squash and update PC. +// Can force commit stage to squash and stuff. template <class Impl> Fault AlphaFullCPU<Impl>::hwrei() diff --git a/cpu/beta_cpu/alpha_params.hh b/cpu/beta_cpu/alpha_params.hh index 92dfd35f5..ecde4b016 100644 --- a/cpu/beta_cpu/alpha_params.hh +++ b/cpu/beta_cpu/alpha_params.hh @@ -72,6 +72,8 @@ class AlphaSimpleParams : public BaseFullCPU::Params unsigned executeWidth; unsigned executeIntWidth; unsigned executeFloatWidth; + unsigned executeBranchWidth; + unsigned executeMemoryWidth; // // Commit @@ -84,11 +86,26 @@ class AlphaSimpleParams : public BaseFullCPU::Params // // Branch predictor (BP & BTB) // +/* unsigned localPredictorSize; unsigned localPredictorCtrBits; +*/ + + unsigned local_predictor_size; + unsigned local_ctr_bits; + unsigned local_history_table_size; + unsigned local_history_bits; + unsigned global_predictor_size; + unsigned global_ctr_bits; + unsigned global_history_bits; + unsigned choice_predictor_size; + unsigned choice_ctr_bits; + unsigned BTBEntries; unsigned BTBTagSize; + unsigned RASSize; + // // Load store queue // @@ -96,6 +113,12 @@ class AlphaSimpleParams : public BaseFullCPU::Params unsigned SQEntries; // + // Memory dependence + // + unsigned SSITSize; + unsigned LFSTSize; + + // // Miscellaneous // unsigned numPhysIntRegs; diff --git a/cpu/beta_cpu/bpred_unit.cc b/cpu/beta_cpu/bpred_unit.cc index 6de2def44..c4a79fbbe 100644 --- a/cpu/beta_cpu/bpred_unit.cc +++ b/cpu/beta_cpu/bpred_unit.cc @@ -1,5 +1,6 @@ #include "cpu/beta_cpu/bpred_unit_impl.hh" #include "cpu/beta_cpu/alpha_impl.hh" +#include "cpu/beta_cpu/alpha_dyn_inst.hh" -template DefaultBPredUnit<AlphaSimpleImpl>; +template TwobitBPredUnit<AlphaSimpleImpl>; diff --git a/cpu/beta_cpu/bpred_unit.hh b/cpu/beta_cpu/bpred_unit.hh index 71191f5b7..53c7146c5 100644 --- a/cpu/beta_cpu/bpred_unit.hh +++ b/cpu/beta_cpu/bpred_unit.hh @@ -4,9 +4,15 @@ // For Addr type. #include "arch/alpha/isa_traits.hh" +#include "base/statistics.hh" +#include "cpu/inst_seq.hh" #include "cpu/beta_cpu/2bit_local_pred.hh" +#include "cpu/beta_cpu/tournament_pred.hh" #include "cpu/beta_cpu/btb.hh" +#include "cpu/beta_cpu/ras.hh" + +#include <list> /** * Basically a wrapper class to hold both the branch predictor @@ -18,34 +24,86 @@ * object, and be able to call the constructors on the BP and BTB. */ template<class Impl> -class DefaultBPredUnit +class TwobitBPredUnit { public: typedef typename Impl::Params Params; + typedef typename Impl::DynInstPtr DynInstPtr; + + TwobitBPredUnit(Params ¶ms); + + void regStats(); + + bool predict(DynInstPtr &inst, Addr &PC); + + void squash(const InstSeqNum &squashed_sn, const Addr &corr_target, + bool actually_taken); - DefaultBPredUnit(Params ¶ms); + void squash(const InstSeqNum &squashed_sn); + + void update(const InstSeqNum &done_sn); bool BPLookup(Addr &inst_PC) { return BP.lookup(inst_PC); } + unsigned BPReadGlobalHist() + { return 0; } + bool BTBValid(Addr &inst_PC) { return BTB.valid(inst_PC); } Addr BTBLookup(Addr &inst_PC) { return BTB.lookup(inst_PC); } - void BPUpdate(Addr &inst_PC, bool taken) + // Will want to include global history. + void BPUpdate(Addr &inst_PC, unsigned global_history, bool taken) { BP.update(inst_PC, taken); } void BTBUpdate(Addr &inst_PC, Addr &target_PC) { BTB.update(inst_PC, target_PC); } private: + struct PredictorHistory { + PredictorHistory(const InstSeqNum &seq_num, const Addr &inst_PC, + const bool pred_taken) + : seqNum(seq_num), PC(inst_PC), predTaken(pred_taken), + globalHistory(0), usedRAS(0), wasCall(0), RASIndex(0), + RASTarget(0) + { } + + InstSeqNum seqNum; + + Addr PC; + + bool predTaken; + + unsigned globalHistory; + + bool usedRAS; + + bool wasCall; + + unsigned RASIndex; + + Addr RASTarget; + }; + + std::list<PredictorHistory> predHist; DefaultBP BP; DefaultBTB BTB; + ReturnAddrStack RAS; + + Stats::Scalar<> lookups; + Stats::Scalar<> condPredicted; + Stats::Scalar<> condIncorrect; + Stats::Scalar<> BTBLookups; + Stats::Scalar<> BTBHits; + Stats::Scalar<> BTBCorrect; + Stats::Scalar<> usedRAS; + Stats::Scalar<> RASIncorrect; }; #endif // __BPRED_UNIT_HH__ diff --git a/cpu/beta_cpu/bpred_unit_impl.hh b/cpu/beta_cpu/bpred_unit_impl.hh index 47415ce9b..02c613d34 100644 --- a/cpu/beta_cpu/bpred_unit_impl.hh +++ b/cpu/beta_cpu/bpred_unit_impl.hh @@ -1,13 +1,247 @@ #include "cpu/beta_cpu/bpred_unit.hh" +#include "base/traceflags.hh" +#include "base/trace.hh" template<class Impl> -DefaultBPredUnit<Impl>::DefaultBPredUnit(Params ¶ms) - : BP(params.localPredictorSize, - params.localPredictorCtrBits, +TwobitBPredUnit<Impl>::TwobitBPredUnit(Params ¶ms) + : BP(params.local_predictor_size, + params.local_ctr_bits, params.instShiftAmt), BTB(params.BTBEntries, params.BTBTagSize, - params.instShiftAmt) + params.instShiftAmt), + RAS(params.RASSize) { } + +template <class Impl> +void +TwobitBPredUnit<Impl>::regStats() +{ + lookups + .name(name() + ".BPredUnit.lookups") + .desc("Number of BP lookups") + ; + + condPredicted + .name(name() + ".BPredUnit.condPredicted") + .desc("Number of conditional branches predicted") + ; + + condIncorrect + .name(name() + ".BPredUnit.condIncorrect") + .desc("Number of conditional branches incorrect") + ; + + BTBLookups + .name(name() + ".BPredUnit.BTBLookups") + .desc("Number of BTB lookups") + ; + + BTBHits + .name(name() + ".BPredUnit.BTBHits") + .desc("Number of BTB hits") + ; + + BTBCorrect + .name(name() + ".BPredUnit.BTBCorrect") + .desc("Number of correct BTB predictions (this stat may not " + "work properly.") + ; + + usedRAS + .name(name() + ".BPredUnit.usedRAS") + .desc("Number of times the RAS was used.") + ; + + RASIncorrect + .name(name() + ".BPredUnit.RASInCorrect") + .desc("Number of incorrect RAS predictions.") + ; +} + +template <class Impl> +bool +TwobitBPredUnit<Impl>::predict(DynInstPtr &inst, Addr &PC) +{ + // See if branch predictor predicts taken. + // If so, get its target addr either from the BTB or the RAS. + // Once that's done, speculatively update the predictor? + // Save off record of branch stuff so the RAS can be fixed + // up once it's done. + + bool pred_taken = false; + Addr target; + + ++lookups; + + if (inst->isUncondCtrl()) { + DPRINTF(Fetch, "BranchPred: Unconditional control.\n"); + pred_taken = true; + } else { + ++condPredicted; + + pred_taken = BPLookup(PC); + + DPRINTF(Fetch, "BranchPred: Branch predictor predicted %i for PC %#x" + "\n", pred_taken, inst->readPC()); + } + + PredictorHistory predict_record(inst->seqNum, PC, pred_taken); + + // Now lookup in the BTB or RAS. + if (pred_taken) { + if (inst->isReturn()) { + ++usedRAS; + + // If it's a function return call, then look up the address + // in the RAS. + target = RAS.top(); + + // Record the top entry of the RAS, and its index. + predict_record.usedRAS = true; + predict_record.RASIndex = RAS.topIdx(); + predict_record.RASTarget = target; + + RAS.pop(); + + DPRINTF(Fetch, "BranchPred: Instruction %#x is a return, RAS " + "predicted target: %#x, RAS index: %i.\n", + inst->readPC(), target, predict_record.RASIndex); + } else { + ++BTBLookups; + + if (inst->isCall()) { + RAS.push(PC+sizeof(MachInst)); + + // Record that it was a call so that the top RAS entry can + // be popped off if the speculation is incorrect. + predict_record.wasCall = true; + + DPRINTF(Fetch, "BranchPred: Instruction %#x was a call, " + "adding %#x to the RAS.\n", + inst->readPC(), PC+sizeof(MachInst)); + } + + if (BTB.valid(PC)) { + ++BTBHits; + + //If it's anything else, use the BTB to get the target addr. + target = BTB.lookup(PC); + + DPRINTF(Fetch, "BranchPred: Instruction %#x predicted target " + "is %#x.\n", inst->readPC(), target); + + } else { + DPRINTF(Fetch, "BranchPred: BTB doesn't have a valid entry." + "\n"); + pred_taken = false; + } + + } + } + + if (pred_taken) { + // Set the PC and the instruction's predicted target. + PC = target; + inst->setPredTarg(target); + } else { + PC = PC + sizeof(MachInst); + inst->setPredTarg(PC); + } + + predHist.push_front(predict_record); + + assert(!predHist.empty()); + + return pred_taken; +} + +template <class Impl> +void +TwobitBPredUnit<Impl>::update(const InstSeqNum &done_sn) +{ + DPRINTF(Fetch, "BranchPred: Commiting branches until sequence number " + "%i.\n", done_sn); + + while (!predHist.empty() && predHist.back().seqNum <= done_sn) { + assert(!predHist.empty()); + + // Update the branch predictor with the correct results of branches. + BP.update(predHist.back().PC, predHist.back().predTaken); + + predHist.pop_back(); + } +} + +template <class Impl> +void +TwobitBPredUnit<Impl>::squash(const InstSeqNum &squashed_sn) +{ + while (!predHist.empty() && predHist.front().seqNum > squashed_sn) { + if (predHist.front().usedRAS) { + DPRINTF(Fetch, "BranchPred: Restoring top of RAS to: %i, " + "target: %#x.\n", + predHist.front().RASIndex, + predHist.front().RASTarget); + + RAS.restore(predHist.front().RASIndex, + predHist.front().RASTarget); + } else if (predHist.front().wasCall) { + DPRINTF(Fetch, "BranchPred: Removing speculative entry added " + "to the RAS.\n"); + + RAS.pop(); + } + + predHist.pop_front(); + } +} + +template <class Impl> +void +TwobitBPredUnit<Impl>::squash(const InstSeqNum &squashed_sn, + const Addr &corr_target, + const bool actually_taken) +{ + // Now that we know that a branch was mispredicted, we need to undo + // all the branches that have been seen up until this branch and + // fix up everything. + + ++condIncorrect; + + DPRINTF(Fetch, "BranchPred: Squashing from sequence number %i, " + "setting target to %#x.\n", + squashed_sn, corr_target); + + while (!predHist.empty() && predHist.front().seqNum > squashed_sn) { + + if (predHist.front().usedRAS) { + DPRINTF(Fetch, "BranchPred: Restoring top of RAS to: %i, " + "target: %#x.\n", + predHist.front().RASIndex, + predHist.front().RASTarget); + + RAS.restore(predHist.front().RASIndex, + predHist.front().RASTarget); + } else if (predHist.front().wasCall) { + DPRINTF(Fetch, "BranchPred: Removing speculative entry added " + "to the RAS.\n"); + + RAS.pop(); + } + + predHist.pop_front(); + } + + predHist.front().predTaken = actually_taken; + + if (predHist.front().usedRAS) { + ++RASIncorrect; + } + + BP.update(predHist.front().PC, actually_taken); + + BTB.update(predHist.front().PC, corr_target); +} diff --git a/cpu/beta_cpu/btb.cc b/cpu/beta_cpu/btb.cc index b49f30482..bceaa66d1 100644 --- a/cpu/beta_cpu/btb.cc +++ b/cpu/beta_cpu/btb.cc @@ -50,6 +50,8 @@ DefaultBTB::valid(const Addr &inst_PC) Addr inst_tag = getTag(inst_PC); + assert(btb_idx < numEntries); + if (btb[btb_idx].valid && inst_tag == btb[btb_idx].tag) { return true; } else { @@ -67,6 +69,8 @@ DefaultBTB::lookup(const Addr &inst_PC) Addr inst_tag = getTag(inst_PC); + assert(btb_idx < numEntries); + if (btb[btb_idx].valid && inst_tag == btb[btb_idx].tag) { return btb[btb_idx].target; } else { @@ -79,6 +83,8 @@ DefaultBTB::update(const Addr &inst_PC, const Addr &target) { unsigned btb_idx = getIndex(inst_PC); + assert(btb_idx < numEntries); + btb[btb_idx].valid = true; btb[btb_idx].target = target; btb[btb_idx].tag = getTag(inst_PC); diff --git a/cpu/beta_cpu/comm.hh b/cpu/beta_cpu/comm.hh index 849a6c797..e327a83b9 100644 --- a/cpu/beta_cpu/comm.hh +++ b/cpu/beta_cpu/comm.hh @@ -9,6 +9,7 @@ using namespace std; // Find better place to put this typedef. +// The impl might be the best place for this. typedef short int PhysRegIndex; template<class Impl> @@ -45,6 +46,14 @@ struct SimpleIEWSimpleCommit { int size; DynInstPtr insts[Impl::MaxWidth + 1]; + + bool squash; + bool branchMispredict; + bool branchTaken; + uint64_t mispredPC; + uint64_t nextPC; + unsigned globalHist; + InstSeqNum squashedSeqNum; }; template<class Impl> @@ -63,10 +72,15 @@ struct TimeBufStruct { bool predIncorrect; uint64_t branchAddr; + InstSeqNum doneSeqNum; + + // Might want to package this kind of branch stuff into a single + // struct as it is used pretty frequently. bool branchMispredict; bool branchTaken; uint64_t mispredPC; uint64_t nextPC; + unsigned globalHist; }; decodeComm decodeInfo; @@ -84,17 +98,10 @@ struct TimeBufStruct { renameComm renameInfo; struct iewComm { - bool squash; bool stall; // Also eventually include skid buffer space. unsigned freeIQEntries; - - bool branchMispredict; - bool branchTaken; - uint64_t mispredPC; - uint64_t nextPC; - InstSeqNum squashedSeqNum; }; iewComm iewInfo; @@ -108,6 +115,7 @@ struct TimeBufStruct { bool branchTaken; uint64_t mispredPC; uint64_t nextPC; + unsigned globalHist; // Think of better names here. // Will need to be a variety of sizes... diff --git a/cpu/beta_cpu/commit.hh b/cpu/beta_cpu/commit.hh index 981d9e78f..f1a185143 100644 --- a/cpu/beta_cpu/commit.hh +++ b/cpu/beta_cpu/commit.hh @@ -59,6 +59,8 @@ class SimpleCommit public: SimpleCommit(Params ¶ms); + void regStats(); + void setCPU(FullCPU *cpu_ptr); void setTimeBuffer(TimeBuffer<TimeStruct> *tb_ptr); @@ -142,6 +144,17 @@ class SimpleCommit /** Commit width, in instructions. */ unsigned commitWidth; + + Stats::Scalar<> commitCommittedInsts; + Stats::Scalar<> commitSquashedInsts; + Stats::Scalar<> commitSquashEvents; + Stats::Scalar<> commitNonSpecStalls; + Stats::Scalar<> commitCommittedBranches; + Stats::Scalar<> commitCommittedLoads; + Stats::Scalar<> commitCommittedMemRefs; + Stats::Scalar<> branchMispredicts; + + Stats::Distribution<> n_committed_dist; }; #endif // __SIMPLE_COMMIT_HH__ diff --git a/cpu/beta_cpu/commit_impl.hh b/cpu/beta_cpu/commit_impl.hh index 45b8bc7de..9a69c9259 100644 --- a/cpu/beta_cpu/commit_impl.hh +++ b/cpu/beta_cpu/commit_impl.hh @@ -23,6 +23,51 @@ SimpleCommit<Impl>::SimpleCommit(Params ¶ms) template <class Impl> void +SimpleCommit<Impl>::regStats() +{ + commitCommittedInsts + .name(name() + ".commitCommittedInsts") + .desc("The number of committed instructions") + .prereq(commitCommittedInsts); + commitSquashedInsts + .name(name() + ".commitSquashedInsts") + .desc("The number of squashed insts skipped by commit") + .prereq(commitSquashedInsts); + commitSquashEvents + .name(name() + ".commitSquashEvents") + .desc("The number of times commit is told to squash") + .prereq(commitSquashEvents); + commitNonSpecStalls + .name(name() + ".commitNonSpecStalls") + .desc("The number of times commit has been forced to stall to " + "communicate backwards") + .prereq(commitNonSpecStalls); + commitCommittedBranches + .name(name() + ".commitCommittedBranches") + .desc("The number of committed branches") + .prereq(commitCommittedBranches); + commitCommittedLoads + .name(name() + ".commitCommittedLoads") + .desc("The number of committed loads") + .prereq(commitCommittedLoads); + commitCommittedMemRefs + .name(name() + ".commitCommittedMemRefs") + .desc("The number of committed memory references") + .prereq(commitCommittedMemRefs); + branchMispredicts + .name(name() + ".branchMispredicts") + .desc("The number of times a branch was mispredicted") + .prereq(branchMispredicts); + n_committed_dist + .init(0,commitWidth,1) + .name(name() + ".COM:committed_per_cycle") + .desc("Number of insts commited each cycle") + .flags(Stats::pdf) + ; +} + +template <class Impl> +void SimpleCommit<Impl>::setCPU(FullCPU *cpu_ptr) { DPRINTF(Commit, "Commit: Setting CPU pointer.\n"); @@ -143,12 +188,12 @@ SimpleCommit<Impl>::commit() // Should I also check if the commit stage is telling the ROB to squah? // This might be necessary to keep the same timing between the IQ and // the ROB... - if (robInfoFromIEW->iewInfo.squash) { + if (fromIEW->squash) { DPRINTF(Commit, "Commit: Squashing instructions in the ROB.\n"); _status = ROBSquashing; - InstSeqNum squashed_inst = robInfoFromIEW->iewInfo.squashedSeqNum; + InstSeqNum squashed_inst = fromIEW->squashedSeqNum; rob->squash(squashed_inst); @@ -162,15 +207,19 @@ SimpleCommit<Impl>::commit() // ROB is in the process of squashing. toIEW->commitInfo.robSquashing = true; - toIEW->commitInfo.branchMispredict = - robInfoFromIEW->iewInfo.branchMispredict; + toIEW->commitInfo.branchMispredict = fromIEW->branchMispredict; + + toIEW->commitInfo.branchTaken = fromIEW->branchTaken; + + toIEW->commitInfo.nextPC = fromIEW->nextPC; - toIEW->commitInfo.branchTaken = - robInfoFromIEW->iewInfo.branchTaken; + toIEW->commitInfo.mispredPC = fromIEW->mispredPC; - toIEW->commitInfo.nextPC = robInfoFromIEW->iewInfo.nextPC; + toIEW->commitInfo.globalHist = fromIEW->globalHist; - toIEW->commitInfo.mispredPC = robInfoFromIEW->iewInfo.mispredPC; + if (toIEW->commitInfo.branchMispredict) { + ++branchMispredicts; + } } if (_status != ROBSquashing) { @@ -237,6 +286,8 @@ SimpleCommit<Impl>::commitInsts() // inst in the ROB without affecting any other stages. rob->retireHead(); + ++commitSquashedInsts; + } else { // Increment the total number of non-speculative instructions // executed. @@ -249,7 +300,7 @@ SimpleCommit<Impl>::commitInsts() bool commit_success = commitHead(head_inst, num_committed); // Update what instruction we are looking at if the commit worked. - if(commit_success) { + if (commit_success) { ++num_committed; // Send back which instruction has been committed. @@ -258,7 +309,11 @@ SimpleCommit<Impl>::commitInsts() // sequence number instead (copy). toIEW->commitInfo.doneSeqNum = head_inst->seqNum; - cpu->instDone(); + ++commitCommittedInsts; + + if (!head_inst->isNop()) { + cpu->instDone(); + } } else { break; } @@ -267,6 +322,8 @@ SimpleCommit<Impl>::commitInsts() // Update the pointer to read the next instruction in the ROB. head_inst = rob->readHeadInst(); } + + n_committed_dist.sample(num_committed); } template <class Impl> @@ -276,18 +333,13 @@ SimpleCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num) // Make sure instruction is valid assert(head_inst); - Fault fault = No_Fault; - - // If the head instruction is a store or a load, then execute it - // because this simple model does no speculative memory access. - // Hopefully this covers all memory references. - // Also check if it's nonspeculative. Or a nop. Then it will be - // executed only when it reaches the head of the ROB. Actually - // executing a nop is a bit overkill... + // If the instruction is not executed yet, then it is a non-speculative + // or store inst. Signal backwards that it should be executed. if (!head_inst->isExecuted()) { // Keep this number correct. We have not yet actually executed // and committed this instruction. cpu->funcExeInst--; + if (head_inst->isStore() || head_inst->isNonSpeculative()) { DPRINTF(Commit, "Commit: Encountered a store or non-speculative " "instruction at the head of the ROB, PC %#x.\n", @@ -299,6 +351,8 @@ SimpleCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num) // it is executed. head_inst->clearCanCommit(); + ++commitNonSpecStalls; + return false; } else { panic("Commit: Trying to commit un-executed instruction " @@ -306,19 +360,6 @@ SimpleCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num) } } - // Check if memory access was successful. - if (fault != No_Fault) { - // Handle data cache miss here. In the future, set the status - // to data cache miss, then exit the stage. Have an event - // that handles commiting the head instruction, then setting - // the stage back to running, when the event is run. (just - // make sure that event is commit's run for that cycle) - panic("Commit: Load/store instruction failed, not sure what " - "to do.\n"); - // Also will want to clear the instruction's fault after being - // handled here so it's not handled again below. - } - // Now check if it's one of the special trap or barrier or // serializing instructions. if (head_inst->isThreadSync() || @@ -335,39 +376,43 @@ SimpleCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num) // Check if the instruction caused a fault. If so, trap. if (head_inst->getFault() != No_Fault) { + if (!head_inst->isNop()) { #ifdef FULL_SYSTEM - cpu->trap(fault); + cpu->trap(fault); #else // !FULL_SYSTEM - if (!head_inst->isNop()) { panic("fault (%d) detected @ PC %08p", head_inst->getFault(), head_inst->PC); - } #endif // FULL_SYSTEM + } } // Check if we're really ready to commit. If not then return false. // I'm pretty sure all instructions should be able to commit if they've // reached this far. For now leave this in as a check. if(!rob->isHeadReady()) { - DPRINTF(Commit, "Commit: Unable to commit head instruction!\n"); + panic("Commit: Unable to commit head instruction!\n"); return false; } // If it's a branch, then send back branch prediction update info // to the fetch stage. // This should be handled in the iew stage if a mispredict happens... -#if 0 + if (head_inst->isControl()) { +#if 0 toIEW->nextPC = head_inst->readPC(); //Maybe switch over to BTB incorrect. toIEW->btbMissed = head_inst->btbMiss(); toIEW->target = head_inst->nextPC; //Maybe also include global history information. //This simple version will have no branch prediction however. - } #endif + ++commitCommittedBranches; + } + + #if 0 // Check if the instruction has a destination register. // If so add the previous physical register of its logical register's @@ -383,8 +428,12 @@ SimpleCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num) // the LDSTQ will already have been told that a store has reached the head // of the ROB. Consider including communication if it's a store as well // to keep things orthagonal. - if (head_inst->isLoad()) { - toIEW->commitInfo.commitIsLoad = true; + if (head_inst->isMemRef()) { + ++commitCommittedMemRefs; + if (head_inst->isLoad()) { + toIEW->commitInfo.commitIsLoad = true; + ++commitCommittedLoads; + } } // Now that the instruction is going to be committed, finalize its diff --git a/cpu/beta_cpu/cpu_policy.hh b/cpu/beta_cpu/cpu_policy.hh index ec8460b77..1479eb191 100644 --- a/cpu/beta_cpu/cpu_policy.hh +++ b/cpu/beta_cpu/cpu_policy.hh @@ -22,7 +22,7 @@ template<class Impl> struct SimpleCPUPolicy { - typedef DefaultBPredUnit<Impl> BPredUnit; + typedef TwobitBPredUnit<Impl> BPredUnit; typedef PhysRegFile<Impl> RegFile; typedef SimpleFreeList FreeList; typedef SimpleRenameMap RenameMap; diff --git a/cpu/beta_cpu/decode.hh b/cpu/beta_cpu/decode.hh index be88a4b36..64e87290e 100644 --- a/cpu/beta_cpu/decode.hh +++ b/cpu/beta_cpu/decode.hh @@ -49,6 +49,8 @@ class SimpleDecode public: SimpleDecode(Params ¶ms); + void regStats(); + void setCPU(FullCPU *cpu_ptr); void setTimeBuffer(TimeBuffer<TimeStruct> *tb_ptr); @@ -128,6 +130,15 @@ class SimpleDecode * group of instructions, it can restart at the proper instruction. */ unsigned numInst; + + Stats::Scalar<> decodeIdleCycles; + Stats::Scalar<> decodeBlockedCycles; + Stats::Scalar<> decodeUnblockCycles; + Stats::Scalar<> decodeSquashCycles; + Stats::Scalar<> decodeBranchMispred; + Stats::Scalar<> decodeControlMispred; + Stats::Scalar<> decodeDecodedInsts; + Stats::Scalar<> decodeSquashedInsts; }; #endif // __SIMPLE_DECODE_HH__ diff --git a/cpu/beta_cpu/decode_impl.hh b/cpu/beta_cpu/decode_impl.hh index d0f46eaa5..8b20bf8bc 100644 --- a/cpu/beta_cpu/decode_impl.hh +++ b/cpu/beta_cpu/decode_impl.hh @@ -16,6 +16,45 @@ SimpleDecode<Impl>::SimpleDecode(Params ¶ms) _status = Idle; } +template <class Impl> +void +SimpleDecode<Impl>::regStats() +{ + decodeIdleCycles + .name(name() + ".decodeIdleCycles") + .desc("Number of cycles decode is idle") + .prereq(decodeIdleCycles); + decodeBlockedCycles + .name(name() + ".decodeBlockedCycles") + .desc("Number of cycles decode is blocked") + .prereq(decodeBlockedCycles); + decodeUnblockCycles + .name(name() + ".decodeUnblockCycles") + .desc("Number of cycles decode is unblocking") + .prereq(decodeUnblockCycles); + decodeSquashCycles + .name(name() + ".decodeSquashCycles") + .desc("Number of cycles decode is squashing") + .prereq(decodeSquashCycles); + decodeBranchMispred + .name(name() + ".decodeBranchMispred") + .desc("Number of times decode detected a branch misprediction") + .prereq(decodeBranchMispred); + decodeControlMispred + .name(name() + ".decodeControlMispred") + .desc("Number of times decode detected an instruction incorrectly" + " predicted as a control") + .prereq(decodeControlMispred); + decodeDecodedInsts + .name(name() + ".decodeDecodedInsts") + .desc("Number of instructions handled by decode") + .prereq(decodeDecodedInsts); + decodeSquashedInsts + .name(name() + ".decodeSquashedInsts") + .desc("Number of squashed instructions handled by decode") + .prereq(decodeSquashedInsts); +} + template<class Impl> void SimpleDecode<Impl>::setCPU(FullCPU *cpu_ptr) @@ -91,7 +130,7 @@ SimpleDecode<Impl>::unblock() // If there's still information in the skid buffer, then // continue to tell previous stages to stall. They will be - // able to restart once the skid buffer is empty. + // able to restart once the skid buffer is empty. if (!skidBuffer.empty()) { toFetch->decodeInfo.stall = true; } else { @@ -110,9 +149,12 @@ SimpleDecode<Impl>::squash(DynInstPtr &inst) "detected at decode.\n"); Addr new_PC = inst->nextPC; + toFetch->decodeInfo.branchMispredict = true; + toFetch->decodeInfo.doneSeqNum = inst->seqNum; toFetch->decodeInfo.predIncorrect = true; toFetch->decodeInfo.squash = true; toFetch->decodeInfo.nextPC = new_PC; + toFetch->decodeInfo.branchTaken = true; // Set status to squashing. _status = Squashing; @@ -164,6 +206,8 @@ SimpleDecode<Impl>::tick() // buffer were used. Remove those instructions and handle // the rest of unblocking. if (_status == Unblocking) { + ++decodeUnblockCycles; + if (fromFetch->size > 0) { // Add the current inputs to the skid buffer so they can be // reprocessed when this stage unblocks. @@ -173,6 +217,8 @@ SimpleDecode<Impl>::tick() unblock(); } } else if (_status == Blocked) { + ++decodeBlockedCycles; + if (fromFetch->size > 0) { block(); } @@ -197,6 +243,8 @@ SimpleDecode<Impl>::tick() squash(); } } else if (_status == Squashing) { + ++decodeSquashCycles; + if (!fromCommit->commitInfo.squash && !fromCommit->commitInfo.robSquashing) { _status = Running; @@ -228,17 +276,16 @@ SimpleDecode<Impl>::decode() // Check fetch queue to see if instructions are available. // If no available instructions, do nothing, unless this stage is // currently unblocking. - if (!fromFetch->insts[0] && _status != Unblocking) { + if (fromFetch->size == 0 && _status != Unblocking) { DPRINTF(Decode, "Decode: Nothing to do, breaking out early.\n"); // Should I change the status to idle? + ++decodeIdleCycles; return; } + // Might be better to use a base DynInst * instead? DynInstPtr inst; - // Instead have a class member variable that records which instruction - // was the last one that was ended on. At the tick() stage, it can - // check if that's equal to 0. If not, then don't pop stuff off. unsigned to_rename_index = 0; int insts_available = _status == Unblocking ? @@ -264,18 +311,10 @@ SimpleDecode<Impl>::decode() } #endif - // Check to make sure that instructions coming from fetch are valid. - // Normally at this stage the branch target of PC-relative branches - // should be computed here. However in this simple model all - // computation will take place at execute. Hence doneTargCalc() - // will always be false. while (insts_available > 0) { DPRINTF(Decode, "Decode: Sending instruction to rename.\n"); - // Might create some sort of accessor to get an instruction - // on a per thread basis. Or might be faster to just get - // a pointer to an array or list of instructions and use that - // within this code. + inst = _status == Unblocking ? skidBuffer.front().insts[numInst] : fromFetch->insts[numInst]; @@ -287,6 +326,8 @@ SimpleDecode<Impl>::decode() "squashed, skipping.\n", inst->seqNum, inst->readPC()); + ++decodeSquashedInsts; + ++numInst; --insts_available; @@ -305,16 +346,22 @@ SimpleDecode<Impl>::decode() if (inst->predTaken() && !inst->isControl()) { panic("Instruction predicted as a branch!"); + ++decodeControlMispred; // Might want to set some sort of boolean and just do // a check at the end squash(inst); break; } - // Ensure that the predicted branch target is the actual branch - // target if possible (branches that are PC relative). - if (inst->isControl() && inst->doneTargCalc()) { + // Go ahead and compute any PC-relative branches. + + if (inst->isDirectCtrl() && inst->isUncondCtrl() && + inst->numDestRegs() == 0 && inst->numSrcRegs() == 0) { + inst->execute(); + inst->setExecuted(); + if (inst->mispredicted()) { + ++decodeBranchMispred; // Might want to set some sort of boolean and just do // a check at the end squash(inst); @@ -322,6 +369,11 @@ SimpleDecode<Impl>::decode() } } + // Normally can check if a direct branch has the right target + // addr (either the immediate, or the branch PC + 4) and redirect + // fetch if it's incorrect. + + // Also check if instructions have no source registers. Mark // them as ready to issue at any time. Not sure if this check // should exist here or at a later stage; however it doesn't matter @@ -334,6 +386,7 @@ SimpleDecode<Impl>::decode() // Increment which instruction we're looking at. ++numInst; ++to_rename_index; + ++decodeDecodedInsts; --insts_available; } diff --git a/cpu/beta_cpu/fetch.hh b/cpu/beta_cpu/fetch.hh index e59a9df7f..4cfc2f167 100644 --- a/cpu/beta_cpu/fetch.hh +++ b/cpu/beta_cpu/fetch.hh @@ -14,6 +14,7 @@ #include "sim/eventq.hh" #include "cpu/pc_event.hh" #include "mem/mem_interface.hh" +#include "base/statistics.hh" /** * SimpleFetch class to fetch a single instruction each cycle. SimpleFetch @@ -59,6 +60,8 @@ class SimpleFetch /** SimpleFetch constructor. */ SimpleFetch(Params ¶ms); + void regStats(); + void setCPU(FullCPU *cpu_ptr); void setTimeBuffer(TimeBuffer<TimeStruct> *time_buffer); @@ -73,9 +76,13 @@ class SimpleFetch // private: // Figure out PC vs next PC and how it should be updated - void squash(Addr newPC); + void squash(const Addr &new_PC); private: + inline void doSquash(const Addr &new_PC); + + void squashFromDecode(const Addr &new_PC, const InstSeqNum &seq_num); + /** * Looks up in the branch predictor to see if the next PC should be * either next PC+=MachInst or a branch target. @@ -84,7 +91,27 @@ class SimpleFetch * the next PC will be. * @return Whether or not a branch was predicted as taken. */ - bool lookupAndUpdateNextPC(Addr &next_PC); + bool lookupAndUpdateNextPC(DynInstPtr &inst, Addr &next_PC); + + // Might not want this function... +// inline void recordGlobalHist(DynInstPtr &inst); + + /** + * Fetches the cache line that contains fetch_PC. Returns any + * fault that happened. Puts the data into the class variable + * cacheData. + * @params fetch_PC The PC address that is being fetched from. + * @return Any fault that occured. + */ + Fault fetchCacheLine(Addr fetch_PC); + + // Align an address (typically a PC) to the start of an I-cache block. + // We fold in the PISA 64- to 32-bit conversion here as well. + Addr icacheBlockAlignPC(Addr addr) + { + addr = ISA::realPCToFetchPC(addr); + return (addr & ~(cacheBlkMask)); + } public: class CacheCompletionEvent : public Event @@ -99,7 +126,7 @@ class SimpleFetch virtual const char *description(); }; - CacheCompletionEvent cacheCompletionEvent; +// CacheCompletionEvent cacheCompletionEvent; private: /** Pointer to the FullCPU. */ @@ -152,20 +179,32 @@ class SimpleFetch unsigned fetchWidth; /** Cache block size. */ - int blkSize; + int cacheBlkSize; /** Mask to get a cache block's address. */ - Addr cacheBlockMask; + Addr cacheBlkMask; /** The instruction being fetched. */ - MachInst inst; +// MachInst inst; + + /** The cache line being fetched. */ + uint8_t *cacheData; /** Size of instructions. */ int instSize; /** Icache stall statistics. */ -// Stats::Scalar<> icacheStallCycles; -// Counter lastIcacheStall; + Counter lastIcacheStall; + + Stats::Scalar<> icacheStallCycles; + Stats::Scalar<> fetchedInsts; + Stats::Scalar<> predictedBranches; + Stats::Scalar<> fetchCycles; + Stats::Scalar<> fetchSquashCycles; + Stats::Scalar<> fetchBlockedCycles; + Stats::Scalar<> fetchedCacheLines; + + Stats::Distribution<> fetch_nisn_dist; }; #endif //__SIMPLE_FETCH_HH__ diff --git a/cpu/beta_cpu/fetch_impl.hh b/cpu/beta_cpu/fetch_impl.hh index 93f7bf6d2..8c9cf9f41 100644 --- a/cpu/beta_cpu/fetch_impl.hh +++ b/cpu/beta_cpu/fetch_impl.hh @@ -1,10 +1,8 @@ -// Todo: Add in branch prediction. With probe path, should -// be able to specify -// size of data to fetch. Will be able to get full cache line. - -// Remove this later. +// Remove this later; used only for debugging. #define OPCODE(X) (X >> 26) & 0x3f + +#include "arch/alpha/byte_swap.hh" #include "cpu/exetrace.hh" #include "mem/base_mem.hh" #include "mem/mem_interface.hh" @@ -37,15 +35,14 @@ SimpleFetch<Impl>::CacheCompletionEvent::description() template<class Impl> SimpleFetch<Impl>::SimpleFetch(Params ¶ms) - : cacheCompletionEvent(this), + : //cacheCompletionEvent(this), icacheInterface(params.icacheInterface), branchPred(params), decodeToFetchDelay(params.decodeToFetchDelay), renameToFetchDelay(params.renameToFetchDelay), iewToFetchDelay(params.iewToFetchDelay), commitToFetchDelay(params.commitToFetchDelay), - fetchWidth(params.fetchWidth), - inst(0) + fetchWidth(params.fetchWidth) { // Set status to idle. _status = Idle; @@ -62,13 +59,63 @@ SimpleFetch<Impl>::SimpleFetch(Params ¶ms) memReq->data = new uint8_t[64]; // Size of cache block. - blkSize = icacheInterface ? icacheInterface->getBlockSize() : 64; + cacheBlkSize = icacheInterface ? icacheInterface->getBlockSize() : 64; // Create mask to get rid of offset bits. - cacheBlockMask = (blkSize - 1); + cacheBlkMask = (cacheBlkSize - 1); // Get the size of an instruction. instSize = sizeof(MachInst); + + // Create space to store a cache line. + cacheData = new uint8_t[cacheBlkSize]; +} + +template <class Impl> +void +SimpleFetch<Impl>::regStats() +{ + icacheStallCycles + .name(name() + ".icacheStallCycles") + .desc("Number of cycles fetch is stalled on an Icache miss") + .prereq(icacheStallCycles); + + fetchedInsts + .name(name() + ".fetchedInsts") + .desc("Number of instructions fetch has processed") + .prereq(fetchedInsts); + predictedBranches + .name(name() + ".predictedBranches") + .desc("Number of branches that fetch has predicted taken") + .prereq(predictedBranches); + fetchCycles + .name(name() + ".fetchCycles") + .desc("Number of cycles fetch has run and was not squashing or" + " blocked") + .prereq(fetchCycles); + fetchSquashCycles + .name(name() + ".fetchSquashCycles") + .desc("Number of cycles fetch has spent squashing") + .prereq(fetchSquashCycles); + fetchBlockedCycles + .name(name() + ".fetchBlockedCycles") + .desc("Number of cycles fetch has spent blocked") + .prereq(fetchBlockedCycles); + fetchedCacheLines + .name(name() + ".fetchedCacheLines") + .desc("Number of cache lines fetched") + .prereq(fetchedCacheLines); + + fetch_nisn_dist + .init(/* base value */ 0, + /* last value */ fetchWidth, + /* bucket size */ 1) + .name(name() + ".FETCH:rate_dist") + .desc("Number of instructions fetched each cycle (Total)") + .flags(Stats::pdf) + ; + + branchPred.regStats(); } template<class Impl> @@ -122,19 +169,40 @@ SimpleFetch<Impl>::processCacheCompletion() _status = IcacheMissComplete; } -template<class Impl> +#if 0 +template <class Impl> +inline void +SimpleFetch<Impl>::recordGlobalHist(DynInstPtr &inst) +{ + inst->setGlobalHist(branchPred.BPReadGlobalHist()); +} +#endif + +template <class Impl> bool -SimpleFetch<Impl>::lookupAndUpdateNextPC(Addr &next_PC) +SimpleFetch<Impl>::lookupAndUpdateNextPC(DynInstPtr &inst, Addr &next_PC) { -#if 1 // Do branch prediction check here. - bool predict_taken = branchPred.BPLookup(next_PC); - Addr predict_target; + // A bit of a misnomer...next_PC is actually the current PC until + // this function updates it. + bool predict_taken; + + if (!inst->isControl()) { + next_PC = next_PC + instSize; + inst->setPredTarg(next_PC); + return false; + } + + predict_taken = branchPred.predict(inst, next_PC); + +#if 0 + predict_taken = branchPred.BPLookup(next_PC) DPRINTF(Fetch, "Fetch: Branch predictor predicts taken? %i\n", predict_taken); - if (branchPred.BTBValid(next_PC)) { + // Only check the BTB if the BP has predicted taken. + if (predict_taken && branchPred.BTBValid(next_PC)) { predict_target = branchPred.BTBLookup(next_PC); DPRINTF(Fetch, "Fetch: BTB target is %#x.\n", predict_target); } else { @@ -142,42 +210,135 @@ SimpleFetch<Impl>::lookupAndUpdateNextPC(Addr &next_PC) DPRINTF(Fetch, "Fetch: BTB does not have a valid entry.\n"); } - // Now update the PC to fetch the next instruction in the cache - // line. - if (!predict_taken) { - next_PC = next_PC + instSize; - return false; - } else { - next_PC = predict_target; - return true; - } #endif + if (predict_taken) { + ++predictedBranches; + } -#if 0 - next_PC = next_PC + instSize; - return false; -#endif + return predict_taken; } -template<class Impl> -void -SimpleFetch<Impl>::squash(Addr new_PC) +template <class Impl> +Fault +SimpleFetch<Impl>::fetchCacheLine(Addr fetch_PC) +{ + // Check if the instruction exists within the cache. + // If it does, then proceed on to read the instruction and the rest + // of the instructions in the cache line until either the end of the + // cache line or a predicted taken branch is encountered. + +#ifdef FULL_SYSTEM + // Flag to say whether or not address is physical addr. + unsigned flags = cpu->inPalMode() ? PHYSICAL : 0; +#else + unsigned flags = 0; +#endif // FULL_SYSTEM + + Fault fault = No_Fault; + + // Align the fetch PC so it's at the start of a cache block. + fetch_PC = icacheBlockAlignPC(fetch_PC); + + // Setup the memReq to do a read of the first isntruction's address. + // Set the appropriate read size and flags as well. + memReq->cmd = Read; + memReq->reset(fetch_PC, cacheBlkSize, flags); + + // Translate the instruction request. + // Should this function be + // in the CPU class ? Probably...ITB/DTB should exist within the + // CPU. + + fault = cpu->translateInstReq(memReq); + + // In the case of faults, the fetch stage may need to stall and wait + // on what caused the fetch (ITB or Icache miss). + + // If translation was successful, attempt to read the first + // instruction. + if (fault == No_Fault) { + DPRINTF(Fetch, "Fetch: Doing instruction read.\n"); + fault = cpu->mem->read(memReq, cacheData); + // This read may change when the mem interface changes. + + fetchedCacheLines++; + } + + // Now do the timing access to see whether or not the instruction + // exists within the cache. + if (icacheInterface && fault == No_Fault) { + DPRINTF(Fetch, "Fetch: Doing timing memory access.\n"); + memReq->completionEvent = NULL; + + memReq->time = curTick; + + MemAccessResult result = icacheInterface->access(memReq); + + // If the cache missed (in this model functional and timing + // memories are different), then schedule an event to wake + // up this stage once the cache miss completes. + if (result != MA_HIT && icacheInterface->doEvents()) { + memReq->completionEvent = new CacheCompletionEvent(this); +// lastIcacheStall = curTick; + + // How does current model work as far as individual + // stages scheduling/unscheduling? + // Perhaps have only the main CPU scheduled/unscheduled, + // and have it choose what stages to run appropriately. + + DPRINTF(Fetch, "Fetch: Stalling due to icache miss.\n"); + _status = IcacheMissStall; + } + } + + return fault; +} + +template <class Impl> +inline void +SimpleFetch<Impl>::doSquash(const Addr &new_PC) { DPRINTF(Fetch, "Fetch: Squashing, setting PC to: %#x.\n", new_PC); cpu->setNextPC(new_PC + instSize); cpu->setPC(new_PC); - _status = Squashing; - // Clear the icache miss if it's outstanding. if (_status == IcacheMissStall && icacheInterface) { + DPRINTF(Fetch, "Fetch: Squashing outstanding Icache miss.\n"); // @todo: Use an actual thread number here. icacheInterface->squash(0); } - // Tell the CPU to remove any instructions that aren't currently - // in the ROB (instructions in flight that were killed). + _status = Squashing; + + ++fetchSquashCycles; +} + +template<class Impl> +void +SimpleFetch<Impl>::squashFromDecode(const Addr &new_PC, + const InstSeqNum &seq_num) +{ + DPRINTF(Fetch, "Fetch: Squashing from decode.\n"); + + doSquash(new_PC); + + // Tell the CPU to remove any instructions that are in flight between + // fetch and decode. + cpu->removeInstsUntil(seq_num); + +} + +template <class Impl> +void +SimpleFetch<Impl>::squash(const Addr &new_PC) +{ + DPRINTF(Fetch, "Fetch: Squash from commit.\n"); + + doSquash(new_PC); + + // Tell the CPU to remove any instructions that are not in the ROB. cpu->removeInstsNotInROB(); } @@ -185,7 +346,6 @@ template<class Impl> void SimpleFetch<Impl>::tick() { -#if 1 // Check squash signals from commit. if (fromCommit->commitInfo.squash) { DPRINTF(Fetch, "Fetch: Squashing instructions due to squash " @@ -196,13 +356,18 @@ SimpleFetch<Impl>::tick() // Also check if there's a mispredict that happened. if (fromCommit->commitInfo.branchMispredict) { - branchPred.BPUpdate(fromCommit->commitInfo.mispredPC, - fromCommit->commitInfo.branchTaken); - branchPred.BTBUpdate(fromCommit->commitInfo.mispredPC, - fromCommit->commitInfo.nextPC); + branchPred.squash(fromCommit->commitInfo.doneSeqNum, + fromCommit->commitInfo.nextPC, + fromCommit->commitInfo.branchTaken); + } else { + branchPred.squash(fromCommit->commitInfo.doneSeqNum); } return; + } else if (fromCommit->commitInfo.doneSeqNum) { + // Update the branch predictor if it wasn't a squashed instruction + // that was braodcasted. + branchPred.update(fromCommit->commitInfo.doneSeqNum); } // Check ROB squash signals from commit. @@ -211,6 +376,8 @@ SimpleFetch<Impl>::tick() // Continue to squash. _status = Squashing; + + ++fetchSquashCycles; return; } @@ -220,22 +387,22 @@ SimpleFetch<Impl>::tick() "from decode.\n"); // Update the branch predictor. - if (fromCommit->decodeInfo.branchMispredict) { - branchPred.BPUpdate(fromDecode->decodeInfo.mispredPC, - fromDecode->decodeInfo.branchTaken); - branchPred.BTBUpdate(fromDecode->decodeInfo.mispredPC, - fromDecode->decodeInfo.nextPC); + if (fromDecode->decodeInfo.branchMispredict) { + branchPred.squash(fromDecode->decodeInfo.doneSeqNum, + fromDecode->decodeInfo.nextPC, + fromDecode->decodeInfo.branchTaken); + } else { + branchPred.squash(fromDecode->decodeInfo.doneSeqNum); } if (_status != Squashing) { // Squash unless we're already squashing? - squash(fromDecode->decodeInfo.nextPC); + squashFromDecode(fromDecode->decodeInfo.nextPC, + fromDecode->decodeInfo.doneSeqNum); return; } } - - // Check if any of the stall signals are high. if (fromDecode->decodeInfo.stall || fromRename->renameInfo.stall || @@ -253,12 +420,15 @@ SimpleFetch<Impl>::tick() fromCommit->commitInfo.stall); _status = Blocked; + + ++fetchBlockedCycles; return; } else if (_status == Blocked) { // Unblock stage if status is currently blocked and none of the // stall signals are being held high. _status = Running; + ++fetchBlockedCycles; return; } @@ -273,74 +443,15 @@ SimpleFetch<Impl>::tick() // Switch status to running _status = Running; + + ++fetchSquashCycles; } else if (_status != IcacheMissStall) { DPRINTF(Fetch, "Fetch: Running stage.\n"); - fetch(); - } -#endif - -#if 0 - if (_status != Blocked && - _status != Squashing && - _status != IcacheMissStall) { - DPRINTF(Fetch, "Fetch: Running stage.\n"); + ++fetchCycles; fetch(); - } else if (_status == Blocked) { - // If still being told to stall, do nothing. - if (fromDecode->decodeInfo.stall || - fromRename->renameInfo.stall || - fromIEW->iewInfo.stall || - fromCommit->commitInfo.stall) - { - DPRINTF(Fetch, "Fetch: Stalling stage.\n"); - DPRINTF(Fetch, "Fetch: Statuses: Decode: %i Rename: %i IEW: %i " - "Commit: %i\n", - fromDecode->decodeInfo.stall, - fromRename->renameInfo.stall, - fromIEW->iewInfo.stall, - fromCommit->commitInfo.stall); - } else { - - DPRINTF(Fetch, "Fetch: Done blocking.\n"); - _status = Running; - } - - if (fromCommit->commitInfo.squash) { - DPRINTF(Fetch, "Fetch: Squashing instructions due to squash " - "from commit.\n"); - squash(fromCommit->commitInfo.nextPC); - return; - } else if (fromDecode->decodeInfo.squash) { - DPRINTF(Fetch, "Fetch: Squashing instructions due to squash " - "from decode.\n"); - squash(fromDecode->decodeInfo.nextPC); - return; - } else if (fromCommit->commitInfo.robSquashing) { - DPRINTF(Fetch, "Fetch: ROB is still squashing.\n"); - _status = Squashing; - return; - } - } else if (_status == Squashing) { - // If there are no squash signals then change back to running. - // Note that when a squash starts happening, commitInfo.squash will - // be high. But if the squash is still in progress, then only - // commitInfo.robSquashing will be high. - if (!fromCommit->commitInfo.squash && - !fromCommit->commitInfo.robSquashing) { - - DPRINTF(Fetch, "Fetch: Done squashing.\n"); - _status = Running; - } else if (fromCommit->commitInfo.squash) { - // If there's a new squash, then start squashing again. - squash(fromCommit->commitInfo.nextPC); - } else { - // Purely a debugging statement. - DPRINTF(Fetch, "Fetch: ROB still squashing.\n"); - } } -#endif } template<class Impl> @@ -351,13 +462,6 @@ SimpleFetch<Impl>::fetch() // Start actual fetch ////////////////////////////////////////// -#ifdef FULL_SYSTEM - // Flag to say whether or not address is physical addr. - unsigned flags = cpu->inPalMode() ? PHYSICAL : 0; -#else - unsigned flags = 0; -#endif // FULL_SYSTEM - // The current PC. Addr fetch_PC = cpu->readPC(); @@ -379,64 +483,14 @@ SimpleFetch<Impl>::fetch() "instruction, starting at PC %08p.\n", fetch_PC); - // Otherwise check if the instruction exists within the cache. - // If it does, then proceed on to read the instruction and the rest - // of the instructions in the cache line until either the end of the - // cache line or a predicted taken branch is encountered. - // Note that this simply checks if the first instruction exists - // within the cache, assuming the rest of the cache line also exists - // within the cache. - - // Setup the memReq to do a read of the first isntruction's address. - // Set the appropriate read size and flags as well. - memReq->cmd = Read; - memReq->reset(fetch_PC, instSize, flags); - - // Translate the instruction request. - // Should this function be - // in the CPU class ? Probably...ITB/DTB should exist within the - // CPU. - - fault = cpu->translateInstReq(memReq); - - // In the case of faults, the fetch stage may need to stall and wait - // on what caused the fetch (ITB or Icache miss). - - // If translation was successful, attempt to read the first - // instruction. - if (fault == No_Fault) { - DPRINTF(Fetch, "Fetch: Doing instruction read.\n"); - fault = cpu->mem->read(memReq, inst); - // This read may change when the mem interface changes. - } - - // Now do the timing access to see whether or not the instruction - // exists within the cache. - if (icacheInterface && fault == No_Fault) { - DPRINTF(Fetch, "Fetch: Doing timing memory access.\n"); - memReq->completionEvent = NULL; - - memReq->time = curTick; - - MemAccessResult result = icacheInterface->access(memReq); - - // If the cache missed (in this model functional and timing - // memories are different), then schedule an event to wake - // up this stage once the cache miss completes. - if (result != MA_HIT && icacheInterface->doEvents()) { - memReq->completionEvent = &cacheCompletionEvent; -// lastIcacheStall = curTick; - - // How does current model work as far as individual - // stages scheduling/unscheduling? - // Perhaps have only the main CPU scheduled/unscheduled, - // and have it choose what stages to run appropriately. + fault = fetchCacheLine(fetch_PC); + } - DPRINTF(Fetch, "Fetch: Stalling due to icache miss.\n"); - _status = IcacheMissStall; - return; - } - } + // If we had a stall due to an icache miss, then return. It'd + // be nicer if this were handled through the kind of fault that + // is returned by the function. + if (_status == IcacheMissStall) { + return; } // As far as timing goes, the CPU will need to send an event through @@ -446,11 +500,15 @@ SimpleFetch<Impl>::fetch() Addr next_PC = fetch_PC; InstSeqNum inst_seq; + MachInst inst; + unsigned offset = fetch_PC & cacheBlkMask; + unsigned fetched; - // If the read of the first instruction was successful, then grab the - // instructions from the rest of the cache line and put them into the - // queue heading to decode. if (fault == No_Fault) { + // If the read of the first instruction was successful, then grab the + // instructions from the rest of the cache line and put them into the + // queue heading to decode. + DPRINTF(Fetch, "Fetch: Adding instructions to queue to decode.\n"); ////////////////////////// @@ -461,124 +519,59 @@ SimpleFetch<Impl>::fetch() // ended this fetch block. bool predicted_branch = false; - // Might want to keep track of various stats. -// numLinesFetched++; - - // Get a sequence number. - inst_seq = cpu->getAndIncrementInstSeq(); - - // Update the next PC; it either is PC+sizeof(MachInst), or - // branch_target. Check whether or not a branch was taken. - predicted_branch = lookupAndUpdateNextPC(next_PC); - - // Because the first instruction was already fetched, create the - // DynInst and put it into the queue to decode. - DynInstPtr instruction = new DynInst(inst, fetch_PC, next_PC, - inst_seq, cpu); - - DPRINTF(Fetch, "Fetch: Instruction %i created, with PC %#x\n", - inst_seq, instruction->readPC()); - DPRINTF(Fetch, "Fetch: Instruction opcode is: %03p\n", - OPCODE(inst)); - - instruction->traceData = - Trace::getInstRecord(curTick, cpu->xcBase(), cpu, - instruction->staticInst, - instruction->readPC(), 0); - - cpu->addInst(instruction); - - // Write the instruction to the first slot in the queue - // that heads to decode. - toDecode->insts[0] = instruction; - - toDecode->size++; - - fetch_PC = next_PC; - - ////////////////////////// - // Fetch other instructions - ////////////////////////// - - // Obtain the index into the cache line by getting only the low - // order bits. Will need to do shifting as well. - int line_index = fetch_PC & cacheBlockMask; - - // Take instructions and put them into the queue heading to decode. - // Then read the next instruction in the cache line. Continue - // until either all of the fetch bandwidth is used (not an issue for - // non-SMT), or the end of the cache line is reached. Note that - // this assumes standard cachelines, and not something like a trace - // cache where lines might not end at cache-line size aligned - // addresses. - // @todo: Fix the horrible amount of translates/reads that must - // take place due to reading an entire cacheline. Ideally it - // should all take place at once, return an array of binary - // instructions, which can then be used to get all the instructions - // needed. Figure out if I can roll it back into one loop. - for (int fetched = 1; - line_index < blkSize && + for (fetched = 0; + offset < cacheBlkSize && fetched < fetchWidth && !predicted_branch; - line_index+=instSize, ++fetched) + ++fetched) { - // Reset the mem request to setup the read of the next - // instruction. - memReq->reset(fetch_PC, instSize, flags); - // Translate the instruction request. - fault = cpu->translateInstReq(memReq); + // Get a sequence number. + inst_seq = cpu->getAndIncrementInstSeq(); - // Read instruction. - if (fault == No_Fault) { - fault = cpu->mem->read(memReq, inst); - } + // Make sure this is a valid index. + assert(offset <= cacheBlkSize - instSize); - // Check if there was a fault. - if (fault != No_Fault) { - panic("Fetch: Read of instruction faulted when it should " - "succeed; most likely exceeding cache line.\n"); - } + // Get the instruction from the array of the cache line. + inst = htoa(*reinterpret_cast<MachInst *> + (&cacheData[offset])); - // Get a sequence number. - inst_seq = cpu->getAndIncrementInstSeq(); + // Create a new DynInst from the instruction fetched. + DynInstPtr instruction = new DynInst(inst, fetch_PC, next_PC, + inst_seq, cpu); - predicted_branch = lookupAndUpdateNextPC(next_PC); + DPRINTF(Fetch, "Fetch: Instruction %i created, with PC %#x\n", + inst_seq, instruction->readPC()); - // Create the actual DynInst. Parameters are: - // DynInst(instruction, PC, predicted PC, CPU pointer). - // Because this simple model has no branch prediction, the - // predicted PC will simply be PC+sizeof(MachInst). - // Update to actually use a branch predictor to predict the - // target in the future. - DynInstPtr instruction = - new DynInst(inst, fetch_PC, next_PC, inst_seq, cpu); + DPRINTF(Fetch, "Fetch: Instruction opcode is: %03p\n", + OPCODE(inst)); instruction->traceData = Trace::getInstRecord(curTick, cpu->xcBase(), cpu, instruction->staticInst, instruction->readPC(), 0); - DPRINTF(Fetch, "Fetch: Instruction %i created, with PC %#x\n", - inst_seq, instruction->readPC()); - DPRINTF(Fetch, "Fetch: Instruction opcode is: %03p\n", - OPCODE(inst)); + predicted_branch = lookupAndUpdateNextPC(instruction, next_PC); + // Add instruction to the CPU's list of instructions. cpu->addInst(instruction); - // Write the instruction to the proper slot in the queue + // Write the instruction to the first slot in the queue // that heads to decode. toDecode->insts[fetched] = instruction; toDecode->size++; - // Might want to keep track of various stats. -// numInstsFetched++; + // Increment stat of fetched instructions. + ++fetchedInsts; - // Update the PC with the next PC. + // Move to the next instruction, unless we have a branch. fetch_PC = next_PC; + + offset+= instSize; } + fetch_nisn_dist.sample(fetched); } // Now that fetching is completed, update the PC to signify what the next @@ -592,6 +585,12 @@ SimpleFetch<Impl>::fetch() cpu->setPC(next_PC); cpu->setNextPC(next_PC + instSize); } else { + // If the issue was an icache miss, then we can just return and + // wait until it is handled. + if (_status == IcacheMissStall) { + return; + } + // Handle the fault. // This stage will not be able to continue until all the ROB // slots are empty, at which point the fault can be handled. diff --git a/cpu/beta_cpu/free_list.hh b/cpu/beta_cpu/free_list.hh index 0d2b2c421..e8e75f7ec 100644 --- a/cpu/beta_cpu/free_list.hh +++ b/cpu/beta_cpu/free_list.hh @@ -6,11 +6,9 @@ #include "arch/alpha/isa_traits.hh" #include "cpu/beta_cpu/comm.hh" +#include "base/traceflags.hh" #include "base/trace.hh" -// Question: Do I even need the number of logical registers? -// How to avoid freeing registers instantly? Same with ROB entries. - /** * FreeList class that simply holds the list of free integer and floating * point registers. Can request for a free register of either type, and @@ -153,8 +151,6 @@ SimpleFreeList::addIntReg(PhysRegIndex freed_reg) assert(!freeIntRegsScoreboard[freed_reg]); freeIntRegsScoreboard[freed_reg] = 1; - //Might want to add in a check for whether or not this register is - //already in there. A bit vector or something similar would be useful. freeIntRegs.push(freed_reg); } @@ -167,8 +163,6 @@ SimpleFreeList::addFloatReg(PhysRegIndex freed_reg) assert(!freeFloatRegsScoreboard[freed_reg]); freeFloatRegsScoreboard[freed_reg] = 1; - //Might want to add in a check for whether or not this register is - //already in there. A bit vector or something similar would be useful. freeFloatRegs.push(freed_reg); } diff --git a/cpu/beta_cpu/full_cpu.cc b/cpu/beta_cpu/full_cpu.cc index abeb4cb87..d5228601c 100644 --- a/cpu/beta_cpu/full_cpu.cc +++ b/cpu/beta_cpu/full_cpu.cc @@ -168,6 +168,13 @@ FullBetaCPU<Impl>::~FullBetaCPU() template <class Impl> void +FullBetaCPU<Impl>::fullCPURegStats() +{ + // Register any of the FullCPU's stats here. +} + +template <class Impl> +void FullBetaCPU<Impl>::tick() { DPRINTF(FullCPU, "\n\nFullCPU: Ticking main, FullBetaCPU.\n"); @@ -424,19 +431,17 @@ template <class Impl> void FullBetaCPU<Impl>::removeFrontInst(DynInstPtr &inst) { - DynInstPtr inst_to_delete; + DynInstPtr inst_to_remove; - // The front instruction should be the same one being asked to be deleted. + // The front instruction should be the same one being asked to be removed. assert(instList.front() == inst); // Remove the front instruction. - inst_to_delete = inst; + inst_to_remove = inst; instList.pop_front(); - DPRINTF(FullCPU, "FullCPU: Deleting committed instruction %#x, PC %#x\n", - inst_to_delete, inst_to_delete->readPC()); - -// delete inst_to_delete; + DPRINTF(FullCPU, "FullCPU: Removing committed instruction %#x, PC %#x\n", + inst_to_remove, inst_to_remove->readPC()); } template <class Impl> @@ -453,6 +458,33 @@ FullBetaCPU<Impl>::removeInstsNotInROB() template <class Impl> void +FullBetaCPU<Impl>::removeInstsUntil(const InstSeqNum &seq_num) +{ + DPRINTF(FullCPU, "FullCPU: Deleting instructions from instruction " + "list.\n"); + + DynInstPtr inst_to_delete; + + while (instList.back()->seqNum > seq_num) { + assert(!instList.empty()); + + // Obtain the pointer to the instruction. + inst_to_delete = instList.back(); + + DPRINTF(FullCPU, "FullCPU: Removing instruction %i, PC %#x\n", + inst_to_delete->seqNum, inst_to_delete->readPC()); + + // Remove the instruction from the list. + instList.pop_back(); + + // Mark it as squashed. + inst_to_delete->setSquashed(); + } + +} + +template <class Impl> +void FullBetaCPU<Impl>::removeAllInsts() { instList.clear(); diff --git a/cpu/beta_cpu/full_cpu.hh b/cpu/beta_cpu/full_cpu.hh index cf753ad67..bde7e5bbf 100644 --- a/cpu/beta_cpu/full_cpu.hh +++ b/cpu/beta_cpu/full_cpu.hh @@ -115,6 +115,8 @@ class FullBetaCPU : public BaseFullCPU void init(); + void fullCPURegStats(); + void activateContext(int thread_num, int delay); void suspendContext(int thread_num); void deallocateContext(int thread_num); @@ -205,6 +207,9 @@ class FullBetaCPU : public BaseFullCPU /** Remove all instructions that are not currently in the ROB. */ void removeInstsNotInROB(); + /** Remove all instructions younger than the given sequence number. */ + void removeInstsUntil(const InstSeqNum &seq_num); + /** Remove all instructions from the list. */ void removeAllInsts(); diff --git a/cpu/beta_cpu/iew.hh b/cpu/beta_cpu/iew.hh index de408ef0c..90bd39e7f 100644 --- a/cpu/beta_cpu/iew.hh +++ b/cpu/beta_cpu/iew.hh @@ -9,6 +9,7 @@ #include "base/timebuf.hh" #include "cpu/beta_cpu/comm.hh" +#include "base/statistics.hh" //Can IEW even stall? Space should be available/allocated already...maybe //if there's not enough write ports on the ROB or waiting for CDB @@ -50,7 +51,9 @@ class SimpleIEW public: void squash(); - void squash(DynInstPtr &inst); + void squashDueToBranch(DynInstPtr &inst); + + void squashDueToMem(DynInstPtr &inst); void block(); @@ -59,6 +62,8 @@ class SimpleIEW public: SimpleIEW(Params ¶ms); + void regStats(); + void setCPU(FullCPU *cpu_ptr); void setTimeBuffer(TimeBuffer<TimeStruct> *tb_ptr); @@ -76,6 +81,10 @@ class SimpleIEW void iew(); private: + void dispatchInsts(); + + void executeInsts(); + //Interfaces to objects inside and outside of IEW. /** Time buffer interface. */ TimeBuffer<TimeStruct> *timeBuffer; @@ -159,9 +168,23 @@ class SimpleIEW */ unsigned cyclesSquashing; - //Will implement later - //Load queue interface (probably one and the same) - //Store queue interface + Stats::Scalar<> iewIdleCycles; + Stats::Scalar<> iewSquashCycles; + Stats::Scalar<> iewBlockCycles; + Stats::Scalar<> iewUnblockCycles; +// Stats::Scalar<> iewWBInsts; + Stats::Scalar<> iewDispatchedInsts; + Stats::Scalar<> iewDispSquashedInsts; + Stats::Scalar<> iewDispLoadInsts; + Stats::Scalar<> iewDispStoreInsts; + Stats::Scalar<> iewDispNonSpecInsts; + Stats::Scalar<> iewIQFullEvents; + Stats::Scalar<> iewExecutedInsts; + Stats::Scalar<> iewExecLoadInsts; + Stats::Scalar<> iewExecStoreInsts; + Stats::Scalar<> iewExecSquashedInsts; + Stats::Scalar<> memOrderViolationEvents; + Stats::Scalar<> predictedTakenIncorrect; }; #endif diff --git a/cpu/beta_cpu/iew_impl.hh b/cpu/beta_cpu/iew_impl.hh index 521ce77f6..2bfd6bae9 100644 --- a/cpu/beta_cpu/iew_impl.hh +++ b/cpu/beta_cpu/iew_impl.hh @@ -38,6 +38,79 @@ SimpleIEW<Impl, IQ>::SimpleIEW(Params ¶ms) instQueue.setIssueToExecuteQueue(&issueToExecQueue); } +template <class Impl, class IQ> +void +SimpleIEW<Impl, IQ>::regStats() +{ + instQueue.regStats(); + + iewIdleCycles + .name(name() + ".iewIdleCycles") + .desc("Number of cycles IEW is idle"); + + iewSquashCycles + .name(name() + ".iewSquashCycles") + .desc("Number of cycles IEW is squashing"); + + iewBlockCycles + .name(name() + ".iewBlockCycles") + .desc("Number of cycles IEW is blocking"); + + iewUnblockCycles + .name(name() + ".iewUnblockCycles") + .desc("Number of cycles IEW is unblocking"); + +// iewWBInsts; + + iewDispatchedInsts + .name(name() + ".iewDispatchedInsts") + .desc("Number of instructions dispatched to IQ"); + + iewDispSquashedInsts + .name(name() + ".iewDispSquashedInsts") + .desc("Number of squashed instructions skipped by dispatch"); + + iewDispLoadInsts + .name(name() + ".iewDispLoadInsts") + .desc("Number of dispatched load instructions"); + + iewDispStoreInsts + .name(name() + ".iewDispStoreInsts") + .desc("Number of dispatched store instructions"); + + iewDispNonSpecInsts + .name(name() + ".iewDispNonSpecInsts") + .desc("Number of dispatched non-speculative instructions"); + + iewIQFullEvents + .name(name() + ".iewIQFullEvents") + .desc("Number of times the IQ has become full, causing a stall"); + + iewExecutedInsts + .name(name() + ".iewExecutedInsts") + .desc("Number of executed instructions"); + + iewExecLoadInsts + .name(name() + ".iewExecLoadInsts") + .desc("Number of load instructions executed"); + + iewExecStoreInsts + .name(name() + ".iewExecStoreInsts") + .desc("Number of store instructions executed"); + + iewExecSquashedInsts + .name(name() + ".iewExecSquashedInsts") + .desc("Number of squashed instructions skipped in execute"); + + memOrderViolationEvents + .name(name() + ".memOrderViolationEvents") + .desc("Number of memory order violations"); + + predictedTakenIncorrect + .name(name() + ".predictedTakenIncorrect") + .desc("Number of branches that were predicted taken incorrectly"); +} + template<class Impl, class IQ> void SimpleIEW<Impl, IQ>::setCPU(FullCPU *cpu_ptr) @@ -158,7 +231,7 @@ SimpleIEW<Impl, IQ>::squash() template<class Impl, class IQ> void -SimpleIEW<Impl, IQ>::squash(DynInstPtr &inst) +SimpleIEW<Impl, IQ>::squashDueToBranch(DynInstPtr &inst) { DPRINTF(IEW, "IEW: Squashing from a specific instruction, PC: %#x.\n", inst->PC); @@ -167,119 +240,39 @@ SimpleIEW<Impl, IQ>::squash(DynInstPtr &inst) _status = Squashing; // Tell rename to squash through the time buffer. - toRename->iewInfo.squash = true; + toCommit->squash = true; // Also send PC update information back to prior stages. - toRename->iewInfo.squashedSeqNum = inst->seqNum; - toRename->iewInfo.mispredPC = inst->readPC(); - toRename->iewInfo.nextPC = inst->readCalcTarg(); - toRename->iewInfo.branchMispredict = true; + toCommit->squashedSeqNum = inst->seqNum; + toCommit->mispredPC = inst->readPC(); + toCommit->nextPC = inst->readCalcTarg(); + toCommit->branchMispredict = true; // Prediction was incorrect, so send back inverse. - toRename->iewInfo.branchTaken = !(inst->predTaken()); + toCommit->branchTaken = inst->readCalcTarg() != + (inst->readPC() + sizeof(MachInst)); +// toCommit->globalHist = inst->readGlobalHist(); } template<class Impl, class IQ> void -SimpleIEW<Impl, IQ>::tick() +SimpleIEW<Impl, IQ>::squashDueToMem(DynInstPtr &inst) { - // Considering putting all the state-determining stuff in this section. - - // Try to fill up issue queue with as many instructions as bandwidth - // allows. - // Decode should try to execute as many instructions as its bandwidth - // will allow, as long as it is not currently blocked. - - // Check if the stage is in a running status. - if (_status != Blocked && _status != Squashing) { - DPRINTF(IEW, "IEW: Status is not blocked, attempting to run " - "stage.\n"); - iew(); - - // If it's currently unblocking, check to see if it should switch - // to running. - if (_status == Unblocking) { - unblock(); - } - } else if (_status == Squashing) { - - DPRINTF(IEW, "IEW: Still squashing.\n"); - - // Check if stage should remain squashing. Stop squashing if the - // squash signal clears. - if (!fromCommit->commitInfo.squash && - !fromCommit->commitInfo.robSquashing) { - DPRINTF(IEW, "IEW: Done squashing, changing status to " - "running.\n"); - - _status = Running; - instQueue.stopSquash(); - } else { - instQueue.doSquash(); - } - - // Also should advance its own time buffers if the stage ran. - // Not sure about this... -// issueToExecQueue.advance(); - } else if (_status == Blocked) { - // Continue to tell previous stage to stall. - toRename->iewInfo.stall = true; - - // Check if possible stall conditions have cleared. - if (!fromCommit->commitInfo.stall && - !instQueue.isFull()) { - DPRINTF(IEW, "IEW: Stall signals cleared, going to unblock.\n"); - _status = Unblocking; - } - - // If there's still instructions coming from rename, continue to - // put them on the skid buffer. - if (fromRename->insts[0]) { - block(); - } - - if (fromCommit->commitInfo.squash || - fromCommit->commitInfo.robSquashing) { - squash(); - } - } - - // @todo: Maybe put these at the beginning, so if it's idle it can - // return early. - // Write back number of free IQ entries here. - toRename->iewInfo.freeIQEntries = instQueue.numFreeEntries(); - - // Check the committed load/store signals to see if there's a load - // or store to commit. Also check if it's being told to execute a - // nonspeculative instruction. - if (fromCommit->commitInfo.commitIsStore) { - ldstQueue.commitStores(fromCommit->commitInfo.doneSeqNum); - } else if (fromCommit->commitInfo.commitIsLoad) { - ldstQueue.commitLoads(fromCommit->commitInfo.doneSeqNum); - } - - if (fromCommit->commitInfo.nonSpecSeqNum != 0) { - instQueue.scheduleNonSpec(fromCommit->commitInfo.nonSpecSeqNum); - } + DPRINTF(IEW, "IEW: Squashing from a specific instruction, PC: %#x.\n", + inst->PC); + // Perhaps leave the squashing up to the ROB stage to tell it when to + // squash? + _status = Squashing; - DPRINTF(IEW, "IEW: IQ has %i free entries.\n", - instQueue.numFreeEntries()); + // Tell rename to squash through the time buffer. + toCommit->squash = true; + // Also send PC update information back to prior stages. + toCommit->squashedSeqNum = inst->seqNum; + toCommit->nextPC = inst->readCalcTarg(); } -template<class Impl, class IQ> +template <class Impl, class IQ> void -SimpleIEW<Impl, IQ>::iew() +SimpleIEW<Impl, IQ>::dispatchInsts() { - // Might want to put all state checks in the tick() function. - // Check if being told to stall from commit. - if (fromCommit->commitInfo.stall) { - block(); - return; - } else if (fromCommit->commitInfo.squash || - fromCommit->commitInfo.robSquashing) { - // Also check if commit is telling this stage to squash. - squash(); - return; - } - //////////////////////////////////////// // DISPATCH/ISSUE stage //////////////////////////////////////// @@ -289,29 +282,36 @@ SimpleIEW<Impl, IQ>::iew() // Check if there are any instructions coming from rename, and we're. // not squashing. - if (fromRename->insts[0] && _status != Squashing) { + if (fromRename->size > 0) { + int insts_to_add = fromRename->size; // Loop through the instructions, putting them in the instruction // queue. - for (int inst_num = 0; inst_num < issueReadWidth; ++inst_num) + for (int inst_num = 0; inst_num < insts_to_add; ++inst_num) { DynInstPtr inst = fromRename->insts[inst_num]; // Make sure there's a valid instruction there. - if (!inst) - break; + assert(inst); DPRINTF(IEW, "IEW: Issue: Adding PC %#x to IQ.\n", inst->readPC()); - // If it's a memory reference, don't put it in the - // instruction queue. These will only be executed at commit. - // Do the same for nonspeculative instructions and nops. // Be sure to mark these instructions as ready so that the // commit stage can go ahead and execute them, and mark // them as issued so the IQ doesn't reprocess them. if (inst->isSquashed()) { + ++iewDispSquashedInsts; continue; + } else if (instQueue.isFull()) { + DPRINTF(IEW, "IEW: Issue: IQ has become full.\n"); + // Call function to start blocking. + block(); + // Tell previous stage to stall. + toRename->iewInfo.stall = true; + + ++iewIQFullEvents; + break; } else if (inst->isLoad()) { DPRINTF(IEW, "IEW: Issue: Memory instruction " "encountered, adding to LDSTQ.\n"); @@ -320,6 +320,7 @@ SimpleIEW<Impl, IQ>::iew() // memory access. ldstQueue.insertLoad(inst); + ++iewDispLoadInsts; } else if (inst->isStore()) { ldstQueue.insertStore(inst); @@ -327,10 +328,15 @@ SimpleIEW<Impl, IQ>::iew() // the commit stage will try committing it, and then // once commit realizes it's a store it will send back // a signal to this stage to issue and execute that - // store. + // store. Change to be a bit that says the instruction + // has extra work to do at commit. inst->setCanCommit(); instQueue.insertNonSpec(inst); + + ++iewDispStoreInsts; + ++iewDispNonSpecInsts; + continue; } else if (inst->isNonSpeculative()) { DPRINTF(IEW, "IEW: Issue: Nonspeculative instruction " @@ -342,6 +348,8 @@ SimpleIEW<Impl, IQ>::iew() // Specificall insert it as nonspeculative. instQueue.insertNonSpec(inst); + ++iewDispNonSpecInsts; + continue; } else if (inst->isNop()) { DPRINTF(IEW, "IEW: Issue: Nop instruction encountered " @@ -352,25 +360,35 @@ SimpleIEW<Impl, IQ>::iew() inst->setCanCommit(); instQueue.advanceTail(inst); + + continue; + } else if (inst->isExecuted()) { + DPRINTF(IEW, "IEW: Issue: Executed branch encountered, " + "skipping.\n"); + + assert(inst->isDirectCtrl()); + + inst->setIssued(); + inst->setCanCommit(); + + instQueue.advanceTail(inst); + continue; - } else if (instQueue.isFull()) { - DPRINTF(IEW, "IEW: Issue: IQ has become full.\n"); - // Call function to start blocking. - block(); - // Tell previous stage to stall. - toRename->iewInfo.stall = true; - break; } // If the instruction queue is not full, then add the // instruction. instQueue.insert(fromRename->insts[inst_num]); + + ++iewDispatchedInsts; } } +} - // Have the instruction queue try to schedule any ready instructions. - instQueue.scheduleReadyInsts(); - +template <class Impl, class IQ> +void +SimpleIEW<Impl, IQ>::executeInsts() +{ //////////////////////////////////////// //EXECUTE/WRITEBACK stage //////////////////////////////////////// @@ -389,9 +407,10 @@ SimpleIEW<Impl, IQ>::iew() // Execute/writeback any instructions that are available. for (int inst_num = 0; fu_usage < executeWidth && /* Haven't exceeded available FU's. */ - inst_num < issueWidth && /* Haven't exceeded issue width. */ - fromIssue->insts[inst_num]; /* There are available instructions. */ + inst_num < issueWidth && + fromIssue->insts[inst_num]; ++inst_num) { + DPRINTF(IEW, "IEW: Execute: Executing instructions from IQ.\n"); // Get instruction from issue's queue. @@ -410,6 +429,8 @@ SimpleIEW<Impl, IQ>::iew() toCommit->insts[inst_num] = inst; + ++iewExecSquashedInsts; + continue; } @@ -428,14 +449,20 @@ SimpleIEW<Impl, IQ>::iew() // Tell the LDSTQ to execute this instruction (if it is a load). if (inst->isLoad()) { ldstQueue.executeLoad(inst); + + ++iewExecLoadInsts; } else if (inst->isStore()) { ldstQueue.executeStore(); + + ++iewExecStoreInsts; } else { panic("IEW: Unexpected memory type!\n"); } } else { inst->execute(); + + ++iewExecutedInsts; } // First check the time slot that this instruction will write @@ -464,25 +491,148 @@ SimpleIEW<Impl, IQ>::iew() inst->nextPC); // If incorrect, then signal the ROB that it must be squashed. - squash(inst); + squashDueToBranch(inst); + + if (inst->predTaken()) { + predictedTakenIncorrect++; + } } else if (ldstQueue.violation()) { fetch_redirect = true; + // Get the DynInst that caused the violation. DynInstPtr violator = ldstQueue.getMemDepViolator(); DPRINTF(IEW, "IEW: LDSTQ detected a violation. Violator PC: " "%#x, inst PC: %#x. Addr is: %#x.\n", violator->readPC(), inst->readPC(), inst->physEffAddr); + // Tell the instruction queue that a violation has occured. instQueue.violation(inst, violator); - squash(inst); - // Otherwise check if there was a memory ordering violation. - // If there was, then signal ROB that it must be squashed. Also - // signal IQ that there was a violation. + // Squash. + squashDueToMem(inst); + + ++memOrderViolationEvents; } } } +} + +template<class Impl, class IQ> +void +SimpleIEW<Impl, IQ>::tick() +{ + // Considering putting all the state-determining stuff in this section. + + // Try to fill up issue queue with as many instructions as bandwidth + // allows. + // Decode should try to execute as many instructions as its bandwidth + // will allow, as long as it is not currently blocked. + + // Check if the stage is in a running status. + if (_status != Blocked && _status != Squashing) { + DPRINTF(IEW, "IEW: Status is not blocked, attempting to run " + "stage.\n"); + iew(); + + // If it's currently unblocking, check to see if it should switch + // to running. + if (_status == Unblocking) { + unblock(); + + ++iewUnblockCycles; + } + } else if (_status == Squashing) { + + DPRINTF(IEW, "IEW: Still squashing.\n"); + + // Check if stage should remain squashing. Stop squashing if the + // squash signal clears. + if (!fromCommit->commitInfo.squash && + !fromCommit->commitInfo.robSquashing) { + DPRINTF(IEW, "IEW: Done squashing, changing status to " + "running.\n"); + + _status = Running; + instQueue.stopSquash(); + } else { + instQueue.doSquash(); + } + + ++iewSquashCycles; + + // Also should advance its own time buffers if the stage ran. + // Not sure about this... +// issueToExecQueue.advance(); + } else if (_status == Blocked) { + // Continue to tell previous stage to stall. + toRename->iewInfo.stall = true; + + // Check if possible stall conditions have cleared. + if (!fromCommit->commitInfo.stall && + !instQueue.isFull()) { + DPRINTF(IEW, "IEW: Stall signals cleared, going to unblock.\n"); + _status = Unblocking; + } + + // If there's still instructions coming from rename, continue to + // put them on the skid buffer. + if (fromRename->size == 0) { + block(); + } + + if (fromCommit->commitInfo.squash || + fromCommit->commitInfo.robSquashing) { + squash(); + } + + ++iewBlockCycles; + } + + // @todo: Maybe put these at the beginning, so if it's idle it can + // return early. + // Write back number of free IQ entries here. + toRename->iewInfo.freeIQEntries = instQueue.numFreeEntries(); + + // Check the committed load/store signals to see if there's a load + // or store to commit. Also check if it's being told to execute a + // nonspeculative instruction. + if (fromCommit->commitInfo.commitIsStore) { + ldstQueue.commitStores(fromCommit->commitInfo.doneSeqNum); + } else if (fromCommit->commitInfo.commitIsLoad) { + ldstQueue.commitLoads(fromCommit->commitInfo.doneSeqNum); + } + + if (fromCommit->commitInfo.nonSpecSeqNum != 0) { + instQueue.scheduleNonSpec(fromCommit->commitInfo.nonSpecSeqNum); + } + + DPRINTF(IEW, "IEW: IQ has %i free entries.\n", + instQueue.numFreeEntries()); +} + +template<class Impl, class IQ> +void +SimpleIEW<Impl, IQ>::iew() +{ + // Might want to put all state checks in the tick() function. + // Check if being told to stall from commit. + if (fromCommit->commitInfo.stall) { + block(); + return; + } else if (fromCommit->commitInfo.squash || + fromCommit->commitInfo.robSquashing) { + // Also check if commit is telling this stage to squash. + squash(); + return; + } + + dispatchInsts(); + + // Have the instruction queue try to schedule any ready instructions. + instQueue.scheduleReadyInsts(); + + executeInsts(); // Loop through the head of the time buffer and wake any dependents. // These instructions are about to write back. In the simple model @@ -491,7 +641,7 @@ SimpleIEW<Impl, IQ>::iew() // Also mark scoreboard that this instruction is finally complete. // Either have IEW have direct access to rename map, or have this as // part of backwards communication. - for (int inst_num = 0; inst_num < executeWidth && + for (int inst_num = 0; inst_num < issueWidth && toCommit->insts[inst_num]; inst_num++) { DynInstPtr inst = toCommit->insts[inst_num]; diff --git a/cpu/beta_cpu/inst_queue.cc b/cpu/beta_cpu/inst_queue.cc index 43b0a4572..c4fd077bc 100644 --- a/cpu/beta_cpu/inst_queue.cc +++ b/cpu/beta_cpu/inst_queue.cc @@ -5,3 +5,6 @@ // Force instantiation of InstructionQueue. template InstructionQueue<AlphaSimpleImpl>; + +unsigned +InstructionQueue<AlphaSimpleImpl>::DependencyEntry::mem_alloc_counter = 0; diff --git a/cpu/beta_cpu/inst_queue.hh b/cpu/beta_cpu/inst_queue.hh index a170979cb..6fcce70a4 100644 --- a/cpu/beta_cpu/inst_queue.hh +++ b/cpu/beta_cpu/inst_queue.hh @@ -7,14 +7,10 @@ #include <stdint.h> #include <vector> +#include "base/statistics.hh" #include "base/timebuf.hh" #include "cpu/inst_seq.hh" -//Perhaps have a better separation between the data structure underlying -//and the actual algorithm. -//somewhat nasty to try to have a nice ordering. -// Consider moving to STL list or slist for the LL stuff. - /** * A standard instruction queue class. It holds instructions in an * array, holds the ordering of the instructions within a linked list, @@ -74,6 +70,8 @@ class InstructionQueue InstructionQueue(Params ¶ms); + void regStats(); + void setCPU(FullCPU *cpu); void setIssueToExecuteQueue(TimeBuffer<IssueStruct> *i2eQueue); @@ -98,6 +96,7 @@ class InstructionQueue void violation(DynInstPtr &store, DynInstPtr &faulting_load); + // Change this to take in the sequence number void squash(); void doSquash(); @@ -159,7 +158,7 @@ class InstructionQueue ReadyInstQueue readyBranchInsts; /** List of ready memory instructions. */ - ReadyInstQueue readyMemInsts; +// ReadyInstQueue readyMemInsts; /** List of ready miscellaneous instructions. */ ReadyInstQueue readyMiscInsts; @@ -228,9 +227,6 @@ class InstructionQueue /** The sequence number of the squashed instruction. */ InstSeqNum squashedSeqNum; - /** Iterator that points to the oldest instruction in the IQ. */ -// ListIt head; - /** Iterator that points to the youngest instruction in the IQ. */ ListIt tail; @@ -261,6 +257,9 @@ class InstructionQueue void insert(DynInstPtr &new_inst); void remove(DynInstPtr &inst_to_remove); + + // Debug variable, remove when done testing. + static unsigned mem_alloc_counter; }; /** Array of linked lists. Each linked list is a list of all the @@ -285,6 +284,25 @@ class InstructionQueue void dumpDependGraph(); void addIfReady(DynInstPtr &inst); + + Stats::Scalar<> iqInstsAdded; + Stats::Scalar<> iqNonSpecInstsAdded; +// Stats::Scalar<> iqIntInstsAdded; + Stats::Scalar<> iqIntInstsIssued; +// Stats::Scalar<> iqFloatInstsAdded; + Stats::Scalar<> iqFloatInstsIssued; +// Stats::Scalar<> iqBranchInstsAdded; + Stats::Scalar<> iqBranchInstsIssued; +// Stats::Scalar<> iqMemInstsAdded; + Stats::Scalar<> iqMemInstsIssued; +// Stats::Scalar<> iqMiscInstsAdded; + Stats::Scalar<> iqMiscInstsIssued; + Stats::Scalar<> iqSquashedInstsIssued; + Stats::Scalar<> iqLoopSquashStalls; + Stats::Scalar<> iqSquashedInstsExamined; + Stats::Scalar<> iqSquashedOperandsExamined; + Stats::Scalar<> iqSquashedNonSpecRemoved; + }; #endif //__INST_QUEUE_HH__ diff --git a/cpu/beta_cpu/inst_queue_impl.hh b/cpu/beta_cpu/inst_queue_impl.hh index 03e3fed33..c688181ed 100644 --- a/cpu/beta_cpu/inst_queue_impl.hh +++ b/cpu/beta_cpu/inst_queue_impl.hh @@ -24,15 +24,13 @@ InstructionQueue<Impl>::InstructionQueue(Params ¶ms) numEntries(params.numIQEntries), intWidth(params.executeIntWidth), floatWidth(params.executeFloatWidth), + branchWidth(params.executeBranchWidth), + memoryWidth(params.executeMemoryWidth), totalWidth(params.issueWidth), numPhysIntRegs(params.numPhysIntRegs), numPhysFloatRegs(params.numPhysFloatRegs), commitToIEWDelay(params.commitToIEWDelay) { - // HACK: HARDCODED NUMBER. REMOVE LATER AND ADD TO PARAMETER. - branchWidth = 1; - memoryWidth = 1; - DPRINTF(IQ, "IQ: Int width is %i.\n", params.executeIntWidth); // Initialize the number of free IQ entries. @@ -68,6 +66,87 @@ InstructionQueue<Impl>::InstructionQueue(Params ¶ms) template <class Impl> void +InstructionQueue<Impl>::regStats() +{ + iqInstsAdded + .name(name() + ".iqInstsAdded") + .desc("Number of instructions added to the IQ (excludes non-spec)") + .prereq(iqInstsAdded); + + iqNonSpecInstsAdded + .name(name() + ".iqNonSpecInstsAdded") + .desc("Number of non-speculative instructions added to the IQ") + .prereq(iqNonSpecInstsAdded); + +// iqIntInstsAdded; + + iqIntInstsIssued + .name(name() + ".iqIntInstsIssued") + .desc("Number of integer instructions issued") + .prereq(iqIntInstsIssued); + +// iqFloatInstsAdded; + + iqFloatInstsIssued + .name(name() + ".iqFloatInstsIssued") + .desc("Number of float instructions issued") + .prereq(iqFloatInstsIssued); + +// iqBranchInstsAdded; + + iqBranchInstsIssued + .name(name() + ".iqBranchInstsIssued") + .desc("Number of branch instructions issued") + .prereq(iqBranchInstsIssued); + +// iqMemInstsAdded; + + iqMemInstsIssued + .name(name() + ".iqMemInstsIssued") + .desc("Number of memory instructions issued") + .prereq(iqMemInstsIssued); + +// iqMiscInstsAdded; + + iqMiscInstsIssued + .name(name() + ".iqMiscInstsIssued") + .desc("Number of miscellaneous instructions issued") + .prereq(iqMiscInstsIssued); + + iqSquashedInstsIssued + .name(name() + ".iqSquashedInstsIssued") + .desc("Number of squashed instructions issued") + .prereq(iqSquashedInstsIssued); + + iqLoopSquashStalls + .name(name() + ".iqLoopSquashStalls") + .desc("Number of times issue loop had to restart due to squashed " + "inst; mainly for profiling") + .prereq(iqLoopSquashStalls); + + iqSquashedInstsExamined + .name(name() + ".iqSquashedInstsExamined") + .desc("Number of squashed instructions iterated over during squash;" + " mainly for profiling") + .prereq(iqSquashedInstsExamined); + + iqSquashedOperandsExamined + .name(name() + ".iqSquashedOperandsExamined") + .desc("Number of squashed operands that are examined and possibly " + "removed from graph") + .prereq(iqSquashedOperandsExamined); + + iqSquashedNonSpecRemoved + .name(name() + ".iqSquashedNonSpecRemoved") + .desc("Number of squashed non-spec instructions that were removed") + .prereq(iqSquashedNonSpecRemoved); + + // Tell mem dependence unit to reg stats as well. + memDepUnit.regStats(); +} + +template <class Impl> +void InstructionQueue<Impl>::setCPU(FullCPU *cpu_ptr) { cpu = cpu_ptr; @@ -161,10 +240,14 @@ InstructionQueue<Impl>::insert(DynInstPtr &new_inst) // unit. if (new_inst->isMemRef()) { memDepUnit.insert(new_inst); + // Uh..forgot to look it up and put it on the proper dependency list + // if the instruction should not go yet. + } else { + // If the instruction is ready then add it to the ready list. + addIfReady(new_inst); } - // If the instruction is ready then add it to the ready list. - addIfReady(new_inst); + ++iqInstsAdded; assert(freeEntries == (numEntries - countInsts())); } @@ -219,13 +302,16 @@ InstructionQueue<Impl>::insertNonSpec(DynInstPtr &inst) // If it's a memory instruction, add it to the memory dependency // unit. if (inst->isMemRef()) { - memDepUnit.insert(inst); + memDepUnit.insertNonSpec(inst); } + + ++iqNonSpecInstsAdded; } // Slightly hack function to advance the tail iterator in the case that // the IEW stage issues an instruction that is not added to the IQ. This // is needed in case a long chain of such instructions occurs. +// I don't think this is used anymore. template <class Impl> void InstructionQueue<Impl>::advanceTail(DynInstPtr &inst) @@ -288,7 +374,7 @@ InstructionQueue<Impl>::scheduleReadyInsts() bool insts_available = !readyBranchInsts.empty() || !readyIntInsts.empty() || !readyFloatInsts.empty() || - !readyMemInsts.empty() || + !memDepUnit.empty() || !readyMiscInsts.empty() || !squashedInsts.empty(); @@ -327,6 +413,9 @@ InstructionQueue<Impl>::scheduleReadyInsts() if (int_head_inst->isSquashed()) { readyIntInsts.pop(); + + ++iqLoopSquashStalls; + continue; } @@ -344,6 +433,9 @@ InstructionQueue<Impl>::scheduleReadyInsts() if (float_head_inst->isSquashed()) { readyFloatInsts.pop(); + + ++iqLoopSquashStalls; + continue; } else if (float_head_inst->seqNum < oldest_inst) { oldest_inst = float_head_inst->seqNum; @@ -361,6 +453,9 @@ InstructionQueue<Impl>::scheduleReadyInsts() if (branch_head_inst->isSquashed()) { readyBranchInsts.pop(); + + ++iqLoopSquashStalls; + continue; } else if (branch_head_inst->seqNum < oldest_inst) { oldest_inst = branch_head_inst->seqNum; @@ -370,15 +465,18 @@ InstructionQueue<Impl>::scheduleReadyInsts() } - if (!readyMemInsts.empty() && + if (!memDepUnit.empty() && memory_issued < memoryWidth) { insts_available = true; - mem_head_inst = readyMemInsts.top(); + mem_head_inst = memDepUnit.top(); if (mem_head_inst->isSquashed()) { - readyMemInsts.pop(); + memDepUnit.pop(); + + ++iqLoopSquashStalls; + continue; } else if (mem_head_inst->seqNum < oldest_inst) { oldest_inst = mem_head_inst->seqNum; @@ -395,6 +493,9 @@ InstructionQueue<Impl>::scheduleReadyInsts() if (misc_head_inst->isSquashed()) { readyMiscInsts.pop(); + + ++iqLoopSquashStalls; + continue; } else if (misc_head_inst->seqNum < oldest_inst) { oldest_inst = misc_head_inst->seqNum; @@ -450,9 +551,7 @@ InstructionQueue<Impl>::scheduleReadyInsts() case Memory: issuing_inst = mem_head_inst; - memDepUnit.issue(mem_head_inst); - - readyMemInsts.pop(); + memDepUnit.pop(); ++memory_issued; DPRINTF(IQ, "IQ: Issuing memory instruction PC %#x.\n", issuing_inst->readPC()); @@ -461,6 +560,9 @@ InstructionQueue<Impl>::scheduleReadyInsts() case Misc: issuing_inst = misc_head_inst; readyMiscInsts.pop(); + + ++iqMiscInstsIssued; + DPRINTF(IQ, "IQ: Issuing a miscellaneous instruction PC %#x.\n", issuing_inst->readPC()); break; @@ -476,6 +578,7 @@ InstructionQueue<Impl>::scheduleReadyInsts() if (list_with_oldest != None) { i2e_info->insts[total_issued] = issuing_inst; + i2e_info->size++; issuing_inst->setIssued(); @@ -485,12 +588,21 @@ InstructionQueue<Impl>::scheduleReadyInsts() assert(freeEntries == (numEntries - countInsts())); } + + iqIntInstsIssued += int_issued; + iqFloatInstsIssued += float_issued; + iqBranchInstsIssued += branch_issued; + iqMemInstsIssued += memory_issued; + iqSquashedInstsIssued += squashed_issued; } template <class Impl> void InstructionQueue<Impl>::scheduleNonSpec(const InstSeqNum &inst) { + DPRINTF(IQ, "IQ: Marking nonspeculative instruction with sequence " + "number %i as ready to execute.\n", inst); + non_spec_it_t inst_it = nonSpecInsts.find(inst); assert(inst_it != nonSpecInsts.end()); @@ -499,7 +611,11 @@ InstructionQueue<Impl>::scheduleNonSpec(const InstSeqNum &inst) (*inst_it).second->setCanIssue(); // Now schedule the instruction. - addIfReady((*inst_it).second); + if (!(*inst_it).second->isMemRef()) { + addIfReady((*inst_it).second); + } else { + memDepUnit.nonSpecInstReady((*inst_it).second); + } nonSpecInsts.erase(inst_it); } @@ -552,6 +668,7 @@ InstructionQueue<Impl>::doSquash() // hasn't already been squashed in the IQ. if (!squashed_inst->isIssued() && !squashed_inst->isSquashedInIQ()) { + // Remove the instruction from the dependency list. // Hack for now: These below don't add themselves to the // dependency list, so don't try to remove them. @@ -576,7 +693,15 @@ InstructionQueue<Impl>::doSquash() src_reg < numPhysRegs) { dependGraph[src_reg].remove(squashed_inst); } + + ++iqSquashedOperandsExamined; } + + // Might want to remove producers as well. + } else { + nonSpecInsts.erase(squashed_inst->seqNum); + + ++iqSquashedNonSpecRemoved; } // Might want to also clear out the head of the dependency graph. @@ -590,11 +715,8 @@ InstructionQueue<Impl>::doSquash() squashed_inst->readPC()); } - if (squashed_inst->isNonSpeculative() || squashed_inst->isStore()) { - nonSpecInsts.erase(squashed_inst->seqNum); - } - --squashIt; + ++iqSquashedInstsExamined; } } @@ -665,6 +787,8 @@ InstructionQueue<Impl>::wakeDependents(DynInstPtr &completed_inst) dependGraph[dest_reg].next = curr->next; + DependencyEntry::mem_alloc_counter--; + delete curr; } @@ -749,13 +873,9 @@ InstructionQueue<Impl>::createDependency(DynInstPtr &new_inst) } dependGraph[dest_reg].inst = new_inst; -#if 0 - if (dependGraph[dest_reg].next) { - panic("Dependency chain of dest reg %i is not empty.\n", - dest_reg); - } -#endif + assert(!dependGraph[dest_reg].next); + // Mark the scoreboard to say it's not yet ready. regScoreboard[dest_reg] = false; } @@ -776,6 +896,8 @@ InstructionQueue<Impl>::DependencyEntry::insert(DynInstPtr &new_inst) // Then actually add it to the chain. this->next = new_entry; + + ++mem_alloc_counter; } template <class Impl> @@ -805,6 +927,8 @@ InstructionQueue<Impl>::DependencyEntry::remove(DynInstPtr &inst_to_remove) // Now remove this instruction from the list. prev->next = curr->next; + --mem_alloc_counter; + delete curr; } @@ -855,12 +979,26 @@ InstructionQueue<Impl>::addIfReady(DynInstPtr &inst) DPRINTF(IQ, "IQ: Checking if memory instruction can issue.\n"); + // Message to the mem dependence unit that this instruction has + // its registers ready. + + memDepUnit.regsReady(inst); + +#if 0 if (memDepUnit.readyToIssue(inst)) { DPRINTF(IQ, "IQ: Memory instruction is ready to issue, " "putting it onto the ready list, PC %#x.\n", inst->readPC()); readyMemInsts.push(inst); + } else { + // Make dependent on the store. + // Will need some way to get the store instruction it should + // be dependent upon; then when the store issues it can + // put the instruction on the ready list. + // Yet another tree? + assert(0 && "Instruction has no way to actually issue"); } +#endif } else if (inst->isInteger()) { @@ -923,7 +1061,7 @@ InstructionQueue<Impl>::dumpLists() cprintf("Ready branch list size: %i\n", readyBranchInsts.size()); - cprintf("Ready memory list size: %i\n", readyMemInsts.size()); +// cprintf("Ready memory list size: %i\n", readyMemInsts.size()); cprintf("Ready misc list size: %i\n", readyMiscInsts.size()); diff --git a/cpu/beta_cpu/mem_dep_unit.hh b/cpu/beta_cpu/mem_dep_unit.hh index 4821c63b7..e43543e09 100644 --- a/cpu/beta_cpu/mem_dep_unit.hh +++ b/cpu/beta_cpu/mem_dep_unit.hh @@ -6,6 +6,7 @@ #include <map> #include "cpu/inst_seq.hh" +#include "base/statistics.hh" /** * Memory dependency unit class. This holds the memory dependence predictor. @@ -25,16 +26,17 @@ class MemDepUnit { typedef typename Impl::DynInstPtr DynInstPtr; public: - typedef typename std::set<InstSeqNum>::iterator sn_it_t; - typedef typename std::map<InstSeqNum, vector<InstSeqNum> >::iterator - dep_it_t; - - public: MemDepUnit(Params ¶ms); + void regStats(); + void insert(DynInstPtr &inst); - bool readyToIssue(DynInstPtr &inst); + void insertNonSpec(DynInstPtr &inst); + + void regsReady(DynInstPtr &inst); + + void nonSpecInstReady(DynInstPtr &inst); void issue(DynInstPtr &inst); @@ -44,19 +46,83 @@ class MemDepUnit { void violation(DynInstPtr &store_inst, DynInstPtr &violating_load); + // Will want to make this operation relatively fast. Right now it + // kind of sucks. + DynInstPtr &top(); + + void pop(); + + inline bool empty() + { return readyInsts.empty(); } + + private: + typedef typename std::set<InstSeqNum>::iterator sn_it_t; + typedef typename std::map<InstSeqNum, DynInstPtr>::iterator dyn_it_t; + + // Forward declarations so that the following two typedefs work. + class Dependency; + class ltDependency; + + typedef typename std::set<Dependency, ltDependency>::iterator dep_it_t; + typedef typename std::map<InstSeqNum, vector<dep_it_t> >::iterator + sd_it_t; + + struct Dependency { + Dependency(const InstSeqNum &_seqNum) + : seqNum(_seqNum), regsReady(0), memDepReady(0) + { } + + Dependency(const InstSeqNum &_seqNum, bool _regsReady, + bool _memDepReady) + : seqNum(_seqNum), regsReady(_regsReady), + memDepReady(_memDepReady) + { } + + InstSeqNum seqNum; + mutable bool regsReady; + mutable bool memDepReady; + mutable sd_it_t storeDep; + }; + + struct ltDependency { + bool operator() (const Dependency &lhs, const Dependency &rhs) + { + return lhs.seqNum < rhs.seqNum; + } + }; + + + private: + inline void moveToReady(dep_it_t &woken_inst); + private: /** List of instructions that have passed through rename, yet are still - * waiting on a memory dependence to resolve before they can issue. + * waiting on either a memory dependence to resolve or source registers to + * become available before they can issue. */ - std::set<InstSeqNum> renamedInsts; + std::set<Dependency, ltDependency> waitingInsts; /** List of instructions that have all their predicted memory dependences - * resolved. They are ready in terms of being free of memory - * dependences; however they may still have to wait on source registers. + * resolved and their source registers ready. */ std::set<InstSeqNum> readyInsts; - std::map<InstSeqNum, vector<InstSeqNum> > dependencies; + // Change this to hold a vector of iterators, which will point to the + // entry of the waiting instructions. + /** List of stores' sequence numbers, each of which has a vector of + * iterators. The iterators point to the appropriate node within + * waitingInsts that has the depenendent instruction. + */ + std::map<InstSeqNum, vector<dep_it_t> > storeDependents; + + // For now will implement this as a map...hash table might not be too + // bad, or could move to something that mimics the current dependency + // graph. + std::map<InstSeqNum, DynInstPtr> memInsts; + + // Iterator pointer to the top instruction which has is ready. + // Is set by the top() call. + dyn_it_t topInst; /** The memory dependence predictor. It is accessed upon new * instructions being added to the IQ, and responds by telling @@ -65,6 +131,10 @@ class MemDepUnit { */ MemDepPred depPred; + Stats::Scalar<> insertedLoads; + Stats::Scalar<> insertedStores; + Stats::Scalar<> conflictingLoads; + Stats::Scalar<> conflictingStores; }; #endif diff --git a/cpu/beta_cpu/mem_dep_unit_impl.hh b/cpu/beta_cpu/mem_dep_unit_impl.hh index 4299acb7a..4161ac2a8 100644 --- a/cpu/beta_cpu/mem_dep_unit_impl.hh +++ b/cpu/beta_cpu/mem_dep_unit_impl.hh @@ -3,108 +3,301 @@ #include "cpu/beta_cpu/mem_dep_unit.hh" -// Hack: dependence predictor sizes are hardcoded. template <class MemDepPred, class Impl> MemDepUnit<MemDepPred, Impl>::MemDepUnit(Params ¶ms) - : depPred(4028, 128) + : depPred(params.SSITSize, params.LFSTSize) { DPRINTF(MemDepUnit, "MemDepUnit: Creating MemDepUnit object.\n"); } template <class MemDepPred, class Impl> void +MemDepUnit<MemDepPred, Impl>::regStats() +{ + insertedLoads + .name(name() + ".memDep.insertedLoads") + .desc("Number of loads inserted to the mem dependence unit."); + + insertedStores + .name(name() + ".memDep.insertedStores") + .desc("Number of stores inserted to the mem dependence unit."); + + conflictingLoads + .name(name() + ".memDep.conflictingLoads") + .desc("Number of conflicting loads."); + + conflictingStores + .name(name() + ".memDep.conflictingStores") + .desc("Number of conflicting stores."); +} + +template <class MemDepPred, class Impl> +void MemDepUnit<MemDepPred, Impl>::insert(DynInstPtr &inst) { InstSeqNum inst_seq_num = inst->seqNum; + Dependency unresolved_dependencies(inst_seq_num); InstSeqNum producing_store = depPred.checkInst(inst->readPC()); if (producing_store == 0 || - dependencies.find(producing_store) == dependencies.end()) { - readyInsts.insert(inst_seq_num); + storeDependents.find(producing_store) == storeDependents.end()) { + + DPRINTF(MemDepUnit, "MemDepUnit: No dependency for inst PC " + "%#x.\n", inst->readPC()); + + unresolved_dependencies.storeDep = storeDependents.end(); + + if (inst->readyToIssue()) { + readyInsts.insert(inst_seq_num); + } else { + unresolved_dependencies.memDepReady = true; + + waitingInsts.insert(unresolved_dependencies); + } } else { + DPRINTF(MemDepUnit, "MemDepUnit: Adding to dependency list; " + "inst PC %#x is dependent on seq num %i.\n", + inst->readPC(), producing_store); + + if (inst->readyToIssue()) { + unresolved_dependencies.regsReady = true; + } + + // Find the store that this instruction is dependent on. + sd_it_t store_loc = storeDependents.find(producing_store); + + assert(store_loc != storeDependents.end()); + + // Record the location of the store that this instruction is + // dependent on. + unresolved_dependencies.storeDep = store_loc; + // If it's not already ready, then add it to the renamed // list and the dependencies. - renamedInsts.insert(inst_seq_num); + dep_it_t inst_loc = + (waitingInsts.insert(unresolved_dependencies)).first; + + // Add this instruction to the list of dependents. + (*store_loc).second.push_back(inst_loc); + + assert(!(*store_loc).second.empty()); - dependencies[producing_store].push_back(inst_seq_num); + if (inst->isLoad()) { + ++conflictingLoads; + } else { + ++conflictingStores; + } } if (inst->isStore()) { + DPRINTF(MemDepUnit, "MemDepUnit: Inserting store PC %#x.\n", + inst->readPC()); + depPred.insertStore(inst->readPC(), inst_seq_num); // Make sure this store isn't already in this list. - assert(dependencies.find(inst_seq_num) == dependencies.end()); + assert(storeDependents.find(inst_seq_num) == storeDependents.end()); // Put a dependency entry in at the store's sequence number. // Uh, not sure how this works...I want to create an entry but // I don't have anything to put into the value yet. - dependencies[inst_seq_num]; - } else if (!inst->isLoad()) { + storeDependents[inst_seq_num]; + + assert(storeDependents.size() != 0); + + ++insertedStores; + + } else if (inst->isLoad()) { + ++insertedLoads; + } else { + panic("MemDepUnit: Unknown type! (most likely a barrier)."); + } + + memInsts[inst_seq_num] = inst; +} + +template <class MemDepPred, class Impl> +void +MemDepUnit<MemDepPred, Impl>::insertNonSpec(DynInstPtr &inst) +{ + InstSeqNum inst_seq_num = inst->seqNum; + + Dependency non_spec_inst(inst_seq_num); + + non_spec_inst.storeDep = storeDependents.end(); + + waitingInsts.insert(non_spec_inst); + + // Might want to turn this part into an inline function or something. + // It's shared between both insert functions. + if (inst->isStore()) { + DPRINTF(MemDepUnit, "MemDepUnit: Inserting store PC %#x.\n", + inst->readPC()); + + depPred.insertStore(inst->readPC(), inst_seq_num); + + // Make sure this store isn't already in this list. + assert(storeDependents.find(inst_seq_num) == storeDependents.end()); + + // Put a dependency entry in at the store's sequence number. + // Uh, not sure how this works...I want to create an entry but + // I don't have anything to put into the value yet. + storeDependents[inst_seq_num]; + + assert(storeDependents.size() != 0); + + ++insertedStores; + + } else if (inst->isLoad()) { + ++insertedLoads; + } else { panic("MemDepUnit: Unknown type! (most likely a barrier)."); } + + memInsts[inst_seq_num] = inst; +} + +template <class MemDepPred, class Impl> +typename Impl::DynInstPtr & +MemDepUnit<MemDepPred, Impl>::top() +{ + topInst = memInsts.find( (*readyInsts.begin()) ); + + DPRINTF(MemDepUnit, "MemDepUnit: Top instruction is PC %#x.\n", + (*topInst).second->readPC()); + + return (*topInst).second; } template <class MemDepPred, class Impl> -bool -MemDepUnit<MemDepPred, Impl>::readyToIssue(DynInstPtr &inst) +void +MemDepUnit<MemDepPred, Impl>::pop() { + DPRINTF(MemDepUnit, "MemDepUnit: Removing instruction PC %#x.\n", + (*topInst).second->readPC()); + + wakeDependents((*topInst).second); + + issue((*topInst).second); + + memInsts.erase(topInst); + + topInst = memInsts.end(); +} + +template <class MemDepPred, class Impl> +void +MemDepUnit<MemDepPred, Impl>::regsReady(DynInstPtr &inst) +{ + DPRINTF(MemDepUnit, "MemDepUnit: Marking registers as ready for " + "instruction PC %#x.\n", + inst->readPC()); + InstSeqNum inst_seq_num = inst->seqNum; - if (readyInsts.find(inst_seq_num) == readyInsts.end()) { - return false; + Dependency inst_to_find(inst_seq_num); + + dep_it_t waiting_inst = waitingInsts.find(inst_to_find); + + assert(waiting_inst != waitingInsts.end()); + + if ((*waiting_inst).memDepReady) { + DPRINTF(MemDepUnit, "MemDepUnit: Instruction has its memory " + "dependencies resolved, adding it to the ready list.\n"); + + moveToReady(waiting_inst); } else { - return true; + DPRINTF(MemDepUnit, "MemDepUnit: Instruction still waiting on " + "memory dependency.\n"); + + (*waiting_inst).regsReady = true; } } template <class MemDepPred, class Impl> void +MemDepUnit<MemDepPred, Impl>::nonSpecInstReady(DynInstPtr &inst) +{ + DPRINTF(MemDepUnit, "MemDepUnit: Marking non speculative " + "instruction PC %#x as ready.\n", + inst->readPC()); + + InstSeqNum inst_seq_num = inst->seqNum; + + Dependency inst_to_find(inst_seq_num); + + dep_it_t waiting_inst = waitingInsts.find(inst_to_find); + + assert(waiting_inst != waitingInsts.end()); + + moveToReady(waiting_inst); +} + +template <class MemDepPred, class Impl> +void MemDepUnit<MemDepPred, Impl>::issue(DynInstPtr &inst) { assert(readyInsts.find(inst->seqNum) != readyInsts.end()); + DPRINTF(MemDepUnit, "MemDepUnit: Issuing instruction PC %#x.\n", + inst->readPC()); + // Remove the instruction from the ready list. readyInsts.erase(inst->seqNum); + + depPred.issued(inst->readPC(), inst->seqNum, inst->isStore()); } template <class MemDepPred, class Impl> void MemDepUnit<MemDepPred, Impl>::wakeDependents(DynInstPtr &inst) { + // Only stores have dependents. + if (!inst->isStore()) { + return; + } + // Wake any dependencies. - dep_it_t dep_it = dependencies.find(inst); + sd_it_t sd_it = storeDependents.find(inst->seqNum); // If there's no entry, then return. Really there should only be // no entry if the instruction is a load. - if (dep_it == dependencies.end()) { + if (sd_it == storeDependents.end()) { + DPRINTF(MemDepUnit, "MemDepUnit: Instruction PC %#x, sequence " + "number %i has no dependents.\n", + inst->readPC(), inst->seqNum); + return; } - assert(inst->isStore()); - - for(int i = 0; i < (*dep_it).second.size(); ++i ) { - InstSeqNum woken_inst = (*dep_it).second[i]; + for (int i = 0; i < (*sd_it).second.size(); ++i ) { + dep_it_t woken_inst = (*sd_it).second[i]; + DPRINTF(MemDepUnit, "MemDepUnit: Waking up a dependent inst, " + "sequence number %i.\n", + (*woken_inst).seqNum); +#if 0 // Should we have reached instructions that are actually squashed, // there will be no more useful instructions in this dependency // list. Break out early. - if (renamedInsts.find(woken_inst) == renamedInsts.end()) { + if (waitingInsts.find(woken_inst) == waitingInsts.end()) { DPRINTF(MemDepUnit, "MemDepUnit: Dependents on inst PC %#x " "are squashed, starting at SN %i. Breaking early.\n", inst->readPC(), woken_inst); break; } +#endif - // Remove it from the renamed instructions. - renamedInsts.erase(woken_inst); - - // Add it to the ready list. - readyInsts.insert(woken_inst); + if ((*woken_inst).regsReady) { + moveToReady(woken_inst); + } else { + (*woken_inst).memDepReady = true; + } } - dependencies.erase(dep_it); + storeDependents.erase(sd_it); } template <class MemDepPred, class Impl> @@ -112,17 +305,30 @@ void MemDepUnit<MemDepPred, Impl>::squash(const InstSeqNum &squashed_num) { - if (!renamedInsts.empty()) { - sn_it_t renamed_it = renamedInsts.end(); + if (!waitingInsts.empty()) { + dep_it_t waiting_it = waitingInsts.end(); - --renamed_it; + --waiting_it; // Remove entries from the renamed list as long as we haven't reached // the end and the entries continue to be younger than the squashed. - while (!renamedInsts.empty() && - (*renamed_it) > squashed_num) + while (!waitingInsts.empty() && + (*waiting_it).seqNum > squashed_num) { - renamedInsts.erase(renamed_it--); + if (!(*waiting_it).memDepReady && + (*waiting_it).storeDep != storeDependents.end()) { + sd_it_t sd_it = (*waiting_it).storeDep; + + // Make sure the iterator that the store has pointing + // back is actually to this instruction. + assert((*sd_it).second.back() == waiting_it); + + // Now remove this from the store's list of dependent + // instructions. + (*sd_it).second.pop_back(); + } + + waitingInsts.erase(waiting_it--); } } @@ -139,16 +345,19 @@ MemDepUnit<MemDepPred, Impl>::squash(const InstSeqNum &squashed_num) } } - if (!dependencies.empty()) { - dep_it_t dep_it = dependencies.end(); + if (!storeDependents.empty()) { + sd_it_t dep_it = storeDependents.end(); --dep_it; // Same for the dependencies list. - while (!dependencies.empty() && + while (!storeDependents.empty() && (*dep_it).first > squashed_num) { - dependencies.erase(dep_it--); + // This store's list of dependent instructions should be empty. + assert((*dep_it).second.empty()); + + storeDependents.erase(dep_it--); } } @@ -161,6 +370,23 @@ void MemDepUnit<MemDepPred, Impl>::violation(DynInstPtr &store_inst, DynInstPtr &violating_load) { + DPRINTF(MemDepUnit, "MemDepUnit: Passing violating PCs to store sets," + " load: %#x, store: %#x\n", violating_load->readPC(), + store_inst->readPC()); // Tell the memory dependence unit of the violation. depPred.violation(violating_load->readPC(), store_inst->readPC()); } + +template <class MemDepPred, class Impl> +inline void +MemDepUnit<MemDepPred, Impl>::moveToReady(dep_it_t &woken_inst) +{ + DPRINTF(MemDepUnit, "MemDepUnit: Adding instruction sequence number %i " + "to the ready list.\n", (*woken_inst).seqNum); + + // Add it to the ready list. + readyInsts.insert((*woken_inst).seqNum); + + // Remove it from the waiting instructions. + waitingInsts.erase(woken_inst); +} diff --git a/cpu/beta_cpu/ras.cc b/cpu/beta_cpu/ras.cc new file mode 100644 index 000000000..ca05f5a0d --- /dev/null +++ b/cpu/beta_cpu/ras.cc @@ -0,0 +1,42 @@ +#include "cpu/beta_cpu/ras.hh" + +ReturnAddrStack::ReturnAddrStack(unsigned _numEntries) + : numEntries(_numEntries), usedEntries(0), + tos(0) +{ + addrStack = new Addr[numEntries](0); +} + +void +ReturnAddrStack::push(const Addr &return_addr) +{ + incrTos(); + + addrStack[tos] = return_addr; + + if (usedEntries != numEntries) { + ++usedEntries; + } +} + +void +ReturnAddrStack::pop() +{ + // Not sure it's possible to really track usedEntries properly. +// assert(usedEntries > 0); + + if (usedEntries > 0) { + --usedEntries; + } + + decrTos(); +} + +void +ReturnAddrStack::restore(unsigned top_entry_idx, + const Addr &restored_target) +{ + tos = top_entry_idx; + + addrStack[tos] = restored_target; +} diff --git a/cpu/beta_cpu/ras.hh b/cpu/beta_cpu/ras.hh new file mode 100644 index 000000000..7666f825f --- /dev/null +++ b/cpu/beta_cpu/ras.hh @@ -0,0 +1,40 @@ +#ifndef __RAS_HH__ +#define __RAS_HH__ + +// For Addr type. +#include "arch/alpha/isa_traits.hh" + +class ReturnAddrStack +{ + public: + ReturnAddrStack(unsigned numEntries); + + Addr top() + { return addrStack[tos]; } + + unsigned topIdx() + { return tos; } + + void push(const Addr &return_addr); + + void pop(); + + void restore(unsigned top_entry_idx, const Addr &restored_target); + + private: + inline void incrTos() + { tos = (tos + 1) % numEntries; } + + inline void decrTos() + { tos = (tos == 0 ? numEntries - 1 : tos - 1); } + + Addr *addrStack; + + unsigned numEntries; + + unsigned usedEntries; + + unsigned tos; +}; + +#endif // __RAS_HH__ diff --git a/cpu/beta_cpu/regfile.hh b/cpu/beta_cpu/regfile.hh index aba897fdc..148d9408a 100644 --- a/cpu/beta_cpu/regfile.hh +++ b/cpu/beta_cpu/regfile.hh @@ -54,7 +54,7 @@ class PhysRegFile // Remove the base Float reg dependency. reg_idx = reg_idx - numPhysicalIntRegs; - assert(reg_idx < numPhysicalFloatRegs); + assert(reg_idx < numPhysicalFloatRegs + numPhysicalIntRegs); DPRINTF(IEW, "RegFile: Access to float register %i as single, has " "data %8.8f\n", int(reg_idx), (float)floatRegFile[reg_idx].d); @@ -67,7 +67,7 @@ class PhysRegFile // Remove the base Float reg dependency. reg_idx = reg_idx - numPhysicalIntRegs; - assert(reg_idx < numPhysicalFloatRegs); + assert(reg_idx < numPhysicalFloatRegs + numPhysicalIntRegs); DPRINTF(IEW, "RegFile: Access to float register %i as double, has " " data %8.8f\n", int(reg_idx), floatRegFile[reg_idx].d); @@ -80,7 +80,7 @@ class PhysRegFile // Remove the base Float reg dependency. reg_idx = reg_idx - numPhysicalIntRegs; - assert(reg_idx < numPhysicalFloatRegs); + assert(reg_idx < numPhysicalFloatRegs + numPhysicalIntRegs); DPRINTF(IEW, "RegFile: Access to float register %i as int, has data " "%lli\n", int(reg_idx), floatRegFile[reg_idx].q); @@ -103,7 +103,7 @@ class PhysRegFile // Remove the base Float reg dependency. reg_idx = reg_idx - numPhysicalIntRegs; - assert(reg_idx < numPhysicalFloatRegs); + assert(reg_idx < numPhysicalFloatRegs + numPhysicalIntRegs); DPRINTF(IEW, "RegFile: Setting float register %i to %8.8f\n", int(reg_idx), val); @@ -116,7 +116,7 @@ class PhysRegFile // Remove the base Float reg dependency. reg_idx = reg_idx - numPhysicalIntRegs; - assert(reg_idx < numPhysicalFloatRegs); + assert(reg_idx < numPhysicalFloatRegs + numPhysicalIntRegs); DPRINTF(IEW, "RegFile: Setting float register %i to %8.8f\n", int(reg_idx), val); @@ -129,7 +129,7 @@ class PhysRegFile // Remove the base Float reg dependency. reg_idx = reg_idx - numPhysicalIntRegs; - assert(reg_idx < numPhysicalFloatRegs); + assert(reg_idx < numPhysicalFloatRegs + numPhysicalIntRegs); DPRINTF(IEW, "RegFile: Setting float register %i to %lli\n", int(reg_idx), val); diff --git a/cpu/beta_cpu/rename.hh b/cpu/beta_cpu/rename.hh index 9f031012a..3e6b873ae 100644 --- a/cpu/beta_cpu/rename.hh +++ b/cpu/beta_cpu/rename.hh @@ -54,6 +54,8 @@ class SimpleRename public: SimpleRename(Params ¶ms); + void regStats(); + void setCPU(FullCPU *cpu_ptr); void setTimeBuffer(TimeBuffer<TimeStruct> *tb_ptr); @@ -182,6 +184,22 @@ class SimpleRename * group of instructions, it can restart at the proper instruction. */ unsigned numInst; + + Stats::Scalar<> renameSquashCycles; + Stats::Scalar<> renameIdleCycles; + Stats::Scalar<> renameBlockCycles; + Stats::Scalar<> renameUnblockCycles; + Stats::Scalar<> renameRenamedInsts; + Stats::Scalar<> renameSquashedInsts; + Stats::Scalar<> renameROBFullEvents; + Stats::Scalar<> renameIQFullEvents; + Stats::Scalar<> renameFullRegistersEvents; + Stats::Scalar<> renameRenamedOperands; + Stats::Scalar<> renameRenameLookups; + Stats::Scalar<> renameHBPlaceHolders; + Stats::Scalar<> renameCommittedMaps; + Stats::Scalar<> renameUndoneMaps; + Stats::Scalar<> renameValidUndoneMaps; }; #endif // __SIMPLE_RENAME_HH__ diff --git a/cpu/beta_cpu/rename_impl.hh b/cpu/beta_cpu/rename_impl.hh index 47464d961..5a8e499e9 100644 --- a/cpu/beta_cpu/rename_impl.hh +++ b/cpu/beta_cpu/rename_impl.hh @@ -16,6 +16,72 @@ SimpleRename<Impl>::SimpleRename(Params ¶ms) template <class Impl> void +SimpleRename<Impl>::regStats() +{ + renameSquashCycles + .name(name() + ".renameSquashCycles") + .desc("Number of cycles rename is squashing") + .prereq(renameSquashCycles); + renameIdleCycles + .name(name() + ".renameIdleCycles") + .desc("Number of cycles rename is idle") + .prereq(renameIdleCycles); + renameBlockCycles + .name(name() + ".renameBlockCycles") + .desc("Number of cycles rename is blocking") + .prereq(renameBlockCycles); + renameUnblockCycles + .name(name() + ".renameUnblockCycles") + .desc("Number of cycles rename is unblocking") + .prereq(renameUnblockCycles); + renameRenamedInsts + .name(name() + ".renameRenamedInsts") + .desc("Number of instructions processed by rename") + .prereq(renameRenamedInsts); + renameSquashedInsts + .name(name() + ".renameSquashedInsts") + .desc("Number of squashed instructions processed by rename") + .prereq(renameSquashedInsts); + renameROBFullEvents + .name(name() + ".renameROBFullEvents") + .desc("Number of times rename has considered the ROB 'full'") + .prereq(renameROBFullEvents); + renameIQFullEvents + .name(name() + ".renameIQFullEvents") + .desc("Number of times rename has considered the IQ 'full'") + .prereq(renameIQFullEvents); + renameFullRegistersEvents + .name(name() + ".renameFullRegisterEvents") + .desc("Number of times there has been no free registers") + .prereq(renameFullRegistersEvents); + renameRenamedOperands + .name(name() + ".renameRenamedOperands") + .desc("Number of destination operands rename has renamed") + .prereq(renameRenamedOperands); + renameRenameLookups + .name(name() + ".renameRenameLookups") + .desc("Number of register rename lookups that rename has made") + .prereq(renameRenameLookups); + renameHBPlaceHolders + .name(name() + ".renameHBPlaceHolders") + .desc("Number of place holders added to the history buffer") + .prereq(renameHBPlaceHolders); + renameCommittedMaps + .name(name() + ".renameCommittedMaps") + .desc("Number of HB maps that are committed") + .prereq(renameCommittedMaps); + renameUndoneMaps + .name(name() + ".renameUndoneMaps") + .desc("Number of HB maps that are undone due to squashing") + .prereq(renameUndoneMaps); + renameValidUndoneMaps + .name(name() + ".renameValidUndoneMaps") + .desc("Number of HB maps that are undone, and are not place holders") + .prereq(renameValidUndoneMaps); +} + +template <class Impl> +void SimpleRename<Impl>::setCPU(FullCPU *cpu_ptr) { DPRINTF(Rename, "Rename: Setting CPU pointer.\n"); @@ -59,7 +125,6 @@ SimpleRename<Impl>::setDecodeQueue(TimeBuffer<DecodeStruct> *dq_ptr) // Setup wire to get information from decode. fromDecode = decodeQueue->getWire(-decodeToRenameDelay); - } template <class Impl> @@ -124,7 +189,7 @@ SimpleRename<Impl>::unblock() // continue to tell previous stages to stall. They will be // able to restart once the skid buffer is empty. if (!skidBuffer.empty()) { - toDecode->renameInfo.stall = true; + toDecode->renameInfo.stall = true; } else { DPRINTF(Rename, "Rename: Done unblocking.\n"); _status = Running; @@ -136,7 +201,6 @@ void SimpleRename<Impl>::doSquash() { typename list<RenameHistory>::iterator hb_it = historyBuffer.begin(); -// typename list<RenameHistory>::iterator delete_it; InstSeqNum squashed_seq_num = fromCommit->commitInfo.doneSeqNum; @@ -154,6 +218,8 @@ SimpleRename<Impl>::doSquash() // they did and freeing up the registers. while ((*hb_it).instSeqNum > squashed_seq_num) { + assert(hb_it != historyBuffer.end()); + DPRINTF(Rename, "Rename: Removing history entry with sequence " "number %i.\n", (*hb_it).instSeqNum); @@ -165,15 +231,13 @@ SimpleRename<Impl>::doSquash() // Put the renamed physical register back on the free list. freeList->addReg(hb_it->newPhysReg); - } -// delete_it = hb_it; - -// hb_it++; + ++renameValidUndoneMaps; + } historyBuffer.erase(hb_it++); - assert(hb_it != historyBuffer.end()); + ++renameUndoneMaps; } } @@ -196,9 +260,6 @@ SimpleRename<Impl>::squash() doSquash(); } -// In the future, when a SmartPtr is used for DynInst, then this function -// itself can handle returning the instruction's physical registers to -// the free list. template<class Impl> void SimpleRename<Impl>::removeFromHistory(InstSeqNum inst_seq_num) @@ -233,19 +294,20 @@ SimpleRename<Impl>::removeFromHistory(InstSeqNum inst_seq_num) if (!(*hb_it).placeHolder) { freeList->addReg((*hb_it).prevPhysReg); + ++renameCommittedMaps; } historyBuffer.erase(hb_it--); } - // Finally free up the previous register of the squashed instruction + // Finally free up the previous register of the finished instruction // itself. if (!(*hb_it).placeHolder) { freeList->addReg(hb_it->prevPhysReg); + ++renameCommittedMaps; } historyBuffer.erase(hb_it); - } template <class Impl> @@ -263,7 +325,7 @@ SimpleRename<Impl>::renameSrcRegs(DynInstPtr &inst) // Look up the source registers to get the phys. register they've // been renamed to, and set the sources to those registers. - RegIndex renamed_reg = renameMap->lookup(src_reg); + PhysRegIndex renamed_reg = renameMap->lookup(src_reg); DPRINTF(Rename, "Rename: Looking up arch reg %i, got " "physical reg %i.\n", (int)src_reg, (int)renamed_reg); @@ -278,6 +340,8 @@ SimpleRename<Impl>::renameSrcRegs(DynInstPtr &inst) inst->markSrcRegReady(src_idx); } + + ++renameRenameLookups; } } @@ -289,40 +353,6 @@ SimpleRename<Impl>::renameDestRegs(DynInstPtr &inst) unsigned num_dest_regs = inst->numDestRegs(); - // Rename the destination registers. - for (int dest_idx = 0; dest_idx < num_dest_regs; dest_idx++) - { - RegIndex dest_reg = inst->destRegIdx(dest_idx); - - // Get the physical register that the destination will be - // renamed to. - rename_result = renameMap->rename(dest_reg); - - DPRINTF(Rename, "Rename: Renaming arch reg %i to physical " - "reg %i.\n", (int)dest_reg, - (int)rename_result.first); - - // Record the rename information so that a history can be kept. - RenameHistory hb_entry(inst->seqNum, dest_reg, - rename_result.first, - rename_result.second); - - historyBuffer.push_front(hb_entry); - - DPRINTF(Rename, "Rename: Adding instruction to history buffer, " - "sequence number %lli.\n", - (*historyBuffer.begin()).instSeqNum); - - // Tell the instruction to rename the appropriate destination - // register (dest_idx) to the new physical register - // (rename_result.first), and record the previous physical - // register that the same logical register was renamed to - // (rename_result.second). - inst->renameDestReg(dest_idx, - rename_result.first, - rename_result.second); - } - // If it's an instruction with no destination registers, then put // a placeholder within the history buffer. It might be better // to not put it in the history buffer at all (other than branches, @@ -337,6 +367,45 @@ SimpleRename<Impl>::renameDestRegs(DynInstPtr &inst) DPRINTF(Rename, "Rename: Adding placeholder instruction to " "history buffer, sequence number %lli.\n", inst->seqNum); + + ++renameHBPlaceHolders; + } else { + + // Rename the destination registers. + for (int dest_idx = 0; dest_idx < num_dest_regs; dest_idx++) + { + RegIndex dest_reg = inst->destRegIdx(dest_idx); + + // Get the physical register that the destination will be + // renamed to. + rename_result = renameMap->rename(dest_reg); + + DPRINTF(Rename, "Rename: Renaming arch reg %i to physical " + "reg %i.\n", (int)dest_reg, + (int)rename_result.first); + + // Record the rename information so that a history can be kept. + RenameHistory hb_entry(inst->seqNum, dest_reg, + rename_result.first, + rename_result.second); + + historyBuffer.push_front(hb_entry); + + DPRINTF(Rename, "Rename: Adding instruction to history buffer, " + "sequence number %lli.\n", + (*historyBuffer.begin()).instSeqNum); + + // Tell the instruction to rename the appropriate destination + // register (dest_idx) to the new physical register + // (rename_result.first), and record the previous physical + // register that the same logical register was renamed to + // (rename_result.second). + inst->renameDestReg(dest_idx, + rename_result.first, + rename_result.second); + + ++renameRenamedOperands; + } } } @@ -379,6 +448,8 @@ SimpleRename<Impl>::tick() // buffer were used. Remove those instructions and handle // the rest of unblocking. if (_status == Unblocking) { + ++renameUnblockCycles; + if (fromDecode->size > 0) { // Add the current inputs onto the skid buffer, so they can be // reprocessed when this stage unblocks. @@ -388,6 +459,8 @@ SimpleRename<Impl>::tick() unblock(); } } else if (_status == Blocked) { + ++renameBlockCycles; + // If stage is blocked and still receiving valid instructions, // make sure to store them in the skid buffer. if (fromDecode->size > 0) { @@ -425,6 +498,8 @@ SimpleRename<Impl>::tick() return; } } else if (_status == Squashing) { + ++renameSquashCycles; + if (fromCommit->commitInfo.squash) { squash(); } else if (!fromCommit->commitInfo.squash && @@ -439,7 +514,13 @@ SimpleRename<Impl>::tick() // Ugly code, revamp all of the tick() functions eventually. if (fromCommit->commitInfo.doneSeqNum != 0 && _status != Squashing) { +#ifndef FULL_SYSTEM + if (!fromCommit->commitInfo.squash) { + removeFromHistory(fromCommit->commitInfo.doneSeqNum); + } +#else removeFromHistory(fromCommit->commitInfo.doneSeqNum); +#endif } // Perhaps put this outside of this function, since this will @@ -539,6 +620,12 @@ SimpleRename<Impl>::rename() // Tell previous stage to stall. toDecode->renameInfo.stall = true; + if (free_rob_entries <= 0) { + ++renameROBFullEvents; + } else { + ++renameIQFullEvents; + } + return; } else if (min_iq_rob < insts_available) { DPRINTF(Rename, "Rename: Will have to block this cycle. Only " @@ -548,6 +635,12 @@ SimpleRename<Impl>::rename() insts_available = min_iq_rob; block_this_cycle = true; + + if (free_rob_entries < free_iq_entries) { + ++renameROBFullEvents; + } else { + ++renameIQFullEvents; + } } while (insts_available > 0) { @@ -566,6 +659,8 @@ SimpleRename<Impl>::rename() // Go to the next instruction. ++numInst; + ++renameSquashedInsts; + // Decrement how many instructions are available. --insts_available; @@ -606,6 +701,8 @@ SimpleRename<Impl>::rename() block_this_cycle = true; + ++renameFullRegistersEvents; + break; } @@ -625,6 +722,8 @@ SimpleRename<Impl>::rename() ++to_iew_index; ++numInst; + ++renameRenamedInsts; + // Decrement how many instructions are available. --insts_available; } diff --git a/cpu/beta_cpu/rename_map.cc b/cpu/beta_cpu/rename_map.cc index cb9720d28..1301202f2 100644 --- a/cpu/beta_cpu/rename_map.cc +++ b/cpu/beta_cpu/rename_map.cc @@ -72,7 +72,7 @@ SimpleRenameMap::SimpleRenameMap(unsigned _numLogicalIntRegs, floatRenameMap[index].physical_reg = float_reg_idx++; } - for (RegIndex index = numPhysicalIntRegs; + for (PhysRegIndex index = numPhysicalIntRegs; index < numPhysicalIntRegs + numLogicalFloatRegs; ++index) { floatScoreboard[index] = 1; @@ -88,7 +88,7 @@ SimpleRenameMap::SimpleRenameMap(unsigned _numLogicalIntRegs, } // Initialize the entries in the misc register scoreboard to be ready. - for (RegIndex index = numPhysicalRegs; + for (PhysRegIndex index = numPhysicalRegs; index < numPhysicalRegs + numMiscRegs; ++index) { miscScoreboard[index] = 1; diff --git a/cpu/beta_cpu/rob_impl.hh b/cpu/beta_cpu/rob_impl.hh index 862008429..86c4e2db1 100644 --- a/cpu/beta_cpu/rob_impl.hh +++ b/cpu/beta_cpu/rob_impl.hh @@ -139,9 +139,7 @@ bool ROB<Impl>::isHeadReady() { if (numInstsInROB != 0) { - DynInstPtr head_inst = cpu->instList.front(); - - return head_inst->readyToCommit(); + return cpu->instList.front()->readyToCommit(); } return false; diff --git a/cpu/beta_cpu/store_set.cc b/cpu/beta_cpu/store_set.cc index 46d763d37..a5458685d 100644 --- a/cpu/beta_cpu/store_set.cc +++ b/cpu/beta_cpu/store_set.cc @@ -5,6 +5,8 @@ StoreSet::StoreSet(int _SSIT_size, int _LFST_size) : SSIT_size(_SSIT_size), LFST_size(_LFST_size) { DPRINTF(StoreSet, "StoreSet: Creating store set object.\n"); + DPRINTF(StoreSet, "StoreSet: SSIT size: %i, LFST size: %i.\n", + SSIT_size, LFST_size); SSIT = new SSID[SSIT_size]; @@ -31,11 +33,13 @@ StoreSet::StoreSet(int _SSIT_size, int _LFST_size) } void -StoreSet::violation(Addr load_PC, Addr store_PC) +StoreSet::violation(Addr store_PC, Addr load_PC) { int load_index = calcIndex(load_PC); int store_index = calcIndex(store_PC); + assert(load_index < SSIT_size && store_index < SSIT_size); + bool valid_load_SSID = validSSIT[load_index]; bool valid_store_SSID = validSSIT[store_index]; @@ -51,7 +55,14 @@ StoreSet::violation(Addr load_PC, Addr store_PC) SSIT[store_index] = new_set; + assert(new_set < LFST_size); + SSCounters[new_set]++; + + + DPRINTF(StoreSet, "StoreSet: Neither load nor store had a valid " + "storeset, creating a new one: %i for load %#x, store %#x\n", + new_set, load_PC, store_PC); } else if (valid_load_SSID && !valid_store_SSID) { SSID load_SSID = SSIT[load_index]; @@ -59,7 +70,13 @@ StoreSet::violation(Addr load_PC, Addr store_PC) SSIT[store_index] = load_SSID; + assert(load_SSID < LFST_size); + SSCounters[load_SSID]++; + + DPRINTF(StoreSet, "StoreSet: Load had a valid store set. Adding " + "store to that set: %i for load %#x, store %#x\n", + load_SSID, load_PC, store_PC); } else if (!valid_load_SSID && valid_store_SSID) { SSID store_SSID = SSIT[store_index]; @@ -69,10 +86,16 @@ StoreSet::violation(Addr load_PC, Addr store_PC) // Because we are having a load point to an already existing set, // the size of the store set is not incremented. + + DPRINTF(StoreSet, "StoreSet: Store had a valid store set: %i for " + "load %#x, store %#x\n", + store_SSID, load_PC, store_PC); } else { SSID load_SSID = SSIT[load_index]; SSID store_SSID = SSIT[store_index]; + assert(load_SSID < LFST_size && store_SSID < LFST_size); + int load_SS_size = SSCounters[load_SSID]; int store_SS_size = SSCounters[store_SSID]; @@ -83,11 +106,19 @@ StoreSet::violation(Addr load_PC, Addr store_PC) SSCounters[load_SSID]++; SSCounters[store_SSID]--; + + DPRINTF(StoreSet, "StoreSet: Load had bigger store set: %i; " + "for load %#x, store %#x\n", + load_SSID, load_PC, store_PC); } else { SSIT[load_index] = store_SSID; SSCounters[store_SSID]++; SSCounters[load_SSID]--; + + DPRINTF(StoreSet, "StoreSet: Store had bigger store set: %i; " + "for load %#x, store %#x\n", + store_SSID, load_PC, store_PC); } } } @@ -106,6 +137,8 @@ StoreSet::insertStore(Addr store_PC, InstSeqNum store_seq_num) int store_SSID; + assert(index < SSIT_size); + if (!validSSIT[index]) { // Do nothing if there's no valid entry. return; @@ -116,6 +149,11 @@ StoreSet::insertStore(Addr store_PC, InstSeqNum store_seq_num) // Update the last store that was fetched with the current one. LFST[store_SSID] = store_seq_num; + + validLFST[store_SSID] = 1; + + DPRINTF(StoreSet, "Store %#x updated the LFST, SSID: %i\n", + store_PC, store_SSID); } } @@ -126,7 +164,12 @@ StoreSet::checkInst(Addr PC) int inst_SSID; + assert(index < SSIT_size); + if (!validSSIT[index]) { + DPRINTF(StoreSet, "Inst %#x with index %i had no SSID\n", + PC, index); + // Return 0 if there's no valid entry. return 0; } else { @@ -135,8 +178,15 @@ StoreSet::checkInst(Addr PC) assert(inst_SSID < LFST_size); if (!validLFST[inst_SSID]) { + + DPRINTF(StoreSet, "Inst %#x with index %i and SSID %i had no " + "dependency\n", PC, index, inst_SSID); + return 0; } else { + DPRINTF(StoreSet, "Inst %#x with index %i and SSID %i had LFST " + "inum of %i\n", PC, index, inst_SSID, LFST[inst_SSID]); + return LFST[inst_SSID]; } } @@ -154,14 +204,21 @@ StoreSet::issued(Addr issued_PC, InstSeqNum issued_seq_num, bool is_store) int store_SSID; + assert(index < SSIT_size); + // Make sure the SSIT still has a valid entry for the issued store. - assert(validSSIT[index]); + if (!validSSIT[index]) { + return; + } store_SSID = SSIT[index]; + assert(store_SSID < LFST_size); + // If the last fetched store in the store set refers to the store that // was just issued, then invalidate the entry. if (validLFST[store_SSID] && LFST[store_SSID] == issued_seq_num) { + DPRINTF(StoreSet, "StoreSet: store invalidated itself in LFST.\n"); validLFST[store_SSID] = false; } } @@ -170,9 +227,14 @@ void StoreSet::squash(InstSeqNum squashed_num) { // Not really sure how to do this well. + // Generally this is small enough that it should be okay; short circuit + // evaluation should take care of invalid entries. + + DPRINTF(StoreSet, "StoreSet: Squashing until inum %i\n", + squashed_num); for (int i = 0; i < LFST_size; ++i) { - if (LFST[i] < squashed_num) { + if (validLFST[i] && LFST[i] < squashed_num) { validLFST[i] = false; } } diff --git a/cpu/beta_cpu/store_set.hh b/cpu/beta_cpu/store_set.hh index 701c60a2d..b634a180d 100644 --- a/cpu/beta_cpu/store_set.hh +++ b/cpu/beta_cpu/store_set.hh @@ -14,7 +14,7 @@ class StoreSet public: StoreSet(int SSIT_size, int LFST_size); - void violation(Addr load_PC, Addr store_PC); + void violation(Addr store_PC, Addr load_PC); void insertLoad(Addr load_PC, InstSeqNum load_seq_num); diff --git a/cpu/beta_cpu/tournament_pred.cc b/cpu/beta_cpu/tournament_pred.cc new file mode 100644 index 000000000..53a11326a --- /dev/null +++ b/cpu/beta_cpu/tournament_pred.cc @@ -0,0 +1,243 @@ +#include "cpu/beta_cpu/tournament_pred.hh" + +TournamentBP::SatCounter::SatCounter(unsigned bits) + : maxVal((1 << bits) - 1), counter(0) +{ +} + +TournamentBP::SatCounter::SatCounter(unsigned bits, unsigned initial_val) + : maxVal((1 << bits) - 1), counter(initial_val) +{ + // Check to make sure initial value doesn't exceed the max counter value. + if (initial_val > maxVal) { + panic("BP: Initial counter value exceeds max size."); + } +} + +void +TournamentBP::SatCounter::increment() +{ + if (counter < maxVal) { + ++counter; + } +} + +void +TournamentBP::SatCounter::decrement() +{ + if (counter > 0) { + --counter; + } +} + +TournamentBP::TournamentBP(unsigned _local_predictor_size, + unsigned _local_ctr_bits, + unsigned _local_history_table_size, + unsigned _local_history_bits, + unsigned _global_predictor_size, + unsigned _global_ctr_bits, + unsigned _global_history_bits, + unsigned _choice_predictor_size, + unsigned _choice_ctr_bits, + unsigned _instShiftAmt) + : local_predictor_size(_local_predictor_size), + local_ctr_bits(_local_ctr_bits), + local_history_table_size(_local_history_table_size), + local_history_bits(_local_history_bits), + global_predictor_size(_global_predictor_size), + global_ctr_bits(_global_ctr_bits), + global_history_bits(_global_history_bits), + choice_predictor_size(_global_predictor_size), + choice_ctr_bits(_choice_ctr_bits), + instShiftAmt(_instShiftAmt) +{ + //Should do checks here to make sure sizes are correct (powers of 2) + + //Setup the array of counters for the local predictor + local_ctrs = new SatCounter[local_predictor_size](local_ctr_bits); + //Setup the history table for the local table + local_history_table = new unsigned[local_history_table_size](0); + // Setup the local history mask + localHistoryMask = (1 << local_history_bits) - 1; + + //Setup the array of counters for the global predictor + global_ctrs = new SatCounter[global_predictor_size](global_ctr_bits); + //Clear the global history + global_history = 0; + // Setup the global history mask + globalHistoryMask = (1 << global_history_bits) - 1; + + //Setup the array of counters for the choice predictor + choice_ctrs = new SatCounter[choice_predictor_size](choice_ctr_bits); + + threshold = (1 << (local_ctr_bits - 1)) - 1; + threshold = threshold / 2; +} + +inline +unsigned +TournamentBP::calcLocHistIdx(Addr &branch_addr) +{ + return (branch_addr >> instShiftAmt) & (local_history_table_size - 1); +} + +inline +void +TournamentBP::updateHistoriesTaken(unsigned local_history_idx) +{ + global_history = (global_history << 1) | 1; + global_history = global_history & globalHistoryMask; + + local_history_table[local_history_idx] = + (local_history_table[local_history_idx] << 1) | 1; +} + +inline +void +TournamentBP::updateHistoriesNotTaken(unsigned local_history_idx) +{ + global_history = (global_history << 1); + global_history = global_history & globalHistoryMask; + + local_history_table[local_history_idx] = + (local_history_table[local_history_idx] << 1); +} + +bool +TournamentBP::lookup(Addr &branch_addr) +{ + uint8_t local_prediction; + unsigned local_history_idx; + unsigned local_predictor_idx; + + uint8_t global_prediction; + uint8_t choice_prediction; + + //Lookup in the local predictor to get its branch prediction + local_history_idx = calcLocHistIdx(branch_addr); + local_predictor_idx = local_history_table[local_history_idx] + & localHistoryMask; + local_prediction = local_ctrs[local_predictor_idx].read(); + + //Lookup in the global predictor to get its branch prediction + global_prediction = global_ctrs[global_history].read(); + + //Lookup in the choice predictor to see which one to use + choice_prediction = choice_ctrs[global_history].read(); + + //@todo Put a threshold value in for the three predictors that can + // be set through the constructor (so this isn't hard coded). + //Also should put some of this code into functions. + if (choice_prediction > threshold) { + if (global_prediction > threshold) { + updateHistoriesTaken(local_history_idx); + + assert(global_history < global_predictor_size && + local_history_idx < local_predictor_size); + + global_ctrs[global_history].increment(); + local_ctrs[local_history_idx].increment(); + + return true; + } else { + updateHistoriesNotTaken(local_history_idx); + + assert(global_history < global_predictor_size && + local_history_idx < local_predictor_size); + + global_ctrs[global_history].decrement(); + local_ctrs[local_history_idx].decrement(); + + return false; + } + } else { + if (local_prediction > threshold) { + updateHistoriesTaken(local_history_idx); + + assert(global_history < global_predictor_size && + local_history_idx < local_predictor_size); + + global_ctrs[global_history].increment(); + local_ctrs[local_history_idx].increment(); + + return true; + } else { + updateHistoriesNotTaken(local_history_idx); + + assert(global_history < global_predictor_size && + local_history_idx < local_predictor_size); + + global_ctrs[global_history].decrement(); + local_ctrs[local_history_idx].decrement(); + + return false; + } + } +} + +// Update the branch predictor if it predicted a branch wrong. +void +TournamentBP::update(Addr &branch_addr, unsigned correct_gh, bool taken) +{ + + uint8_t local_prediction; + unsigned local_history_idx; + unsigned local_predictor_idx; + bool local_pred_taken; + + uint8_t global_prediction; + bool global_pred_taken; + + // Load the correct global history into the register. + global_history = correct_gh; + + // Get the local predictor's current prediction, remove the incorrect + // update, and update the local predictor + local_history_idx = calcLocHistIdx(branch_addr); + local_predictor_idx = local_history_table[local_history_idx]; + local_predictor_idx = (local_predictor_idx >> 1) & localHistoryMask; + + local_prediction = local_ctrs[local_predictor_idx].read(); + local_pred_taken = local_prediction > threshold; + + //Get the global predictor's current prediction, and update the + //global predictor + global_prediction = global_ctrs[global_history].read(); + global_pred_taken = global_prediction > threshold; + + //Update the choice predictor to tell it which one was correct + if (local_pred_taken != global_pred_taken) { + //If the local prediction matches the actual outcome, decerement + //the counter. Otherwise increment the counter. + if (local_pred_taken == taken) { + choice_ctrs[global_history].decrement(); + } else { + choice_ctrs[global_history].increment(); + } + } + + if (taken) { + assert(global_history < global_predictor_size && + local_predictor_idx < local_predictor_size); + + local_ctrs[local_predictor_idx].increment(); + global_ctrs[global_history].increment(); + + global_history = (global_history << 1) | 1; + global_history = global_history & globalHistoryMask; + + local_history_table[local_history_idx] |= 1; + } + else { + assert(global_history < global_predictor_size && + local_predictor_idx < local_predictor_size); + + local_ctrs[local_predictor_idx].decrement(); + global_ctrs[global_history].decrement(); + + global_history = (global_history << 1); + global_history = global_history & globalHistoryMask; + + local_history_table[local_history_idx] &= ~1; + } +} diff --git a/cpu/beta_cpu/tournament_pred.hh b/cpu/beta_cpu/tournament_pred.hh new file mode 100644 index 000000000..bf87d753b --- /dev/null +++ b/cpu/beta_cpu/tournament_pred.hh @@ -0,0 +1,160 @@ +#ifndef __TOURNAMENT_PRED_HH__ +#define __TOURNAMENT_PRED_HH__ + +// For Addr type. +#include "arch/alpha/isa_traits.hh" + +class TournamentBP +{ + public: + /** + * Default branch predictor constructor. + */ + TournamentBP(unsigned local_predictor_size, + unsigned local_ctr_bits, + unsigned local_history_table_size, + unsigned local_history_bits, + unsigned global_predictor_size, + unsigned global_history_bits, + unsigned global_ctr_bits, + unsigned choice_predictor_size, + unsigned choice_ctr_bits, + unsigned instShiftAmt); + + /** + * Looks up the given address in the branch predictor and returns + * a true/false value as to whether it is taken. + * @param branch_addr The address of the branch to look up. + * @return Whether or not the branch is taken. + */ + bool lookup(Addr &branch_addr); + + /** + * Updates the branch predictor with the actual result of a branch. + * @param branch_addr The address of the branch to update. + * @param taken Whether or not the branch was taken. + */ + void update(Addr &branch_addr, unsigned global_history, bool taken); + + inline unsigned readGlobalHist() { return global_history; } + + private: + + inline bool getPrediction(uint8_t &count); + + inline unsigned calcLocHistIdx(Addr &branch_addr); + + inline void updateHistoriesTaken(unsigned local_history_idx); + + inline void updateHistoriesNotTaken(unsigned local_history_idx); + + /** + * Private counter class for the internal saturating counters. + * Implements an n bit saturating counter and provides methods to + * increment, decrement, and read it. + * @todo Consider making this something that more closely mimics a + * built in class so you can use ++ or --. + */ + class SatCounter + { + public: + /** + * Constructor for the counter. + * @param bits How many bits the counter will have. + */ + SatCounter(unsigned bits); + + /** + * Constructor for the counter. + * @param bits How many bits the counter will have. + * @param initial_val Starting value for each counter. + */ + SatCounter(unsigned bits, unsigned initial_val); + + /** + * Increments the counter's current value. + */ + void increment(); + + /** + * Decrements the counter's current value. + */ + void decrement(); + + /** + * Read the counter's value. + */ + uint8_t read() + { + return counter; + } + + private: + uint8_t maxVal; + uint8_t counter; + }; + + /** Local counters. */ + SatCounter *local_ctrs; + + /** Size of the local predictor. */ + unsigned local_predictor_size; + + /** Number of bits of the local predictor's counters. */ + unsigned local_ctr_bits; + + /** Array of local history table entries. */ + unsigned *local_history_table; + + /** Size of the local history table. */ + unsigned local_history_table_size; + + /** Number of bits for each entry of the local history table. + * @todo Doesn't this come from the size of the local predictor? + */ + unsigned local_history_bits; + + /** Mask to get the proper local history. */ + unsigned localHistoryMask; + + + /** Array of counters that make up the global predictor. */ + SatCounter *global_ctrs; + + /** Size of the global predictor. */ + unsigned global_predictor_size; + + /** Number of bits of the global predictor's counters. */ + unsigned global_ctr_bits; + + /** Global history register. */ + unsigned global_history; + + /** Number of bits for the global history. */ + unsigned global_history_bits; + + /** Mask to get the proper global history. */ + unsigned globalHistoryMask; + + + /** Array of counters that make up the choice predictor. */ + SatCounter *choice_ctrs; + + /** Size of the choice predictor (identical to the global predictor). */ + unsigned choice_predictor_size; + + /** Number of bits of the choice predictor's counters. */ + unsigned choice_ctr_bits; + + /** Number of bits to shift the instruction over to get rid of the word + * offset. + */ + unsigned instShiftAmt; + + /** Threshold for the counter value; above the threshold is taken, + * equal to or below the threshold is not taken. + */ + unsigned threshold; +}; + +#endif // __TOURNAMENT_PRED_HH__ |