diff options
36 files changed, 786 insertions, 152 deletions
diff --git a/cpu/o3/2bit_local_pred.cc b/cpu/o3/2bit_local_pred.cc index 458fbd663..eab98531d 100644 --- a/cpu/o3/2bit_local_pred.cc +++ b/cpu/o3/2bit_local_pred.cc @@ -67,6 +67,14 @@ DefaultBP::DefaultBP(unsigned _localPredictorSize, instShiftAmt); } +void +DefaultBP::reset() +{ + for (int i = 0; i < localPredictorSets; ++i) { + localCtrs[i].reset(); + } +} + bool DefaultBP::lookup(Addr &branch_addr) { diff --git a/cpu/o3/2bit_local_pred.hh b/cpu/o3/2bit_local_pred.hh index 38d3f4842..0dfe53819 100644 --- a/cpu/o3/2bit_local_pred.hh +++ b/cpu/o3/2bit_local_pred.hh @@ -62,6 +62,8 @@ class DefaultBP */ void update(Addr &branch_addr, bool taken); + void reset(); + private: /** diff --git a/cpu/o3/alpha_cpu_impl.hh b/cpu/o3/alpha_cpu_impl.hh index 86f7d9f28..7a2d5d2b9 100644 --- a/cpu/o3/alpha_cpu_impl.hh +++ b/cpu/o3/alpha_cpu_impl.hh @@ -151,6 +151,26 @@ template <class Impl> void AlphaFullCPU<Impl>::AlphaXC::takeOverFrom(ExecContext *old_context) { + // some things should already be set up + assert(getMemPtr() == old_context->getMemPtr()); +#if FULL_SYSTEM + assert(getSystemPtr() == old_context->getSystemPtr()); +#else + assert(getProcessPtr() == old_context->getProcessPtr()); +#endif + + // copy over functional state + setStatus(old_context->status()); + copyArchRegs(old_context); + setCpuId(old_context->readCpuId()); +#if !FULL_SYSTEM + thread->funcExeInst = old_context->readFuncExeInst(); +#endif + + old_context->setStatus(ExecContext::Unallocated); + + thread->inSyscall = false; + thread->trapPending = false; } template <class Impl> diff --git a/cpu/o3/bpred_unit.hh b/cpu/o3/bpred_unit.hh index 67c300989..ee7ffc183 100644 --- a/cpu/o3/bpred_unit.hh +++ b/cpu/o3/bpred_unit.hh @@ -67,6 +67,10 @@ class TwobitBPredUnit */ void regStats(); + void switchOut(); + + void takeOverFrom(); + /** * Predicts whether or not the instruction is a taken branch, and the * target of the branch if it is taken. 
diff --git a/cpu/o3/bpred_unit_impl.hh b/cpu/o3/bpred_unit_impl.hh index f79b67b6c..872c0c62e 100644 --- a/cpu/o3/bpred_unit_impl.hh +++ b/cpu/o3/bpred_unit_impl.hh @@ -95,6 +95,26 @@ TwobitBPredUnit<Impl>::regStats() } template <class Impl> +void +TwobitBPredUnit<Impl>::switchOut() +{ + for (int i = 0; i < Impl::MaxThreads; ++i) { + predHist[i].clear(); + } +} + +template <class Impl> +void +TwobitBPredUnit<Impl>::takeOverFrom() +{ + for (int i = 0; i < Impl::MaxThreads; ++i) + RAS[i].reset(); + + BP.reset(); + BTB.reset(); +} + +template <class Impl> bool TwobitBPredUnit<Impl>::predict(DynInstPtr &inst, Addr &PC, unsigned tid) { @@ -297,5 +317,6 @@ TwobitBPredUnit<Impl>::squash(const InstSeqNum &squashed_sn, BP.update(pred_hist.front().PC, actually_taken); BTB.update(pred_hist.front().PC, corr_target, tid); + pred_hist.pop_front(); } } diff --git a/cpu/o3/btb.cc b/cpu/o3/btb.cc index e084142d7..e5f69043a 100644 --- a/cpu/o3/btb.cc +++ b/cpu/o3/btb.cc @@ -58,6 +58,14 @@ DefaultBTB::DefaultBTB(unsigned _numEntries, tagShiftAmt = instShiftAmt + floorLog2(numEntries); } +void +DefaultBTB::reset() +{ + for (int i = 0; i < numEntries; ++i) { + btb[i].valid = false; + } +} + inline unsigned DefaultBTB::getIndex(const Addr &inst_PC) diff --git a/cpu/o3/btb.hh b/cpu/o3/btb.hh index aaa9945f7..b9ff42573 100644 --- a/cpu/o3/btb.hh +++ b/cpu/o3/btb.hh @@ -65,6 +65,8 @@ class DefaultBTB DefaultBTB(unsigned numEntries, unsigned tagBits, unsigned instShiftAmt); + void reset(); + /** Looks up an address in the BTB. Must call valid() first on the address. * @param inst_PC The address of the branch to look up. * @param tid The thread id. diff --git a/cpu/o3/commit.hh b/cpu/o3/commit.hh index f374b8fb7..028bd5295 100644 --- a/cpu/o3/commit.hh +++ b/cpu/o3/commit.hh @@ -175,6 +175,10 @@ class DefaultCommit /** Initializes stage by sending back the number of free entries. 
*/ void initStage(); + void switchOut(); + + void takeOverFrom(); + /** Ticks the commit stage, which tries to commit instructions. */ void tick(); @@ -351,6 +355,8 @@ class DefaultCommit /** Number of Active Threads */ unsigned numThreads; + bool switchedOut; + Tick trapLatency; Tick fetchTrapLatency; diff --git a/cpu/o3/commit_impl.hh b/cpu/o3/commit_impl.hh index 157e688c7..7834460e2 100644 --- a/cpu/o3/commit_impl.hh +++ b/cpu/o3/commit_impl.hh @@ -54,6 +54,7 @@ template <class Impl> void DefaultCommit<Impl>::TrapEvent::process() { + // This will get reset if it was switched out. commit->trapSquash[tid] = true; } @@ -75,7 +76,8 @@ DefaultCommit<Impl>::DefaultCommit(Params *params) renameWidth(params->renameWidth), iewWidth(params->executeWidth), commitWidth(params->commitWidth), - numThreads(params->numberOfThreads) + numThreads(params->numberOfThreads), + switchedOut(false) { _status = Active; _nextStatus = Inactive; @@ -254,6 +256,9 @@ DefaultCommit<Impl>::setCPU(FullCPU *cpu_ptr) // Commit must broadcast the number of free entries it has at the start of // the simulation, so it starts as active. 
cpu->activateStage(FullCPU::CommitIdx); + + trapLatency = cpu->cycles(6); + fetchTrapLatency = cpu->cycles(12); } template <class Impl> @@ -362,6 +367,29 @@ DefaultCommit<Impl>::initStage() template <class Impl> void +DefaultCommit<Impl>::switchOut() +{ + rob->switchOut(); +} + +template <class Impl> +void +DefaultCommit<Impl>::takeOverFrom() +{ + _status = Active; + _nextStatus = Inactive; + for (int i=0; i < numThreads; i++) { + commitStatus[i] = Idle; + changedROBNumEntries[i] = false; + trapSquash[i] = false; + xcSquash[i] = false; + } + squashCounter = 0; + rob->takeOverFrom(); +} + +template <class Impl> +void DefaultCommit<Impl>::updateStatus() { if (commitStatus[0] == TrapPending || @@ -719,8 +747,9 @@ DefaultCommit<Impl>::commit() while (threads != (*activeThreads).end()) { unsigned tid = *threads++; - if (fromFetch->fetchFault) { + if (fromFetch->fetchFault && commitStatus[0] != TrapPending) { // Record the fault. Wait until it's empty in the ROB. Then handle the trap. + // Ignore it if there's already a trap pending as fetch will be redirected. 
fetchFault = fromFetch->fetchFault; fetchFaultSN = fromFetch->fetchFaultSN; fetchFaultTick = curTick + fetchTrapLatency; @@ -975,6 +1004,7 @@ DefaultCommit<Impl>::commitInsts() } PC[tid] = nextPC[tid]; + nextPC[tid] = nextPC[tid] + sizeof(TheISA::MachInst); #if FULL_SYSTEM int count = 0; Addr oldpc; @@ -1002,6 +1032,10 @@ DefaultCommit<Impl>::commitInsts() DPRINTF(CommitRate, "%i\n", num_committed); numCommittedDist.sample(num_committed); + + if (num_committed == commitWidth) { + commit_eligible[0]++; + } } template <class Impl> diff --git a/cpu/o3/cpu.cc b/cpu/o3/cpu.cc index ac8c4236e..fc8372026 100644 --- a/cpu/o3/cpu.cc +++ b/cpu/o3/cpu.cc @@ -124,6 +124,7 @@ FullO3CPU<Impl>::FullO3CPU(Params *params) mem(params->mem), #else // pTable(params->pTable), + mem(params->workload[0]->getMemory()), #endif // FULL_SYSTEM icacheInterface(params->icacheInterface), @@ -176,9 +177,9 @@ FullO3CPU<Impl>::FullO3CPU(Params *params) numThreads = number_of_threads; #if !FULL_SYSTEM - int activeThreads = params->workload.size(); + int active_threads = params->workload.size(); #else - int activeThreads = 1; + int active_threads = 1; #endif assert(params->numPhysIntRegs >= numThreads * TheISA::NumIntRegs); @@ -192,7 +193,7 @@ FullO3CPU<Impl>::FullO3CPU(Params *params) PhysRegIndex freg_idx = params->numPhysIntRegs; //Index to 1 after int regs for (int tid=0; tid < numThreads; tid++) { - bool bindRegs = (tid <= activeThreads - 1); + bool bindRegs = (tid <= active_threads - 1); commitRenameMap[tid].init(TheISA::NumIntRegs, params->numPhysIntRegs, @@ -357,7 +358,7 @@ FullO3CPU<Impl>::tick() } if (activityCount && !tickEvent.scheduled()) { - tickEvent.schedule(curTick + 1); + tickEvent.schedule(curTick + cycles(1)); } #if !FULL_SYSTEM @@ -370,8 +371,8 @@ template <class Impl> void FullO3CPU<Impl>::init() { - if (deferRegistration) { - return; + if (!deferRegistration) { + registerExecContexts(); } // Set inSyscall so that the CPU doesn't squash when initially @@ -379,7 +380,6 @@ 
FullO3CPU<Impl>::init() for (int i = 0; i < number_of_threads; ++i) thread[i]->inSyscall = true; - registerExecContexts(); // Need to do a copy of the xc->regs into the CPU's regfile so // that it can start properly. @@ -388,7 +388,7 @@ FullO3CPU<Impl>::init() // Need to do a copy of the xc->regs into the CPU's regfile so // that it can start properly. #if FULL_SYSTEM - ExecContext *src_xc = system->execContexts[tid]; + ExecContext *src_xc = execContexts[tid]; #else ExecContext *src_xc = thread[tid]->getXCProxy(); #endif @@ -584,7 +584,7 @@ FullO3CPU<Impl>::activateContext(int tid, int delay) activeThreads.push_back(tid); } - assert(_status == Idle); + assert(_status == Idle || _status == SwitchedOut); scheduleTickEvent(delay); @@ -658,21 +658,64 @@ FullO3CPU<Impl>::haltContext(int tid) template <class Impl> void -FullO3CPU<Impl>::switchOut() +FullO3CPU<Impl>::switchOut(Sampler *sampler) { - panic("FullO3CPU does not have a switch out function.\n"); +// panic("FullO3CPU does not have a switch out function.\n"); + fetch.switchOut(); + decode.switchOut(); + rename.switchOut(); + iew.switchOut(); + commit.switchOut(); + if (tickEvent.scheduled()) + tickEvent.squash(); + sampler->signalSwitched(); + _status = SwitchedOut; } template <class Impl> void FullO3CPU<Impl>::takeOverFrom(BaseCPU *oldCPU) { + for (int i = 0; i < 6; ++i) { + timeBuffer.advance(); + fetchQueue.advance(); + decodeQueue.advance(); + renameQueue.advance(); + iewQueue.advance(); + activityBuffer.advance(); + } + + activityCount = 0; + bzero(&stageActive, sizeof(stageActive)); + BaseCPU::takeOverFrom(oldCPU); + fetch.takeOverFrom(); + decode.takeOverFrom(); + rename.takeOverFrom(); + iew.takeOverFrom(); + commit.takeOverFrom(); + assert(!tickEvent.scheduled()); + // @todo: Figure out how to properly select the tid to put onto the active threads list. 
+ int tid = 0; + + list<unsigned>::iterator isActive = find( + activeThreads.begin(), activeThreads.end(), tid); + + if (isActive == activeThreads.end()) { + //May Need to Re-code this if the delay variable is the + //delay needed for thread to activate + DPRINTF(FullCPU, "Adding Thread %i to active threads list\n", + tid); + + activeThreads.push_back(tid); + } + // Set all status's to active, schedule the // CPU's tick event. + // @todo: Fix up statuses so this is handled properly for (int i = 0; i < execContexts.size(); ++i) { ExecContext *xc = execContexts[i]; if (xc->status() == ExecContext::Active && _status != Running) { @@ -680,6 +723,8 @@ FullO3CPU<Impl>::takeOverFrom(BaseCPU *oldCPU) tickEvent.schedule(curTick); } } + if (!tickEvent.scheduled()) + tickEvent.schedule(curTick); } template <class Impl> @@ -758,7 +803,8 @@ template <class Impl> float FullO3CPU<Impl>::readArchFloatRegSingle(int reg_idx, unsigned tid) { - PhysRegIndex phys_reg = commitRenameMap[tid].lookup(reg_idx); + int idx = reg_idx + TheISA::FP_Base_DepTag; + PhysRegIndex phys_reg = commitRenameMap[tid].lookup(idx); return regFile.readFloatRegSingle(phys_reg); } @@ -767,7 +813,8 @@ template <class Impl> double FullO3CPU<Impl>::readArchFloatRegDouble(int reg_idx, unsigned tid) { - PhysRegIndex phys_reg = commitRenameMap[tid].lookup(reg_idx); + int idx = reg_idx + TheISA::FP_Base_DepTag; + PhysRegIndex phys_reg = commitRenameMap[tid].lookup(idx); return regFile.readFloatRegDouble(phys_reg); } @@ -776,7 +823,8 @@ template <class Impl> uint64_t FullO3CPU<Impl>::readArchFloatRegInt(int reg_idx, unsigned tid) { - PhysRegIndex phys_reg = commitRenameMap[tid].lookup(reg_idx); + int idx = reg_idx + TheISA::FP_Base_DepTag; + PhysRegIndex phys_reg = commitRenameMap[tid].lookup(idx); return regFile.readFloatRegInt(phys_reg); } diff --git a/cpu/o3/cpu.hh b/cpu/o3/cpu.hh index 91eaf9d6f..621ddf541 100644 --- a/cpu/o3/cpu.hh +++ b/cpu/o3/cpu.hh @@ -82,7 +82,8 @@ class FullO3CPU : public BaseFullCPU 
Running, Idle, Halted, - Blocked + Blocked, + SwitchedOut }; /** Overall CPU status. */ @@ -112,9 +113,9 @@ class FullO3CPU : public BaseFullCPU void scheduleTickEvent(int delay) { if (tickEvent.squashed()) - tickEvent.reschedule(curTick + delay); + tickEvent.reschedule(curTick + cycles(delay)); else if (!tickEvent.scheduled()) - tickEvent.schedule(curTick + delay); + tickEvent.schedule(curTick + cycles(delay)); } /** Unschedule tick event, regardless of its current state. */ @@ -196,7 +197,7 @@ class FullO3CPU : public BaseFullCPU /** Switches out this CPU. * @todo: Implement this. */ - void switchOut(); + void switchOut(Sampler *sampler); /** Takes over from another CPU. * @todo: Implement this. diff --git a/cpu/o3/decode.hh b/cpu/o3/decode.hh index 279ff556e..3f3f68247 100644 --- a/cpu/o3/decode.hh +++ b/cpu/o3/decode.hh @@ -107,6 +107,9 @@ class DefaultDecode /** Sets pointer to list of active threads. */ void setActiveThreads(std::list<unsigned> *at_ptr); + void switchOut(); + + void takeOverFrom(); /** Ticks decode, processing all input signals and decoding as many * instructions as possible. */ @@ -272,6 +275,8 @@ class DefaultDecode Stats::Scalar<> decodeUnblockCycles; /** Stat for total number of squashing cycles. */ Stats::Scalar<> decodeSquashCycles; + /** Stat for number of times a branch is resolved at decode. */ + Stats::Scalar<> decodeBranchResolved; /** Stat for number of times a branch mispredict is detected. 
*/ Stats::Scalar<> decodeBranchMispred; /** Stat for number of times decode detected a non-control instruction diff --git a/cpu/o3/decode_impl.hh b/cpu/o3/decode_impl.hh index f1aea27b4..caa97067b 100644 --- a/cpu/o3/decode_impl.hh +++ b/cpu/o3/decode_impl.hh @@ -66,40 +66,44 @@ void DefaultDecode<Impl>::regStats() { decodeIdleCycles - .name(name() + ".decodeIdleCycles") + .name(name() + ".DECODE:IdleCycles") .desc("Number of cycles decode is idle") .prereq(decodeIdleCycles); decodeBlockedCycles - .name(name() + ".decodeBlockedCycles") + .name(name() + ".DECODE:BlockedCycles") .desc("Number of cycles decode is blocked") .prereq(decodeBlockedCycles); decodeRunCycles - .name(name() + ".decodeRunCycles") + .name(name() + ".DECODE:RunCycles") .desc("Number of cycles decode is running") .prereq(decodeRunCycles); decodeUnblockCycles - .name(name() + ".decodeUnblockCycles") + .name(name() + ".DECODE:UnblockCycles") .desc("Number of cycles decode is unblocking") .prereq(decodeUnblockCycles); decodeSquashCycles - .name(name() + ".decodeSquashCycles") + .name(name() + ".DECODE:SquashCycles") .desc("Number of cycles decode is squashing") .prereq(decodeSquashCycles); + decodeBranchResolved + .name(name() + ".DECODE:BranchResolved") + .desc("Number of times decode resolved a branch") + .prereq(decodeBranchResolved); decodeBranchMispred - .name(name() + ".decodeBranchMispred") + .name(name() + ".DECODE:BranchMispred") .desc("Number of times decode detected a branch misprediction") .prereq(decodeBranchMispred); decodeControlMispred - .name(name() + ".decodeControlMispred") + .name(name() + ".DECODE:ControlMispred") .desc("Number of times decode detected an instruction incorrectly" " predicted as a control") .prereq(decodeControlMispred); decodeDecodedInsts - .name(name() + ".decodeDecodedInsts") + .name(name() + ".DECODE:DecodedInsts") .desc("Number of instructions handled by decode") .prereq(decodeDecodedInsts); decodeSquashedInsts - .name(name() + ".decodeSquashedInsts") + 
.name(name() + ".DECODE:SquashedInsts") .desc("Number of squashed instructions handled by decode") .prereq(decodeSquashedInsts); } @@ -158,6 +162,33 @@ DefaultDecode<Impl>::setActiveThreads(list<unsigned> *at_ptr) activeThreads = at_ptr; } +template <class Impl> +void +DefaultDecode<Impl>::switchOut() +{ +} + +template <class Impl> +void +DefaultDecode<Impl>::takeOverFrom() +{ + _status = Inactive; + + for (int i = 0; i < numThreads; ++i) { + decodeStatus[i] = Idle; + + stalls[i].rename = false; + stalls[i].iew = false; + stalls[i].commit = false; + while (!insts[i].empty()) + insts[i].pop(); + while (!skidBuffer[i].empty()) + skidBuffer[i].pop(); + branchCount[i] = 0; + } + wroteToTimeBuffer = false; +} + template<class Impl> bool DefaultDecode<Impl>::checkStall(unsigned tid) const @@ -680,6 +711,7 @@ DefaultDecode<Impl>::decodeInsts(unsigned tid) // Go ahead and compute any PC-relative branches. if (inst->isDirectCtrl() && inst->isUncondCtrl()) { + ++decodeBranchResolved; inst->setNextPC(inst->branchTarget()); if (inst->mispredicted()) { diff --git a/cpu/o3/fetch.hh b/cpu/o3/fetch.hh index f0b15cb86..6074831c6 100644 --- a/cpu/o3/fetch.hh +++ b/cpu/o3/fetch.hh @@ -35,6 +35,8 @@ #include "mem/mem_interface.hh" #include "sim/eventq.hh" +class Sampler; + /** * DefaultFetch class handles both single threaded and SMT fetch. Its width is * specified by the parameters; each cycle it tries to fetch that many @@ -81,6 +83,7 @@ class DefaultFetch Fetching, TrapPending, QuiescePending, + SwitchOut, IcacheMissStall, IcacheMissComplete }; @@ -160,6 +163,12 @@ class DefaultFetch /** Processes cache completion event. */ void processCacheCompletion(MemReqPtr &req); + void switchOut(); + + void takeOverFrom(); + + bool isSwitchedOut() { return switchedOut; } + void wakeFromQuiesce(); private: @@ -360,6 +369,8 @@ class DefaultFetch bool interruptPending; + bool switchedOut; + #if !FULL_SYSTEM /** Page table pointer. 
*/ // PageTable *pTable; @@ -382,6 +393,8 @@ class DefaultFetch */ Stats::Scalar<> fetchIdleCycles; Stats::Scalar<> fetchBlockedCycles; + + Stats::Scalar<> fetchMiscStallCycles; /** Stat for total number of fetched cache lines. */ Stats::Scalar<> fetchedCacheLines; diff --git a/cpu/o3/fetch_impl.hh b/cpu/o3/fetch_impl.hh index 563a767df..92f923c65 100644 --- a/cpu/o3/fetch_impl.hh +++ b/cpu/o3/fetch_impl.hh @@ -169,53 +169,59 @@ void DefaultFetch<Impl>::regStats() { icacheStallCycles - .name(name() + ".icacheStallCycles") + .name(name() + ".FETCH:icacheStallCycles") .desc("Number of cycles fetch is stalled on an Icache miss") .prereq(icacheStallCycles); fetchedInsts - .name(name() + ".fetchedInsts") + .name(name() + ".FETCH:Insts") .desc("Number of instructions fetch has processed") .prereq(fetchedInsts); fetchedBranches - .name(name() + ".fetchedBranches") + .name(name() + ".FETCH:Branches") .desc("Number of branches that fetch encountered") .prereq(fetchedBranches); predictedBranches - .name(name() + ".predictedBranches") + .name(name() + ".FETCH:predictedBranches") .desc("Number of branches that fetch has predicted taken") .prereq(predictedBranches); fetchCycles - .name(name() + ".fetchCycles") + .name(name() + ".FETCH:Cycles") .desc("Number of cycles fetch has run and was not squashing or" " blocked") .prereq(fetchCycles); fetchSquashCycles - .name(name() + ".fetchSquashCycles") + .name(name() + ".FETCH:SquashCycles") .desc("Number of cycles fetch has spent squashing") .prereq(fetchSquashCycles); fetchIdleCycles - .name(name() + ".fetchIdleCycles") + .name(name() + ".FETCH:IdleCycles") .desc("Number of cycles fetch was idle") .prereq(fetchIdleCycles); fetchBlockedCycles - .name(name() + ".fetchBlockedCycles") + .name(name() + ".FETCH:BlockedCycles") .desc("Number of cycles fetch has spent blocked") .prereq(fetchBlockedCycles); fetchedCacheLines - .name(name() + ".fetchedCacheLines") + .name(name() + ".FETCH:CacheLines") .desc("Number of cache lines fetched") 
.prereq(fetchedCacheLines); + fetchMiscStallCycles + .name(name() + ".FETCH:MiscStallCycles") + .desc("Number of cycles fetch has spent waiting on interrupts, or " + "bad addresses, or out of MSHRs") + .prereq(fetchMiscStallCycles); + fetchIcacheSquashes - .name(name() + ".fetchIcacheSquashes") + .name(name() + ".FETCH:IcacheSquashes") .desc("Number of outstanding Icache misses that were squashed") .prereq(fetchIcacheSquashes); @@ -223,24 +229,24 @@ DefaultFetch<Impl>::regStats() .init(/* base value */ 0, /* last value */ fetchWidth, /* bucket size */ 1) - .name(name() + ".rateDist") + .name(name() + ".FETCH:rateDist") .desc("Number of instructions fetched each cycle (Total)") .flags(Stats::pdf); idleRate - .name(name() + ".idleRate") + .name(name() + ".FETCH:idleRate") .desc("Percent of cycles fetch was idle") .prereq(idleRate); idleRate = fetchIdleCycles * 100 / cpu->numCycles; branchRate - .name(name() + ".branchRate") + .name(name() + ".FETCH:branchRate") .desc("Number of branch fetches per cycle") .flags(Stats::total); branchRate = predictedBranches / cpu->numCycles; fetchRate - .name(name() + ".rate") + .name(name() + ".FETCH:rate") .desc("Number of inst fetches per cycle") .flags(Stats::total); fetchRate = fetchedInsts / cpu->numCycles; @@ -332,7 +338,8 @@ DefaultFetch<Impl>::processCacheCompletion(MemReqPtr &req) // Can keep track of how many cache accesses go unused due to // misspeculation here. 
if (fetchStatus[tid] != IcacheMissStall || - req != memReq[tid]) { + req != memReq[tid] || + isSwitchedOut()) { ++fetchIcacheSquashes; return; } @@ -362,6 +369,35 @@ DefaultFetch<Impl>::processCacheCompletion(MemReqPtr &req) template <class Impl> void +DefaultFetch<Impl>::switchOut() +{ + switchedOut = true; + branchPred.switchOut(); +} + +template <class Impl> +void +DefaultFetch<Impl>::takeOverFrom() +{ + // Reset all state + for (int i = 0; i < Impl::MaxThreads; ++i) { + stalls[i].decode = 0; + stalls[i].rename = 0; + stalls[i].iew = 0; + stalls[i].commit = 0; + PC[i] = cpu->readPC(i); + nextPC[i] = cpu->readNextPC(i); + fetchStatus[i] = Running; + } + numInst = 0; + wroteToTimeBuffer = false; + _status = Inactive; + switchedOut = false; + branchPred.takeOverFrom(); +} + +template <class Impl> +void DefaultFetch<Impl>::wakeFromQuiesce() { DPRINTF(Fetch, "Waking up from quiesce\n"); @@ -902,8 +938,10 @@ DefaultFetch<Impl>::fetch(bool &status_change) tid, fetch_PC); bool fetch_success = fetchCacheLine(fetch_PC, fault, tid); - if (!fetch_success) + if (!fetch_success) { + ++fetchMiscStallCycles; return; + } } else { if (fetchStatus[tid] == Idle) { ++fetchIdleCycles; diff --git a/cpu/o3/fu_pool.cc b/cpu/o3/fu_pool.cc index 9b6ac15d9..cb7a15061 100644 --- a/cpu/o3/fu_pool.cc +++ b/cpu/o3/fu_pool.cc @@ -242,6 +242,20 @@ FUPool::dump() } } +void +FUPool::switchOut() +{ +} + +void +FUPool::takeOverFrom() +{ + for (int i = 0; i < numFU; i++) { + unitBusy[i] = false; + } + unitsToBeFreed.clear(); +} + // //////////////////////////////////////////////////////////////////////////// diff --git a/cpu/o3/fu_pool.hh b/cpu/o3/fu_pool.hh index d7b7acadb..7df5ad5f3 100644 --- a/cpu/o3/fu_pool.hh +++ b/cpu/o3/fu_pool.hh @@ -154,6 +154,9 @@ class FUPool : public SimObject unsigned getIssueLatency(OpClass capability) { return maxIssueLatencies[capability]; } + + void switchOut(); + void takeOverFrom(); }; #endif // __CPU_O3_FU_POOL_HH__ diff --git a/cpu/o3/iew.hh b/cpu/o3/iew.hh 
index 58cd68b21..ae0ba6a21 100644 --- a/cpu/o3/iew.hh +++ b/cpu/o3/iew.hh @@ -157,6 +157,12 @@ class DefaultIEW /** Sets pointer to the scoreboard. */ void setScoreboard(Scoreboard *sb_ptr); + void switchOut(); + + void takeOverFrom(); + + bool isSwitchedOut() { return switchedOut; } + /** Sets page table pointer within LSQ. */ // void setPageTable(PageTable *pt_ptr); @@ -420,6 +426,8 @@ class DefaultIEW /** Maximum size of the skid buffer. */ unsigned skidBufferMax; + bool switchedOut; + /** Stat for total number of idle cycles. */ Stats::Scalar<> iewIdleCycles; /** Stat for total number of squashing cycles. */ diff --git a/cpu/o3/iew_impl.hh b/cpu/o3/iew_impl.hh index 2ae2e1361..42d83ee72 100644 --- a/cpu/o3/iew_impl.hh +++ b/cpu/o3/iew_impl.hh @@ -55,13 +55,13 @@ DefaultIEW<Impl>::LdWritebackEvent::process() //iewStage->ldstQueue.removeMSHR(inst->threadNumber,inst->seqNum); - iewStage->wakeCPU(); - - if (inst->isSquashed()) { + if (inst->isSquashed() || iewStage->isSwitchedOut()) { inst = NULL; return; } + iewStage->wakeCPU(); + if (!inst->isExecuted()) { inst->setExecuted(); @@ -101,7 +101,8 @@ DefaultIEW<Impl>::DefaultIEW(Params *params) issueReadWidth(params->issueWidth), issueWidth(params->issueWidth), executeWidth(params->executeWidth), - numThreads(params->numberOfThreads) + numThreads(params->numberOfThreads), + switchedOut(false) { DPRINTF(IEW, "executeIntWidth: %i.\n", params->executeIntWidth); _status = Active; @@ -436,6 +437,53 @@ DefaultIEW<Impl>::setPageTable(PageTable *pt_ptr) } #endif +template <class Impl> +void +DefaultIEW<Impl>::switchOut() +{ + switchedOut = true; + instQueue.switchOut(); + ldstQueue.switchOut(); + fuPool->switchOut(); + + for (int i = 0; i < numThreads; i++) { + while (!insts[i].empty()) + insts[i].pop(); + while (!skidBuffer[i].empty()) + skidBuffer[i].pop(); + } +} + +template <class Impl> +void +DefaultIEW<Impl>::takeOverFrom() +{ + _status = Active; + exeStatus = Running; + wbStatus = Idle; + switchedOut = false; + + 
instQueue.takeOverFrom(); + ldstQueue.takeOverFrom(); + fuPool->takeOverFrom(); + + initStage(); + cpu->activityThisCycle(); + + for (int i=0; i < numThreads; i++) { + dispatchStatus[i] = Running; + stalls[i].commit = false; + fetchRedirect[i] = false; + } + + updateLSQNextCycle = false; + + // @todo: Fix hardcoded number + for (int i = 0; i < 6; ++i) { + issueToExecQueue.advance(); + } +} + template<class Impl> void DefaultIEW<Impl>::squash(unsigned tid) diff --git a/cpu/o3/inst_queue.hh b/cpu/o3/inst_queue.hh index 06d9937f2..982294b4f 100644 --- a/cpu/o3/inst_queue.hh +++ b/cpu/o3/inst_queue.hh @@ -112,6 +112,10 @@ class InstructionQueue /** Registers statistics. */ void regStats(); + void resetState(); + + void resetDependencyGraph(); + /** Sets CPU pointer. */ void setCPU(FullCPU *_cpu) { cpu = _cpu; } @@ -127,6 +131,12 @@ class InstructionQueue /** Sets the global time buffer. */ void setTimeBuffer(TimeBuffer<TimeStruct> *tb_ptr); + void switchOut(); + + void takeOverFrom(); + + bool isSwitchedOut() { return switchedOut; } + /** Number of entries needed for given amount of threads. 
*/ int entryAmount(int num_threads); @@ -385,6 +395,8 @@ class InstructionQueue */ unsigned commitToIEWDelay; + bool switchedOut; + ////////////////////////////////// // Variables needed for squashing ////////////////////////////////// @@ -507,7 +519,7 @@ class InstructionQueue Stats::Scalar<> iqSquashedNonSpecRemoved; Stats::VectorDistribution<> queue_res_dist; - Stats::Vector<> n_issued_dist; + Stats::Distribution<> n_issued_dist; Stats::VectorDistribution<> issue_delay_dist; Stats::Vector<> stat_fu_busy; diff --git a/cpu/o3/inst_queue_impl.hh b/cpu/o3/inst_queue_impl.hh index 804bc2472..0d9cc09f3 100644 --- a/cpu/o3/inst_queue_impl.hh +++ b/cpu/o3/inst_queue_impl.hh @@ -82,15 +82,9 @@ InstructionQueue<Impl>::InstructionQueue(Params *params) { assert(fuPool); - numThreads = params->numberOfThreads; + switchedOut = false; - //Initialize thread IQ counts - for (int i = 0; i <numThreads; i++) { - count[i] = 0; - } - - // Initialize the number of free IQ entries. - freeEntries = numEntries; + numThreads = params->numberOfThreads; // Set the number of physical registers as the number of int + float numPhysRegs = numPhysIntRegs + numPhysFloatRegs; @@ -101,37 +95,24 @@ InstructionQueue<Impl>::InstructionQueue(Params *params) //dependency graph. dependGraph = new DependencyEntry[numPhysRegs]; - // Resize the register scoreboard. - regScoreboard.resize(numPhysRegs); - - //Initialize Mem Dependence Units - for (int i = 0; i < numThreads; i++) { - memDepUnit[i].init(params,i); - memDepUnit[i].setIQ(this); - } - // Initialize all the head pointers to point to NULL, and all the // entries as unready. - // Note that in actuality, the registers corresponding to the logical - // registers start off as ready. However this doesn't matter for the - // IQ as the instruction should have been correctly told if those - // registers are ready in rename. Thus it can all be initialized as - // unready. 
for (int i = 0; i < numPhysRegs; ++i) { dependGraph[i].next = NULL; dependGraph[i].inst = NULL; - regScoreboard[i] = false; } - for (int i = 0; i < numThreads; ++i) { - squashedSeqNum[i] = 0; - } + // Resize the register scoreboard. + regScoreboard.resize(numPhysRegs); - for (int i = 0; i < Num_OpClasses; ++i) { - queueOnList[i] = false; - readyIt[i] = listOrder.end(); + //Initialize Mem Dependence Units + for (int i = 0; i < numThreads; i++) { + memDepUnit[i].init(params,i); + memDepUnit[i].setIQ(this); } + resetState(); + string policy = params->smtIQPolicy; //Convert string to lowercase @@ -184,30 +165,7 @@ InstructionQueue<Impl>::InstructionQueue(Params *params) template <class Impl> InstructionQueue<Impl>::~InstructionQueue() { - // Clear the dependency graph - DependencyEntry *curr; - DependencyEntry *prev; - - for (int i = 0; i < numPhysRegs; ++i) { - curr = dependGraph[i].next; - - while (curr) { - DependencyEntry::mem_alloc_counter--; - - prev = curr; - curr = prev->next; - prev->inst = NULL; - - delete prev; - } - - if (dependGraph[i].inst) { - dependGraph[i].inst = NULL; - } - - dependGraph[i].next = NULL; - } - + resetDependencyGraph(); assert(DependencyEntry::mem_alloc_counter == 0); delete [] dependGraph; @@ -307,10 +265,10 @@ InstructionQueue<Impl>::regStats() queue_res_dist.subname(i, opClassStrings[i]); } n_issued_dist - .init(totalWidth + 1) + .init(0,totalWidth,1) .name(name() + ".ISSUE:issued_per_cycle") .desc("Number of insts issued each cycle") - .flags(total | pdf | dist) + .flags(pdf) ; /* dist_unissued @@ -402,6 +360,71 @@ InstructionQueue<Impl>::regStats() template <class Impl> void +InstructionQueue<Impl>::resetState() +{ + //Initialize thread IQ counts + for (int i = 0; i <numThreads; i++) { + count[i] = 0; + instList[i].clear(); + } + + // Initialize the number of free IQ entries. + freeEntries = numEntries; + + // Note that in actuality, the registers corresponding to the logical + // registers start off as ready. 
However this doesn't matter for the + // IQ as the instruction should have been correctly told if those + // registers are ready in rename. Thus it can all be initialized as + // unready. + for (int i = 0; i < numPhysRegs; ++i) { + regScoreboard[i] = false; + } + + for (int i = 0; i < numThreads; ++i) { + squashedSeqNum[i] = 0; + } + + for (int i = 0; i < Num_OpClasses; ++i) { + while (!readyInsts[i].empty()) + readyInsts[i].pop(); + queueOnList[i] = false; + readyIt[i] = listOrder.end(); + } + nonSpecInsts.clear(); + listOrder.clear(); +} + +template <class Impl> +void +InstructionQueue<Impl>::resetDependencyGraph() +{ + // Clear the dependency graph + DependencyEntry *curr; + DependencyEntry *prev; + + for (int i = 0; i < numPhysRegs; ++i) { + curr = dependGraph[i].next; + + while (curr) { + DependencyEntry::mem_alloc_counter--; + + prev = curr; + curr = prev->next; + prev->inst = NULL; + + delete prev; + } + + if (dependGraph[i].inst) { + dependGraph[i].inst = NULL; + } + + dependGraph[i].next = NULL; + } +} + +template <class Impl> +void InstructionQueue<Impl>::setActiveThreads(list<unsigned> *at_ptr) { DPRINTF(IQ, "Setting active threads list pointer.\n"); @@ -427,6 +450,25 @@ InstructionQueue<Impl>::setTimeBuffer(TimeBuffer<TimeStruct> *tb_ptr) } template <class Impl> +void +InstructionQueue<Impl>::switchOut() +{ + resetState(); + resetDependencyGraph(); + switchedOut = true; + for (int i = 0; i < numThreads; ++i) { + memDepUnit[i].switchOut(); + } +} + +template <class Impl> +void +InstructionQueue<Impl>::takeOverFrom() +{ + switchedOut = false; +} + +template <class Impl> int InstructionQueue<Impl>::entryAmount(int num_threads) { @@ -685,6 +727,10 @@ InstructionQueue<Impl>::processFUCompletion(DynInstPtr &inst, int fu_idx) { // The CPU could have been sleeping until this op completed (*extremely* // long latency op). Wake it if it was. This may be overkill. 
+ if (isSwitchedOut()) { + return; + } + iewStage->wakeCPU(); fuPool->freeUnit(fu_idx); @@ -816,7 +862,7 @@ InstructionQueue<Impl>::scheduleReadyInsts() FUCompletion *execution = new FUCompletion(issuing_inst, idx, this); - execution->schedule(curTick + issue_latency - 1); + execution->schedule(curTick + cpu->cycles(issue_latency - 1)); } else { i2e_info->insts[exec_queue_slot++] = issuing_inst; i2e_info->size++; @@ -862,6 +908,8 @@ InstructionQueue<Impl>::scheduleReadyInsts() } } + n_issued_dist.sample(total_issued); + if (total_issued) { cpu->activityThisCycle(); } else { diff --git a/cpu/o3/lsq.hh b/cpu/o3/lsq.hh index c59b5f13b..d5f893e57 100644 --- a/cpu/o3/lsq.hh +++ b/cpu/o3/lsq.hh @@ -71,6 +71,9 @@ class LSQ { /** Sets the page table pointer. */ // void setPageTable(PageTable *pt_ptr); + void switchOut(); + void takeOverFrom(); + /** Number of entries needed for the given amount of threads.*/ int entryAmount(int num_threads); void removeEntries(unsigned tid); @@ -271,15 +274,6 @@ class LSQ { /** Max SQ Size - Used to Enforce Sharing Policies. */ unsigned maxSQEntries; - /** Global Load Count. */ - int loads; - - /** Global Store Count */ - int stores; - - /** Global Store To WB Count */ - int storesToWB; - /** Number of Threads. 
*/ unsigned numThreads; }; diff --git a/cpu/o3/lsq_impl.hh b/cpu/o3/lsq_impl.hh index 523517869..c43c19619 100644 --- a/cpu/o3/lsq_impl.hh +++ b/cpu/o3/lsq_impl.hh @@ -33,7 +33,6 @@ using namespace std; template <class Impl> LSQ<Impl>::LSQ(Params *params) : LQEntries(params->LQEntries), SQEntries(params->SQEntries), - loads(0), stores(0), storesToWB(0), numThreads(params->numberOfThreads) { DPRINTF(LSQ, "Creating LSQ object.\n"); @@ -144,6 +143,24 @@ LSQ<Impl>::setPageTable(PageTable *pt_ptr) #endif template <class Impl> +void +LSQ<Impl>::switchOut() +{ + for (int tid = 0; tid < numThreads; tid++) { + thread[tid].switchOut(); + } +} + +template <class Impl> +void +LSQ<Impl>::takeOverFrom() +{ + for (int tid = 0; tid < numThreads; tid++) { + thread[tid].takeOverFrom(); + } +} + +template <class Impl> int LSQ<Impl>::entryAmount(int num_threads) { diff --git a/cpu/o3/lsq_unit.hh b/cpu/o3/lsq_unit.hh index ba8b1d2e2..d17efe96a 100644 --- a/cpu/o3/lsq_unit.hh +++ b/cpu/o3/lsq_unit.hh @@ -38,6 +38,7 @@ #include "cpu/inst_seq.hh" #include "mem/mem_interface.hh" //#include "mem/page_table.hh" +#include "sim/debug.hh" #include "sim/sim_object.hh" #include "arch/faults.hh" @@ -110,6 +111,12 @@ class LSQUnit { /** Sets the page table pointer. */ // void setPageTable(PageTable *pt_ptr); + void switchOut(); + + void takeOverFrom(); + + bool isSwitchedOut() { return switchedOut; } + /** Ticks the LSQ unit, which in this case only resets the number of * used cache ports. * @todo: Move the number of used ports up to the LSQ level so it can @@ -278,20 +285,20 @@ class LSQUnit { /** Whether or not the store is completed. */ bool completed; }; - +/* enum Status { Running, Idle, DcacheMissStall, DcacheMissSwitch }; - +*/ private: /** The LSQUnit thread id. */ unsigned lsqID; /** The status of the LSQ unit. */ - Status _status; +// Status _status; /** The store queue. 
*/ std::vector<SQEntry> storeQueue; @@ -335,6 +342,8 @@ class LSQUnit { /** The number of used cache ports in this cycle. */ int usedPorts; + bool switchedOut; + //list<InstSeqNum> mshrSeqNums; //Stats::Scalar<> dcacheStallCycles; @@ -373,7 +382,25 @@ class LSQUnit { // Will also need how many read/write ports the Dcache has. Or keep track // of that in stage that is one level up, and only call executeLoad/Store // the appropriate number of times. +/* + // total number of loads forwaded from LSQ stores + Stats::Vector<> lsq_forw_loads; + + // total number of loads ignored due to invalid addresses + Stats::Vector<> inv_addr_loads; + + // total number of software prefetches ignored due to invalid addresses + Stats::Vector<> inv_addr_swpfs; + + // total non-speculative bogus addresses seen (debug var) + Counter sim_invalid_addrs; + Stats::Vector<> fu_busy; //cumulative fu busy + // ready loads blocked due to memory disambiguation + Stats::Vector<> lsq_blocked_loads; + + Stats::Scalar<> lsqInversion; +*/ public: /** Executes the load at the given index. 
*/ template <class T> @@ -590,7 +617,12 @@ LSQUnit<Impl>::read(MemReqPtr &req, T &data, int load_idx) } DPRINTF(LSQUnit, "Doing timing access for inst PC %#x\n", loadQueue[load_idx]->readPC()); - +/* + Addr debug_addr = ULL(0xfffffc0000be81a8); + if (req->vaddr == debug_addr) { + debug_break(); + } +*/ assert(!req->completionEvent); req->completionEvent = new typename IEW::LdWritebackEvent(loadQueue[load_idx], iewStage); @@ -608,7 +640,7 @@ LSQUnit<Impl>::read(MemReqPtr &req, T &data, int load_idx) lastDcacheStall = curTick; - _status = DcacheMissStall; +// _status = DcacheMissStall; } else { DPRINTF(Activity, "Activity: ld accessing mem hit [sn:%lli]\n", @@ -694,7 +726,12 @@ LSQUnit<Impl>::write(MemReqPtr &req, T &data, int store_idx) storeQueue[store_idx].req = req; storeQueue[store_idx].size = sizeof(T); storeQueue[store_idx].data = data; - +/* + Addr debug_addr = ULL(0xfffffc0000be81a8); + if (req->vaddr == debug_addr) { + debug_break(); + } +*/ // This function only writes the data to the store queue, so no fault // can happen here. 
return NoFault; diff --git a/cpu/o3/lsq_unit_impl.hh b/cpu/o3/lsq_unit_impl.hh index d9a118b0e..c5ce34c70 100644 --- a/cpu/o3/lsq_unit_impl.hh +++ b/cpu/o3/lsq_unit_impl.hh @@ -50,6 +50,9 @@ LSQUnit<Impl>::StoreCompletionEvent::process() //lsqPtr->removeMSHR(lsqPtr->storeQueue[storeIdx].inst->seqNum); + if (lsqPtr->isSwitchedOut()) + return; + lsqPtr->cpu->wakeCPU(); if (wbEvent) wbEvent->process(); @@ -78,6 +81,8 @@ LSQUnit<Impl>::init(Params *params, unsigned maxLQEntries, { DPRINTF(LSQUnit, "Creating LSQUnit%i object.\n",id); + switchedOut = false; + lsqID = id; LQEntries = maxLQEntries; @@ -140,6 +145,89 @@ LSQUnit<Impl>::setPageTable(PageTable *pt_ptr) template<class Impl> void +LSQUnit<Impl>::switchOut() +{ + switchedOut = true; + for (int i = 0; i < loadQueue.size(); ++i) + loadQueue[i] = NULL; + + while (storesToWB > 0 && + storeWBIdx != storeTail && + storeQueue[storeWBIdx].inst && + storeQueue[storeWBIdx].canWB) { + + if (storeQueue[storeWBIdx].size == 0 || + storeQueue[storeWBIdx].inst->isDataPrefetch() || + storeQueue[storeWBIdx].committed || + storeQueue[storeWBIdx].req->flags & LOCKED) { + incrStIdx(storeWBIdx); + + continue; + } + + assert(storeQueue[storeWBIdx].req); + assert(!storeQueue[storeWBIdx].committed); + + MemReqPtr req = storeQueue[storeWBIdx].req; + storeQueue[storeWBIdx].committed = true; + + req->cmd = Write; + req->completionEvent = NULL; + req->time = curTick; + assert(!req->data); + req->data = new uint8_t[64]; + memcpy(req->data, (uint8_t *)&storeQueue[storeWBIdx].data, req->size); + + DPRINTF(LSQUnit, "D-Cache: Writing back store idx:%i PC:%#x " + "to Addr:%#x, data:%#x [sn:%lli]\n", + storeWBIdx,storeQueue[storeWBIdx].inst->readPC(), + req->paddr, *(req->data), + storeQueue[storeWBIdx].inst->seqNum); + + switch(storeQueue[storeWBIdx].size) { + case 1: + cpu->write(req, (uint8_t &)storeQueue[storeWBIdx].data); + break; + case 2: + cpu->write(req, (uint16_t &)storeQueue[storeWBIdx].data); + break; + case 4: + cpu->write(req, 
(uint32_t &)storeQueue[storeWBIdx].data); + break; + case 8: + cpu->write(req, (uint64_t &)storeQueue[storeWBIdx].data); + break; + default: + panic("Unexpected store size!\n"); + } + incrStIdx(storeWBIdx); + } +} + +template<class Impl> +void +LSQUnit<Impl>::takeOverFrom() +{ + switchedOut = false; + loads = stores = storesToWB = 0; + + loadHead = loadTail = 0; + + storeHead = storeWBIdx = storeTail = 0; + + usedPorts = 0; + + loadFaultInst = storeFaultInst = memDepViolator = NULL; + + blockedLoadSeqNum = 0; + + stalled = false; + isLoadBlocked = false; + loadBlockedHandled = false; +} + +template<class Impl> +void LSQUnit<Impl>::resizeLQ(unsigned size) { assert( size >= LQEntries); @@ -647,7 +735,7 @@ LSQUnit<Impl>::writebackStores() lastDcacheStall = curTick; - _status = DcacheMissStall; +// _status = DcacheMissStall; //mshrSeqNums.push_back(storeQueue[storeWBIdx].inst->seqNum); diff --git a/cpu/o3/mem_dep_unit.hh b/cpu/o3/mem_dep_unit.hh index 32ce9f768..141e0fdc4 100644 --- a/cpu/o3/mem_dep_unit.hh +++ b/cpu/o3/mem_dep_unit.hh @@ -84,6 +84,10 @@ class MemDepUnit { /** Registers statistics. */ void regStats(); + void switchOut(); + + void takeOverFrom(); + /** Sets the pointer to the IQ. 
*/ void setIQ(InstructionQueue<Impl> *iq_ptr); diff --git a/cpu/o3/mem_dep_unit_impl.hh b/cpu/o3/mem_dep_unit_impl.hh index 771a0505e..05a33685d 100644 --- a/cpu/o3/mem_dep_unit_impl.hh +++ b/cpu/o3/mem_dep_unit_impl.hh @@ -103,6 +103,26 @@ MemDepUnit<MemDepPred, Impl>::regStats() template <class MemDepPred, class Impl> void +MemDepUnit<MemDepPred, Impl>::switchOut() +{ + for (int i = 0; i < Impl::MaxThreads; ++i) { + instList[i].clear(); + } + instsToReplay.clear(); + memDepHash.clear(); +} + +template <class MemDepPred, class Impl> +void +MemDepUnit<MemDepPred, Impl>::takeOverFrom() +{ + loadBarrier = storeBarrier = false; + loadBarrierSN = storeBarrierSN = 0; + depPred.clear(); +} + +template <class MemDepPred, class Impl> +void MemDepUnit<MemDepPred, Impl>::setIQ(InstructionQueue<Impl> *iq_ptr) { iqPtr = iq_ptr; diff --git a/cpu/o3/ras.cc b/cpu/o3/ras.cc index 5e7ef38ae..0b3ea4918 100644 --- a/cpu/o3/ras.cc +++ b/cpu/o3/ras.cc @@ -42,6 +42,15 @@ ReturnAddrStack::init(unsigned _numEntries) } void +ReturnAddrStack::reset() +{ + usedEntries = 0; + tos = 0; + for (int i = 0; i < numEntries; ++i) + addrStack[i] = 0; +} + +void ReturnAddrStack::push(const Addr &return_addr) { incrTos(); diff --git a/cpu/o3/ras.hh b/cpu/o3/ras.hh index 5aa4fc05f..27e7c2df4 100644 --- a/cpu/o3/ras.hh +++ b/cpu/o3/ras.hh @@ -47,6 +47,8 @@ class ReturnAddrStack */ void init(unsigned numEntries); + void reset(); + /** Returns the top address on the RAS. */ Addr top() { return addrStack[tos]; } diff --git a/cpu/o3/rename.hh b/cpu/o3/rename.hh index c6f8f97aa..4c5c46356 100644 --- a/cpu/o3/rename.hh +++ b/cpu/o3/rename.hh @@ -153,6 +153,10 @@ class DefaultRename /** Sets pointer to the scoreboard. */ void setScoreboard(Scoreboard *_scoreboard); + void switchOut(); + + void takeOverFrom(); + /** Squashes all instructions in a thread. 
*/ void squash(unsigned tid); @@ -448,6 +452,7 @@ class DefaultRename Stats::Scalar<> renameUndoneMaps; Stats::Scalar<> renamedSerializing; Stats::Scalar<> renamedTempSerializing; + Stats::Scalar<> renameSkidInsts; }; #endif // __CPU_O3_RENAME_HH__ diff --git a/cpu/o3/rename_impl.hh b/cpu/o3/rename_impl.hh index e29211921..d41058deb 100644 --- a/cpu/o3/rename_impl.hh +++ b/cpu/o3/rename_impl.hh @@ -151,6 +151,11 @@ DefaultRename<Impl>::regStats() .desc("count of temporary serializing insts renamed") .flags(Stats::total) ; + renameSkidInsts + .name(name() + ".RENAME:skidInsts") + .desc("count of insts added to the skid buffer") + .flags(Stats::total) + ; } template <class Impl> @@ -213,8 +218,8 @@ DefaultRename<Impl>::initStage() // Clear these pointers so they are not accidentally used in // non-initialization code. - iew_ptr = NULL; - commit_ptr = NULL; +// iew_ptr = NULL; +// commit_ptr = NULL; } template<class Impl> @@ -255,6 +260,55 @@ DefaultRename<Impl>::setScoreboard(Scoreboard *_scoreboard) template <class Impl> void +DefaultRename<Impl>::switchOut() +{ + for (int i = 0; i < numThreads; i++) { + typename list<RenameHistory>::iterator hb_it = historyBuffer[i].begin(); + + while (!historyBuffer[i].empty()) { + assert(hb_it != historyBuffer[i].end()); + + DPRINTF(Rename, "[tid:%u]: Removing history entry with sequence " + "number %i.\n", i, (*hb_it).instSeqNum); + + // Tell the rename map to set the architected register to the + // previous physical register that it was renamed to. + renameMap[i]->setEntry(hb_it->archReg, hb_it->prevPhysReg); + + // Put the renamed physical register back on the free list. 
+ freeList->addReg(hb_it->newPhysReg); + + historyBuffer[i].erase(hb_it++); + } + insts[i].clear(); + skidBuffer[i].clear(); + } +} + +template <class Impl> +void +DefaultRename<Impl>::takeOverFrom() +{ + _status = Inactive; + initStage(); + + for (int i=0; i< numThreads; i++) { + renameStatus[i] = Idle; + + stalls[i].iew = false; + stalls[i].commit = false; + serializeInst[i] = NULL; + + instsInProgress[i] = 0; + + emptyROB[i] = true; + + serializeOnNextInst[i] = false; + } +} + +template <class Impl> +void DefaultRename<Impl>::squash(unsigned tid) { DPRINTF(Rename, "[tid:%u]: Squashing instructions.\n",tid); @@ -393,7 +447,7 @@ DefaultRename<Impl>::rename(bool &status_change, unsigned tid) } else if (renameStatus[tid] == Unblocking) { renameInsts(tid); - ++renameUnblockCycles; +// ++renameUnblockCycles; if (validInsts()) { // Add the current inputs to the skid buffer so they can be @@ -564,6 +618,8 @@ DefaultRename<Impl>::renameInsts(unsigned tid) } else if (inst->isSerializeAfter() && !inst->isSerializeHandled()) { DPRINTF(Rename, "Serialize after instruction encountered.\n"); + renamedSerializing++; + inst->setSerializeHandled(); serializeAfter(insts_to_rename, tid); @@ -594,13 +650,12 @@ DefaultRename<Impl>::renameInsts(unsigned tid) // Increment which instruction we're on. ++toIEWIndex; - ++renameRenamedInsts; - // Decrement how many instructions are available. --insts_available; } instsInProgress[tid] += renamed_insts; + renameRenamedInsts += renamed_insts; // If we wrote to the time buffer, record this. 
if (toIEWIndex) { @@ -635,6 +690,8 @@ DefaultRename<Impl>::skidInsert(unsigned tid) DPRINTF(Rename, "[tid:%u]: Inserting [sn:%lli] PC:%#x into Rename " "skidBuffer\n", tid, inst->seqNum, inst->readPC()); + ++renameSkidInsts; + skidBuffer[tid].push_back(inst); } diff --git a/cpu/o3/rob.hh b/cpu/o3/rob.hh index 48199915f..0748850ea 100644 --- a/cpu/o3/rob.hh +++ b/cpu/o3/rob.hh @@ -97,6 +97,10 @@ class ROB */ void setActiveThreads(std::list<unsigned>* at_ptr); + void switchOut(); + + void takeOverFrom(); + /** Function to insert an instruction into the ROB. Note that whatever * calls this function must ensure that there is enough space within the * ROB for the new instruction. diff --git a/cpu/o3/rob_impl.hh b/cpu/o3/rob_impl.hh index 96d907cda..02a4bfbee 100644 --- a/cpu/o3/rob_impl.hh +++ b/cpu/o3/rob_impl.hh @@ -121,6 +121,31 @@ ROB<Impl>::setActiveThreads(list<unsigned> *at_ptr) activeThreads = at_ptr; } +template <class Impl> +void +ROB<Impl>::switchOut() +{ + for (int tid = 0; tid < numThreads; tid++) { + instList[tid].clear(); + } +} + +template <class Impl> +void +ROB<Impl>::takeOverFrom() +{ + for (int tid=0; tid < numThreads; tid++) { + doneSquashing[tid] = true; + threadEntries[tid] = 0; + squashIt[tid] = instList[tid].end(); + } + numInstsInROB = 0; + + // Initialize the "universal" ROB head & tail point to invalid + // pointers + head = instList[0].end(); + tail = instList[0].end(); +} template <class Impl> void diff --git a/cpu/o3/sat_counter.cc b/cpu/o3/sat_counter.cc index a6e131483..b481b4ad2 100644 --- a/cpu/o3/sat_counter.cc +++ b/cpu/o3/sat_counter.cc @@ -30,17 +30,17 @@ #include "cpu/o3/sat_counter.hh" SatCounter::SatCounter() - : maxVal(0), counter(0) + : initialVal(0), maxVal(0), counter(0) { } SatCounter::SatCounter(unsigned bits) - : maxVal((1 << bits) - 1), counter(0) + : initialVal(0), maxVal((1 << bits) - 1), counter(0) { } -SatCounter::SatCounter(unsigned bits, unsigned initial_val) - : maxVal((1 << bits) - 1), counter(initial_val)
+SatCounter::SatCounter(unsigned bits, uint8_t initial_val) + : initialVal(initial_val), maxVal((1 << bits) - 1), counter(initial_val) { // Check to make sure initial value doesn't exceed the max counter value. if (initial_val > maxVal) { @@ -53,19 +53,3 @@ SatCounter::setBits(unsigned bits) { maxVal = (1 << bits) - 1; } - -void -SatCounter::increment() -{ - if (counter < maxVal) { - ++counter; - } -} - -void -SatCounter::decrement() -{ - if (counter > 0) { - --counter; - } -} diff --git a/cpu/o3/sat_counter.hh b/cpu/o3/sat_counter.hh index 952f1f86d..1d20a8a8f 100644 --- a/cpu/o3/sat_counter.hh +++ b/cpu/o3/sat_counter.hh @@ -57,22 +57,34 @@ class SatCounter * @param bits How many bits the counter will have. * @param initial_val Starting value for each counter. */ - SatCounter(unsigned bits, unsigned initial_val); + SatCounter(unsigned bits, uint8_t initial_val); /** * Sets the number of bits. */ void setBits(unsigned bits); + void reset() { counter = initialVal; } + /** * Increments the counter's current value. */ - void increment(); + void increment() + { + if (counter < maxVal) { + ++counter; + } + } /** * Decrements the counter's current value. */ - void decrement(); + void decrement() + { + if (counter > 0) { + --counter; + } + } /** * Read the counter's value. @@ -81,6 +93,7 @@ class SatCounter { return counter; } private: + uint8_t initialVal; uint8_t maxVal; uint8_t counter; }; diff --git a/cpu/o3/thread_state.hh b/cpu/o3/thread_state.hh index 846f44176..17719bdeb 100644 --- a/cpu/o3/thread_state.hh +++ b/cpu/o3/thread_state.hh @@ -60,7 +60,7 @@ struct O3ThreadState : public ThreadState { { } #else O3ThreadState(FullCPU *_cpu, int _thread_num, Process *_process, int _asid) - : ThreadState(-1, _thread_num, NULL, _process, _asid), + : ThreadState(-1, _thread_num, _process->getMemory(), _process, _asid), cpu(_cpu), inSyscall(0), trapPending(0) { } |