diff options
Diffstat (limited to 'src/cpu')
40 files changed, 897 insertions, 306 deletions
diff --git a/src/cpu/base.cc b/src/cpu/base.cc index ce440aeff..f00dad7d6 100644 --- a/src/cpu/base.cc +++ b/src/cpu/base.cc @@ -48,6 +48,9 @@ #include "base/trace.hh" +// Hack +#include "sim/stat_control.hh" + using namespace std; vector<BaseCPU *> BaseCPU::cpuList; @@ -57,6 +60,30 @@ vector<BaseCPU *> BaseCPU::cpuList; // been initialized int maxThreadsPerCPU = 1; +void +CPUProgressEvent::process() +{ + Counter temp = cpu->totalInstructions(); +#ifndef NDEBUG + double ipc = double(temp - lastNumInst) / (interval / cpu->cycles(1)); + + DPRINTFN("%s progress event, instructions committed: %lli, IPC: %0.8d\n", + cpu->name(), temp - lastNumInst, ipc); + ipc = 0.0; +#else + cprintf("%lli: %s progress event, instructions committed: %lli\n", + curTick, cpu->name(), temp - lastNumInst); +#endif + lastNumInst = temp; + schedule(curTick + interval); +} + +const char * +CPUProgressEvent::description() +{ + return "CPU Progress event"; +} + #if FULL_SYSTEM BaseCPU::BaseCPU(Params *p) : MemObject(p->name), clock(p->clock), checkInterrupts(true), @@ -67,6 +94,7 @@ BaseCPU::BaseCPU(Params *p) number_of_threads(p->numberOfThreads), system(p->system) #endif { +// currentTick = curTick; DPRINTF(FullCPU, "BaseCPU: Creating object, mem address %#x.\n", this); // add self to global list of CPUs @@ -128,6 +156,12 @@ BaseCPU::BaseCPU(Params *p) p->max_loads_all_threads, *counter); } + if (p->stats_reset_inst != 0) { + Stats::SetupEvent(Stats::Reset, p->stats_reset_inst, 0, comInstEventQueue[0]); + cprintf("Stats reset event scheduled for %lli insts\n", + p->stats_reset_inst); + } + #if FULL_SYSTEM memset(interrupts, 0, sizeof(interrupts)); intstatus = 0; @@ -153,7 +187,6 @@ BaseCPU::BaseCPU(Params *p) if (params->profile) profileEvent = new ProfileEvent(this, params->profile); #endif - } BaseCPU::Params::Params() @@ -188,6 +221,11 @@ BaseCPU::startup() if (!params->deferRegistration && profileEvent) profileEvent->schedule(curTick); #endif + + if (params->progress_interval) { + new CPUProgressEvent(&mainEventQueue, params->progress_interval, + this); + } } @@ -238,7 +276,11 @@ BaseCPU::registerThreadContexts() void BaseCPU::switchOut() { - panic("This CPU doesn't support sampling!"); +// panic("This CPU doesn't support sampling!"); +#if FULL_SYSTEM + if (profileEvent && profileEvent->scheduled()) + profileEvent->deschedule(); +#endif } void @@ -261,18 +303,22 @@ BaseCPU::takeOverFrom(BaseCPU *oldCPU) assert(newTC->getProcessPtr() == oldTC->getProcessPtr()); newTC->getProcessPtr()->replaceThreadContext(newTC, newTC->readCpuId()); #endif + +// TheISA::compareXCs(oldXC, newXC); } #if FULL_SYSTEM for (int i = 0; i < TheISA::NumInterruptLevels; ++i) interrupts[i] = oldCPU->interrupts[i]; intstatus = oldCPU->intstatus; + checkInterrupts = oldCPU->checkInterrupts; for (int i = 0; i < threadContexts.size(); ++i) threadContexts[i]->profileClear(); - if (profileEvent) - profileEvent->schedule(curTick); + // The Sampler must take care of this! +// if (profileEvent) +// profileEvent->schedule(curTick); #endif } diff --git a/src/cpu/base.hh b/src/cpu/base.hh index 2be6e4e81..2a3fd9b56 100644 --- a/src/cpu/base.hh +++ b/src/cpu/base.hh @@ -46,6 +46,23 @@ class ThreadContext; class System; class Port; +class CPUProgressEvent : public Event +{ + protected: + Tick interval; + Counter lastNumInst; + BaseCPU *cpu; + + public: + CPUProgressEvent(EventQueue *q, Tick ival, BaseCPU *_cpu) + : Event(q, Event::Stat_Event_Pri), interval(ival), lastNumInst(0), cpu(_cpu) + { schedule(curTick + interval); } + + void process(); + + virtual const char *description(); +}; + class BaseCPU : public MemObject { protected: @@ -53,6 +70,7 @@ class BaseCPU : public MemObject Tick clock; public: +// Tick currentTick; inline Tick frequency() const { return Clock::Frequency / clock; } inline Tick cycles(int numCycles) const { return clock * numCycles; } inline Tick curCycle() const { return curTick / clock; } @@ -120,6 +138,7 @@ class BaseCPU : public MemObject Counter max_insts_all_threads; Counter max_loads_any_thread; Counter max_loads_all_threads; + Counter stats_reset_inst; Tick clock; bool functionTrace; Tick functionTraceStart; @@ -128,6 +147,7 @@ class BaseCPU : public MemObject int cpu_id; Tick profile; #endif + Tick progress_interval; BaseCPU *checker; Params(); diff --git a/src/cpu/base_dyn_inst.hh b/src/cpu/base_dyn_inst.hh index 3158aa9cf..926bfcbb2 100644 --- a/src/cpu/base_dyn_inst.hh +++ b/src/cpu/base_dyn_inst.hh @@ -197,7 +197,7 @@ class BaseDynInst : public FastAlloc, public RefCounted union Result { uint64_t integer; - float fp; +// float fp; double dbl; }; @@ -394,7 +394,7 @@ class BaseDynInst : public FastAlloc, public RefCounted uint64_t readIntResult() { return instResult.integer; } /** Returns the result of a floating point instruction. */ - float readFloatResult() { return instResult.fp; } + float readFloatResult() { return (float)instResult.dbl; } /** Returns the result of a floating point (double) instruction. */ double readDoubleResult() { return instResult.dbl; } @@ -419,7 +419,8 @@ class BaseDynInst : public FastAlloc, public RefCounted /** Records an fp register being set to a value. */ void setFloatReg(const StaticInst *si, int idx, FloatReg val) { - instResult.fp = val; +// instResult.fp = val; + instResult.dbl = (double)val; } /** Records an fp register being set to an integer value. */ diff --git a/src/cpu/checker/cpu.hh b/src/cpu/checker/cpu.hh index 6d6ae1e0a..737b4b5d4 100644 --- a/src/cpu/checker/cpu.hh +++ b/src/cpu/checker/cpu.hh @@ -102,6 +102,7 @@ class CheckerCPU : public BaseCPU Process *process; #endif bool exitOnError; + bool updateOnError; bool warnOnlyOnLoadError; }; @@ -148,7 +149,7 @@ class CheckerCPU : public BaseCPU union Result { uint64_t integer; - float fp; +// float fp; double dbl; }; @@ -269,7 +270,7 @@ class CheckerCPU : public BaseCPU { int reg_idx = si->destRegIdx(idx) - TheISA::FP_Base_DepTag; thread->setFloatReg(reg_idx, val); - result.fp = val; + result.dbl = (double)val; } void setFloatRegBits(const StaticInst *si, int idx, FloatRegBits val, @@ -318,7 +319,7 @@ class CheckerCPU : public BaseCPU return thread->setMiscRegWithEffect(misc_reg, val); } - void recordPCChange(uint64_t val) { changedPC = true; } + void recordPCChange(uint64_t val) { changedPC = true; newPC = val; } void recordNextPCChange(uint64_t val) { changedNextPC = true; } bool translateInstReq(Request *req); @@ -360,6 +361,7 @@ class CheckerCPU : public BaseCPU uint64_t newPC; bool changedNextPC; bool exitOnError; + bool updateOnError; bool warnOnlyOnLoadError; InstSeqNum youngestSN; @@ -376,7 +378,7 @@ class Checker : public CheckerCPU { public: Checker(Params *p) - : CheckerCPU(p) + : CheckerCPU(p), updateThisCycle(false), unverifiedInst(NULL) { } void switchOut(); @@ -393,12 +395,19 @@ class Checker : public CheckerCPU private: void handleError(DynInstPtr &inst) { - if (exitOnError) + if (exitOnError) { dumpAndExit(inst); + } else if (updateOnError) { + updateThisCycle = true; + } } void dumpAndExit(DynInstPtr &inst); + bool updateThisCycle; + + DynInstPtr unverifiedInst; + std::list<DynInstPtr> instList; typedef typename std::list<DynInstPtr>::iterator InstListIt; void dumpInsts(); diff --git a/src/cpu/checker/cpu_impl.hh b/src/cpu/checker/cpu_impl.hh index 81f97726c..3bb81c4b9 100644 --- a/src/cpu/checker/cpu_impl.hh +++ b/src/cpu/checker/cpu_impl.hh @@ -94,6 +94,8 @@ Checker<DynInstPtr>::verify(DynInstPtr &completed_inst) } } + unverifiedInst = inst; + // Try to check all instructions that are completed, ending if we // run out of instructions to check or if an instruction is not // yet completed. @@ -171,7 +173,7 @@ Checker<DynInstPtr>::verify(DynInstPtr &completed_inst) thread->setPC(thread->readNextPC()); thread->setNextPC(thread->readNextPC() + sizeof(MachInst)); - return; + break; } else { // The instruction is carrying an ITB fault. Handle // the fault and see if our results match the CPU on @@ -220,7 +222,8 @@ Checker<DynInstPtr>::verify(DynInstPtr &completed_inst) thread->funcExeInst++; - fault = curStaticInst->execute(this, NULL); + if (!inst->isUnverifiable()) + fault = curStaticInst->execute(this, NULL); // Checks to make sure instrution results are correct. validateExecution(inst); @@ -289,6 +292,7 @@ Checker<DynInstPtr>::verify(DynInstPtr &completed_inst) break; } } + unverifiedInst = NULL; } template <class DynInstPtr> @@ -395,6 +399,23 @@ template <class DynInstPtr> void Checker<DynInstPtr>::validateState() { + if (updateThisCycle) { + warn("%lli: Instruction PC %#x results didn't match up, copying all " + "registers from main CPU", curTick, unverifiedInst->readPC()); + // Heavy-weight copying of all registers + cpuXC->copyArchRegs(unverifiedInst->xcBase()); + // Also advance the PC. Hopefully no PC-based events happened. +#if THE_ISA != MIPS_ISA + // go to the next instruction + cpuXC->setPC(cpuXC->readNextPC()); + cpuXC->setNextPC(cpuXC->readNextPC() + sizeof(MachInst)); +#else + // go to the next instruction + cpuXC->setPC(cpuXC->readNextPC()); + cpuXC->setNextPC(cpuXC->readNextNPC()); + cpuXC->setNextNPC(cpuXC->readNextNPC() + sizeof(MachInst)); +#endif + updateThisCycle = false; } template <class DynInstPtr> diff --git a/src/cpu/o3/alpha/cpu_builder.cc b/src/cpu/o3/alpha/cpu_builder.cc index 5e767655d..fbf1f342c 100644 --- a/src/cpu/o3/alpha/cpu_builder.cc +++ b/src/cpu/o3/alpha/cpu_builder.cc @@ -56,6 +56,7 @@ SimObjectParam<System *> system; Param<int> cpu_id; SimObjectParam<AlphaITB *> itb; SimObjectParam<AlphaDTB *> dtb; +Param<Tick> profile; #else SimObjectVectorParam<Process *> workload; #endif // FULL_SYSTEM @@ -68,6 +69,8 @@ Param<Counter> max_insts_any_thread; Param<Counter> max_insts_all_threads; Param<Counter> max_loads_any_thread; Param<Counter> max_loads_all_threads; +Param<Counter> stats_reset_inst; +Param<Tick> progress_interval; Param<unsigned> cachePorts; @@ -162,6 +165,7 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivO3CPU) INIT_PARAM(cpu_id, "processor ID"), INIT_PARAM(itb, "Instruction translation buffer"), INIT_PARAM(dtb, "Data translation buffer"), + INIT_PARAM(profile, ""), #else INIT_PARAM(workload, "Processes to run"), #endif // FULL_SYSTEM @@ -184,6 +188,10 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivO3CPU) "Terminate when all threads have reached this load" "count", 0), + INIT_PARAM_DFLT(stats_reset_inst, + "blah", + 0), + INIT_PARAM_DFLT(progress_interval, "Progress interval", 0), INIT_PARAM_DFLT(cachePorts, "Cache Ports", 200), @@ -305,6 +313,7 @@ CREATE_SIM_OBJECT(DerivO3CPU) params->cpu_id = cpu_id; params->itb = itb; params->dtb = dtb; + params->profile = profile; #else params->workload = workload; #endif // FULL_SYSTEM @@ -317,6 +326,8 @@ CREATE_SIM_OBJECT(DerivO3CPU) params->max_insts_all_threads = max_insts_all_threads; params->max_loads_any_thread = max_loads_any_thread; params->max_loads_all_threads = max_loads_all_threads; + params->stats_reset_inst = stats_reset_inst; + params->progress_interval = progress_interval; // // Caches diff --git a/src/cpu/o3/checker_builder.cc b/src/cpu/o3/checker_builder.cc index 782d963b0..ad83ec57a 100644 --- a/src/cpu/o3/checker_builder.cc +++ b/src/cpu/o3/checker_builder.cc @@ -64,6 +64,8 @@ BEGIN_DECLARE_SIM_OBJECT_PARAMS(O3Checker) Param<Counter> max_insts_all_threads; Param<Counter> max_loads_any_thread; Param<Counter> max_loads_all_threads; + Param<Counter> stats_reset_inst; + Param<Tick> progress_interval; #if FULL_SYSTEM SimObjectParam<AlphaITB *> itb; @@ -78,6 +80,7 @@ BEGIN_DECLARE_SIM_OBJECT_PARAMS(O3Checker) Param<bool> defer_registration; Param<bool> exitOnError; + Param<bool> updateOnError; Param<bool> warnOnlyOnLoadError; Param<bool> function_trace; Param<Tick> function_trace_start; @@ -94,6 +97,9 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(O3Checker) "terminate when any thread reaches this load count"), INIT_PARAM(max_loads_all_threads, "terminate when all threads have reached this load count"), + INIT_PARAM(stats_reset_inst, + "blah"), + INIT_PARAM_DFLT(progress_interval, "CPU Progress Interval", 0), #if FULL_SYSTEM INIT_PARAM(itb, "Instruction TLB"), @@ -109,6 +115,7 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(O3Checker) INIT_PARAM(defer_registration, "defer system registration (for sampling)"), INIT_PARAM(exitOnError, "exit on error"), + INIT_PARAM(updateOnError, "Update the checker with the main CPU's state on error"), INIT_PARAM_DFLT(warnOnlyOnLoadError, "warn, but don't exit, if a load " "result errors", false), INIT_PARAM(function_trace, "Enable function trace"), @@ -126,7 +133,9 @@ CREATE_SIM_OBJECT(O3Checker) params->max_insts_all_threads = 0; params->max_loads_any_thread = 0; params->max_loads_all_threads = 0; + params->stats_reset_inst = 0; params->exitOnError = exitOnError; + params->updateOnError = updateOnError; params->warnOnlyOnLoadError = warnOnlyOnLoadError; params->deferRegistration = defer_registration; params->functionTrace = function_trace; @@ -139,6 +148,10 @@ CREATE_SIM_OBJECT(O3Checker) temp = max_insts_all_threads; temp = max_loads_any_thread; temp = max_loads_all_threads; + temp = stats_reset_inst; + Tick temp2 = progress_interval; + params->progress_interval = 0; + temp2++; #if FULL_SYSTEM params->itb = itb; diff --git a/src/cpu/o3/commit_impl.hh b/src/cpu/o3/commit_impl.hh index 34f487e2c..6ae01ae67 100644 --- a/src/cpu/o3/commit_impl.hh +++ b/src/cpu/o3/commit_impl.hh @@ -1083,12 +1083,26 @@ DefaultCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num) // Generate trap squash event. generateTrapEvent(tid); - +// warn("%lli fault (%d) handled @ PC %08p", curTick, inst_fault->name(), head_inst->readPC()); return false; } updateComInstStats(head_inst); +#if FULL_SYSTEM + if (thread[tid]->profile) { +// bool usermode = +// (cpu->readMiscReg(AlphaISA::IPR_DTB_CM, tid) & 0x18) != 0; +// thread[tid]->profilePC = usermode ? 1 : head_inst->readPC(); + thread[tid]->profilePC = head_inst->readPC(); + ProfileNode *node = thread[tid]->profile->consume(thread[tid]->getXCProxy(), + head_inst->staticInst); + + if (node) + thread[tid]->profileNode = node; + } +#endif + if (head_inst->traceData) { head_inst->traceData->setFetchSeq(head_inst->seqNum); head_inst->traceData->setCPSeq(thread[tid]->numInst); @@ -1102,6 +1116,9 @@ DefaultCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num) head_inst->renamedDestRegIdx(i)); } + if (head_inst->isCopy()) + panic("Should not commit any copy instructions!"); + // Finally clear the head ROB entry. rob->retireHead(tid); diff --git a/src/cpu/o3/cpu.cc b/src/cpu/o3/cpu.cc index 19ab7f4c5..4279df6f7 100644 --- a/src/cpu/o3/cpu.cc +++ b/src/cpu/o3/cpu.cc @@ -33,6 +33,7 @@ #include "config/use_checker.hh" #if FULL_SYSTEM +#include "cpu/quiesce_event.hh" #include "sim/system.hh" #else #include "sim/process.hh" @@ -793,6 +794,8 @@ template <class Impl> unsigned int FullO3CPU<Impl>::drain(Event *drain_event) { + DPRINTF(O3CPU, "Switching out\n"); + BaseCPU::switchOut(_sampler); drainCount = 0; fetch.drain(); decode.drain(); @@ -863,6 +866,7 @@ FullO3CPU<Impl>::switchOut() { fetch.switchOut(); rename.switchOut(); + iew.switchOut(); commit.switchOut(); instList.clear(); while (!removeList.empty()) { @@ -931,6 +935,45 @@ FullO3CPU<Impl>::takeOverFrom(BaseCPU *oldCPU) } template <class Impl> +void +FullO3CPU<Impl>::serialize(std::ostream &os) +{ + BaseCPU::serialize(os); + nameOut(os, csprintf("%s.tickEvent", name())); + tickEvent.serialize(os); + + // Use SimpleThread's ability to checkpoint to make it easier to + // write out the registers. Also make this static so it doesn't + // get instantiated multiple times (causes a panic in statistics). + static CPUExecContext temp; + + for (int i = 0; i < thread.size(); i++) { + nameOut(os, csprintf("%s.xc.%i", name(), i)); + temp.copyXC(thread[i]->getXCProxy()); + temp.serialize(os); + } +} + +template <class Impl> +void +FullO3CPU<Impl>::unserialize(Checkpoint *cp, const std::string §ion) +{ + BaseCPU::unserialize(cp, section); + tickEvent.unserialize(cp, csprintf("%s.tickEvent", section)); + + // Use SimpleThread's ability to checkpoint to make it easier to + // read in the registers. Also make this static so it doesn't + // get instantiated multiple times (causes a panic in statistics). + static CPUExecContext temp; + + for (int i = 0; i < thread.size(); i++) { + temp.copyXC(thread[i]->getXCProxy()); + temp.unserialize(cp, csprintf("%s.xc.%i", section, i)); + thread[i]->getXCProxy()->copyArchRegs(temp.getProxy()); + } +} + +template <class Impl> uint64_t FullO3CPU<Impl>::readIntReg(int reg_idx) { diff --git a/src/cpu/o3/fetch_impl.hh b/src/cpu/o3/fetch_impl.hh index 1e080181c..2d447bfe5 100644 --- a/src/cpu/o3/fetch_impl.hh +++ b/src/cpu/o3/fetch_impl.hh @@ -442,6 +442,7 @@ DefaultFetch<Impl>::takeOverFrom() wroteToTimeBuffer = false; _status = Inactive; switchedOut = false; + interruptPending = false; branchPred.takeOverFrom(); } @@ -563,7 +564,7 @@ DefaultFetch<Impl>::fetchCacheLine(Addr fetch_PC, Fault &ret_fault, unsigned tid unsigned flags = 0; #endif // FULL_SYSTEM - if (cacheBlocked || (interruptPending && flags == 0)) { + if (cacheBlocked || isSwitchedOut() || (interruptPending && flags == 0)) { // Hold off fetch from getting new instructions when: // Cache is blocked, or // while an interrupt is pending and we're not in PAL mode, or @@ -1152,8 +1153,8 @@ DefaultFetch<Impl>::fetch(bool &status_change) fetch_PC = next_PC; if (instruction->isQuiesce()) { - warn("cycle %lli: Quiesce instruction encountered, halting fetch!", - curTick); +// warn("%lli: Quiesce instruction encountered, halting fetch!", +// curTick); fetchStatus[tid] = QuiescePending; ++numInst; status_change = true; @@ -1268,7 +1269,7 @@ DefaultFetch<Impl>::fetch(bool &status_change) fetchStatus[tid] = TrapPending; status_change = true; - warn("cycle %lli: fault (%s) detected @ PC %08p", curTick, fault->name(), PC[tid]); +// warn("%lli fault (%d) detected @ PC %08p", curTick, fault, PC[tid]); #else // !FULL_SYSTEM warn("cycle %lli: fault (%s) detected @ PC %08p", curTick, fault->name(), PC[tid]); #endif // FULL_SYSTEM diff --git a/src/cpu/o3/iew.hh b/src/cpu/o3/iew.hh index 76fa008ee..a400c9fa8 100644 --- a/src/cpu/o3/iew.hh +++ b/src/cpu/o3/iew.hh @@ -216,6 +216,7 @@ class DefaultIEW if (++wbOutstanding == wbMax) ableToIssue = false; DPRINTF(IEW, "wbOutstanding: %i\n", wbOutstanding); + assert(wbOutstanding <= wbMax); #ifdef DEBUG wbList.insert(sn); #endif @@ -226,6 +227,7 @@ class DefaultIEW if (wbOutstanding-- == wbMax) ableToIssue = true; DPRINTF(IEW, "wbOutstanding: %i\n", wbOutstanding); + assert(wbOutstanding >= 0); #ifdef DEBUG assert(wbList.find(sn) != wbList.end()); wbList.erase(sn); @@ -450,7 +452,9 @@ class DefaultIEW unsigned wbCycle; /** Number of instructions in flight that will writeback. */ - unsigned wbOutstanding; + + /** Number of instructions in flight that will writeback. */ + int wbOutstanding; /** Writeback width. */ unsigned wbWidth; @@ -507,6 +511,8 @@ class DefaultIEW Stats::Scalar<> iewExecutedInsts; /** Stat for total number of executed load instructions. */ Stats::Vector<> iewExecLoadInsts; + /** Stat for total number of executed store instructions. */ +// Stats::Scalar<> iewExecStoreInsts; /** Stat for total number of squashed instructions skipped at execute. */ Stats::Scalar<> iewExecSquashedInsts; /** Number of executed software prefetches. */ diff --git a/src/cpu/o3/iew_impl.hh b/src/cpu/o3/iew_impl.hh index e9b24a6d4..c82f6dd21 100644 --- a/src/cpu/o3/iew_impl.hh +++ b/src/cpu/o3/iew_impl.hh @@ -162,17 +162,17 @@ DefaultIEW<Impl>::regStats() branchMispredicts = predictedTakenIncorrect + predictedNotTakenIncorrect; iewExecutedInsts - .name(name() + ".EXEC:insts") + .name(name() + ".iewExecutedInsts") .desc("Number of executed instructions"); iewExecLoadInsts .init(cpu->number_of_threads) - .name(name() + ".EXEC:loads") + .name(name() + ".iewExecLoadInsts") .desc("Number of load instructions executed") .flags(total); iewExecSquashedInsts - .name(name() + ".EXEC:squashedInsts") + .name(name() + ".iewExecSquashedInsts") .desc("Number of squashed instructions skipped in execute"); iewExecutedSwp @@ -372,6 +372,8 @@ DefaultIEW<Impl>::switchOut() { // Clear any state. switchedOut = true; + assert(insts[0].empty()); + assert(skidBuffer[0].empty()); instQueue.switchOut(); ldstQueue.switchOut(); @@ -410,7 +412,6 @@ DefaultIEW<Impl>::takeOverFrom() updateLSQNextCycle = false; - // @todo: Fix hardcoded number for (int i = 0; i < issueToExecQueue.getSize(); ++i) { issueToExecQueue.advance(); } @@ -611,9 +612,11 @@ DefaultIEW<Impl>::instToCommit(DynInstPtr &inst) wbNumInst = 0; } - assert((wbCycle * wbWidth + wbNumInst) < wbMax); + assert((wbCycle * wbWidth + wbNumInst) <= wbMax); } + DPRINTF(IEW, "Current wb cycle: %i, width: %i, numInst: %i\nwbActual:%i\n", + wbCycle, wbWidth, wbNumInst, wbCycle * wbWidth + wbNumInst); // Add finished instruction to queue to commit. (*iewQueue)[wbCycle].insts[wbNumInst] = inst; (*iewQueue)[wbCycle].size++; @@ -903,6 +906,22 @@ DefaultIEW<Impl>::emptyRenameInsts(unsigned tid) template <class Impl> void +DefaultIEW<Impl>::emptyRenameInsts(unsigned tid) +{ + while (!insts[tid].empty()) { + if (insts[tid].front()->isLoad() || + insts[tid].front()->isStore() ) { + toRename->iewInfo[tid].dispatchedToLSQ++; + } + + toRename->iewInfo[tid].dispatched++; + + insts[tid].pop(); + } +} + +template <class Impl> +void DefaultIEW<Impl>::wakeCPU() { cpu->wakeCPU(); @@ -1273,13 +1292,23 @@ DefaultIEW<Impl>::executeInsts() // event adds the instruction to the queue to commit fault = ldstQueue.executeLoad(inst); } else if (inst->isStore()) { - ldstQueue.executeStore(inst); + fault = ldstQueue.executeStore(inst); // If the store had a fault then it may not have a mem req - if (inst->req && !(inst->req->getFlags() & LOCKED)) { + if (!inst->isStoreConditional() && fault == NoFault) { + inst->setExecuted(); + + instToCommit(inst); + } else if (fault != NoFault) { + // If the instruction faulted, then we need to send it along to commit + // without the instruction completing. + + // Send this instruction to commit, also make sure iew stage + // realizes there is activity. inst->setExecuted(); instToCommit(inst); + activityThisCycle(); } // Store conditionals will mark themselves as @@ -1404,7 +1433,7 @@ DefaultIEW<Impl>::writebackInsts() // E.g. Uncached loads have not actually executed when they // are first sent to commit. Instead commit must tell the LSQ // when it's ready to execute the uncached load. - if (!inst->isSquashed() && inst->isExecuted()) { + if (!inst->isSquashed() && inst->isExecuted() && inst->getFault() == NoFault) { int dependents = instQueue.wakeDependents(inst); for (int i = 0; i < inst->numDestRegs(); i++) { diff --git a/src/cpu/o3/inst_queue.hh b/src/cpu/o3/inst_queue.hh index d745faf7b..3dd4dc658 100644 --- a/src/cpu/o3/inst_queue.hh +++ b/src/cpu/o3/inst_queue.hh @@ -479,13 +479,13 @@ class InstructionQueue /** Distribution of number of instructions in the queue. * @todo: Need to create struct to track the entry time for each * instruction. */ - Stats::VectorDistribution<> queueResDist; +// Stats::VectorDistribution<> queueResDist; /** Distribution of the number of instructions issued. */ Stats::Distribution<> numIssuedDist; /** Distribution of the cycles it takes to issue an instruction. * @todo: Need to create struct to track the ready time for each * instruction. */ - Stats::VectorDistribution<> issueDelayDist; +// Stats::VectorDistribution<> issueDelayDist; /** Number of times an instruction could not be issued because a * FU was busy. diff --git a/src/cpu/o3/inst_queue_impl.hh b/src/cpu/o3/inst_queue_impl.hh index 47634f645..6edb528a9 100644 --- a/src/cpu/o3/inst_queue_impl.hh +++ b/src/cpu/o3/inst_queue_impl.hh @@ -230,7 +230,7 @@ InstructionQueue<Impl>::regStats() .name(name() + ".iqSquashedNonSpecRemoved") .desc("Number of squashed non-spec instructions that were removed") .prereq(iqSquashedNonSpecRemoved); - +/* queueResDist .init(Num_OpClasses, 0, 99, 2) .name(name() + ".IQ:residence:") @@ -240,6 +240,7 @@ InstructionQueue<Impl>::regStats() for (int i = 0; i < Num_OpClasses; ++i) { queueResDist.subname(i, opClassStrings[i]); } +*/ numIssuedDist .init(0,totalWidth,1) .name(name() + ".ISSUE:issued_per_cycle") @@ -268,7 +269,7 @@ InstructionQueue<Impl>::regStats() // // How long did instructions for a particular FU type wait prior to issue // - +/* issueDelayDist .init(Num_OpClasses,0,99,2) .name(name() + ".ISSUE:") @@ -281,7 +282,7 @@ InstructionQueue<Impl>::regStats() subname << opClassStrings[i] << "_delay"; issueDelayDist.subname(i, subname.str()); } - +*/ issueRate .name(name() + ".ISSUE:rate") .desc("Inst issue rate") @@ -385,8 +386,16 @@ template <class Impl> void InstructionQueue<Impl>::switchOut() { +/* + if (!instList[0].empty() || (numEntries != freeEntries) || + !readyInsts[0].empty() || !nonSpecInsts.empty() || !listOrder.empty()) { + dumpInsts(); +// assert(0); + } +*/ resetState(); dependGraph.reset(); + instsToExecute.clear(); switchedOut = true; for (int i = 0; i < numThreads; ++i) { memDepUnit[i].switchOut(); @@ -642,9 +651,12 @@ template <class Impl> void InstructionQueue<Impl>::processFUCompletion(DynInstPtr &inst, int fu_idx) { + DPRINTF(IQ, "Processing FU completion [sn:%lli]\n", inst->seqNum); // The CPU could have been sleeping until this op completed (*extremely* // long latency op). Wake it if it was. This may be overkill. if (isSwitchedOut()) { + DPRINTF(IQ, "FU completion not processed, IQ is switched out [sn:%lli]\n", + inst->seqNum); return; } @@ -1036,6 +1048,10 @@ InstructionQueue<Impl>::doSquash(unsigned tid) (squashed_inst->isMemRef() && !squashed_inst->memOpDone)) { + DPRINTF(IQ, "[tid:%i]: Instruction [sn:%lli] PC %#x " + "squashed.\n", + tid, squashed_inst->seqNum, squashed_inst->readPC()); + // Remove the instruction from the dependency list. if (!squashed_inst->isNonSpeculative() && !squashed_inst->isStoreConditional() && @@ -1066,7 +1082,7 @@ InstructionQueue<Impl>::doSquash(unsigned tid) ++iqSquashedOperandsExamined; } - } else { + } else if (!squashed_inst->isStoreConditional() || !squashed_inst->isCompleted()) { NonSpecMapIt ns_inst_it = nonSpecInsts.find(squashed_inst->seqNum); assert(ns_inst_it != nonSpecInsts.end()); @@ -1093,10 +1109,6 @@ InstructionQueue<Impl>::doSquash(unsigned tid) count[squashed_inst->threadNumber]--; ++freeEntries; - - DPRINTF(IQ, "[tid:%i]: Instruction [sn:%lli] PC %#x " - "squashed.\n", - tid, squashed_inst->seqNum, squashed_inst->readPC()); } instList[tid].erase(squash_it--); diff --git a/src/cpu/o3/lsq_impl.hh b/src/cpu/o3/lsq_impl.hh index 2bbab71f0..a1ac5adb8 100644 --- a/src/cpu/o3/lsq_impl.hh +++ b/src/cpu/o3/lsq_impl.hh @@ -167,6 +167,16 @@ LSQ<Impl>::regStats() template<class Impl> void +LSQ<Impl>::regStats() +{ + //Initialize LSQs + for (int tid=0; tid < numThreads; tid++) { + thread[tid].regStats(); + } +} + +template<class Impl> +void LSQ<Impl>::setActiveThreads(std::list<unsigned> *at_ptr) { activeThreads = at_ptr; diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh index 1358a3699..8537e9dd7 100644 --- a/src/cpu/o3/lsq_unit.hh +++ b/src/cpu/o3/lsq_unit.hh @@ -407,20 +407,9 @@ class LSQUnit { // Will also need how many read/write ports the Dcache has. Or keep track // of that in stage that is one level up, and only call executeLoad/Store // the appropriate number of times. - /** Total number of loads forwaded from LSQ stores. */ Stats::Scalar<> lsqForwLoads; - /** Total number of loads ignored due to invalid addresses. */ - Stats::Scalar<> invAddrLoads; - - /** Total number of squashed loads. */ - Stats::Scalar<> lsqSquashedLoads; - - /** Total number of responses from the memory system that are - * ignored due to the instruction already being squashed. */ - Stats::Scalar<> lsqIgnoredResponses; - /** Total number of squashed stores. */ Stats::Scalar<> lsqSquashedStores; diff --git a/src/cpu/o3/lsq_unit_impl.hh b/src/cpu/o3/lsq_unit_impl.hh index fa716c712..2922b81bd 100644 --- a/src/cpu/o3/lsq_unit_impl.hh +++ b/src/cpu/o3/lsq_unit_impl.hh @@ -180,6 +180,10 @@ LSQUnit<Impl>::regStats() .name(name() + ".ignoredResponses") .desc("Number of memory responses ignored because the instruction is squashed"); + lsqMemOrderViolation + .name(name() + ".memOrderViolation") + .desc("Number of memory ordering violations"); + lsqSquashedStores .name(name() + ".squashedStores") .desc("Number of stores squashed"); @@ -220,8 +224,10 @@ void LSQUnit<Impl>::switchOut() { switchedOut = true; - for (int i = 0; i < loadQueue.size(); ++i) + for (int i = 0; i < loadQueue.size(); ++i) { + assert(!loadQueue[i]); loadQueue[i] = NULL; + } assert(storesToWB == 0); } @@ -408,6 +414,11 @@ LSQUnit<Impl>::executeLoad(DynInstPtr &inst) if (load_fault != NoFault) { // Send this instruction to commit, also make sure iew stage // realizes there is activity. + // Mark it as executed unless it is an uncached load that + // needs to hit the head of commit. + if (!(inst->req->flags & UNCACHEABLE) || inst->isAtCommit()) { + inst->setExecuted(); + } iewStage->instToCommit(inst); iewStage->activityThisCycle(); } @@ -467,6 +478,7 @@ LSQUnit<Impl>::executeStore(DynInstPtr &store_inst) // A load incorrectly passed this store. Squash and refetch. // For now return a fault to show that it was unsuccessful. memDepViolator = loadQueue[load_idx]; + ++lsqMemOrderViolation; return genMachineCheckFault(); } diff --git a/src/cpu/o3/mem_dep_unit_impl.hh b/src/cpu/o3/mem_dep_unit_impl.hh index 16f67a4e0..c649ca385 100644 --- a/src/cpu/o3/mem_dep_unit_impl.hh +++ b/src/cpu/o3/mem_dep_unit_impl.hh @@ -109,6 +109,9 @@ template <class MemDepPred, class Impl> void MemDepUnit<MemDepPred, Impl>::switchOut() { + assert(instList[0].empty()); + assert(instsToReplay.empty()); + assert(memDepHash.empty()); // Clear any state. for (int i = 0; i < Impl::MaxThreads; ++i) { instList[i].clear(); diff --git a/src/cpu/o3/rename.hh b/src/cpu/o3/rename.hh index ba26a01dd..177b9cb87 100644 --- a/src/cpu/o3/rename.hh +++ b/src/cpu/o3/rename.hh @@ -417,6 +417,8 @@ class DefaultRename /** The maximum skid buffer size. */ unsigned skidBufferMax; + PhysRegIndex maxPhysicalRegs; + /** Enum to record the source of a structure full stall. Can come from * either ROB, IQ, LSQ, and it is priortized in that order. */ diff --git a/src/cpu/o3/rename_impl.hh b/src/cpu/o3/rename_impl.hh index 782c0fe5f..248d7deb6 100644 --- a/src/cpu/o3/rename_impl.hh +++ b/src/cpu/o3/rename_impl.hh @@ -41,7 +41,8 @@ DefaultRename<Impl>::DefaultRename(Params *params) commitToRenameDelay(params->commitToRenameDelay), renameWidth(params->renameWidth), commitWidth(params->commitWidth), - numThreads(params->numberOfThreads) + numThreads(params->numberOfThreads), + maxPhysicalRegs(params->numPhysIntRegs + params->numPhysFloatRegs) { _status = Inactive; @@ -286,6 +287,11 @@ DefaultRename<Impl>::switchOut() // Put the renamed physical register back on the free list. freeList->addReg(hb_it->newPhysReg); + // Be sure to mark its register as ready if it's a misc register. + if (hb_it->newPhysReg >= maxPhysicalRegs) { + scoreboard->setReg(hb_it->newPhysReg); + } + historyBuffer[i].erase(hb_it++); } insts[i].clear(); @@ -889,6 +895,11 @@ DefaultRename<Impl>::doSquash(const InstSeqNum &squashed_seq_num, unsigned tid) // Put the renamed physical register back on the free list. freeList->addReg(hb_it->newPhysReg); + // Be sure to mark its register as ready if it's a misc register. + if (hb_it->newPhysReg >= maxPhysicalRegs) { + scoreboard->setReg(hb_it->newPhysReg); + } + historyBuffer[tid].erase(hb_it++); ++renameUndoneMaps; diff --git a/src/cpu/o3/thread_state.hh b/src/cpu/o3/thread_state.hh index b6f2e14c0..0247deb52 100644 --- a/src/cpu/o3/thread_state.hh +++ b/src/cpu/o3/thread_state.hh @@ -31,8 +31,11 @@ #ifndef __CPU_O3_THREAD_STATE_HH__ #define __CPU_O3_THREAD_STATE_HH__ +#include "base/callback.hh" +#include "base/output.hh" #include "cpu/thread_context.hh" #include "cpu/thread_state.hh" +#include "sim/sim_exit.hh" class Event; class Process; @@ -75,8 +78,22 @@ struct O3ThreadState : public ThreadState { #if FULL_SYSTEM O3ThreadState(O3CPU *_cpu, int _thread_num) : ThreadState(-1, _thread_num), - inSyscall(0), trapPending(0) - { } + cpu(_cpu), inSyscall(0), trapPending(0) + { + if (cpu->params->profile) { + profile = new FunctionProfile(cpu->params->system->kernelSymtab); + Callback *cb = + new MakeCallback<O3ThreadState, + &O3ThreadState::dumpFuncProfile>(this); + registerExitCallback(cb); + } + + // let's fill with a dummy node for now so we don't get a segfault + // on the first cycle when there's no node available. + static ProfileNode dummyNode; + profileNode = &dummyNode; + profilePC = 3; + } #else O3ThreadState(O3CPU *_cpu, int _thread_num, Process *_process, int _asid, MemObject *mem) @@ -95,6 +112,14 @@ struct O3ThreadState : public ThreadState { /** Handles the syscall. */ void syscall(int64_t callnum) { process->syscall(callnum, tc); } #endif + +#if FULL_SYSTEM + void dumpFuncProfile() + { + std::ostream *os = simout.create(csprintf("profile.%s.dat", cpu->name())); + profile->dump(xcProxy, *os); + } +#endif }; #endif // __CPU_O3_THREAD_STATE_HH__ diff --git a/src/cpu/o3/tournament_pred.cc b/src/cpu/o3/tournament_pred.cc index 7cf78dcb1..ffb941c77 100644 --- a/src/cpu/o3/tournament_pred.cc +++ b/src/cpu/o3/tournament_pred.cc @@ -62,6 +62,8 @@ TournamentBP::TournamentBP(unsigned _localPredictorSize, for (int i = 0; i < localPredictorSize; ++i) localCtrs[i].setBits(localCtrBits); + localPredictorMask = floorPow2(localPredictorSize) - 1; + if (!isPowerOf2(localHistoryTableSize)) { fatal("Invalid local history table size!\n"); } @@ -158,7 +160,7 @@ TournamentBP::lookup(Addr &branch_addr, void * &bp_history) //Lookup in the local predictor to get its branch prediction local_history_idx = calcLocHistIdx(branch_addr); local_predictor_idx = localHistoryTable[local_history_idx] - & localHistoryMask; + & localPredictorMask; local_prediction = localCtrs[local_predictor_idx].read() > threshold; //Lookup in the global predictor to get its branch prediction @@ -176,7 +178,8 @@ TournamentBP::lookup(Addr &branch_addr, void * &bp_history) bp_history = (void *)history; assert(globalHistory < globalPredictorSize && - local_history_idx < localPredictorSize); + local_history_idx < localHistoryTableSize && + local_predictor_idx < localPredictorSize); // Commented code is for doing speculative update of counters and // all histories. @@ -234,7 +237,7 @@ TournamentBP::update(Addr &branch_addr, bool taken, void *bp_history) // Get the local predictor's current prediction local_history_idx = calcLocHistIdx(branch_addr); local_predictor_hist = localHistoryTable[local_history_idx]; - local_predictor_idx = local_predictor_hist & localHistoryMask; + local_predictor_idx = local_predictor_hist & localPredictorMask; // Update the choice predictor to tell it which one was correct if // there was a prediction. @@ -256,6 +259,7 @@ TournamentBP::update(Addr &branch_addr, bool taken, void *bp_history) } assert(globalHistory < globalPredictorSize && + local_history_idx < localHistoryTableSize && local_predictor_idx < localPredictorSize); // Update the counters and local history with the proper diff --git a/src/cpu/o3/tournament_pred.hh b/src/cpu/o3/tournament_pred.hh index 66b4aaae2..472944910 100644 --- a/src/cpu/o3/tournament_pred.hh +++ b/src/cpu/o3/tournament_pred.hh @@ -159,6 +159,9 @@ class TournamentBP /** Size of the local predictor. */ unsigned localPredictorSize; + /** Mask to get the proper index bits into the predictor. */ + unsigned localPredictorMask; + /** Number of bits of the local predictor's counters. */ unsigned localCtrBits; diff --git a/src/cpu/ozone/checker_builder.cc b/src/cpu/ozone/checker_builder.cc index c372e51d6..99ba3e308 100644 --- a/src/cpu/ozone/checker_builder.cc +++ b/src/cpu/ozone/checker_builder.cc @@ -65,6 +65,8 @@ BEGIN_DECLARE_SIM_OBJECT_PARAMS(OzoneChecker) Param<Counter> max_insts_all_threads; Param<Counter> max_loads_any_thread; Param<Counter> max_loads_all_threads; + Param<Counter> stats_reset_inst; + Param<Tick> progress_interval; #if FULL_SYSTEM SimObjectParam<AlphaITB *> itb; @@ -79,6 +81,7 @@ BEGIN_DECLARE_SIM_OBJECT_PARAMS(OzoneChecker) Param<bool> defer_registration; Param<bool> exitOnError; + Param<bool> updateOnError; Param<bool> warnOnlyOnLoadError; Param<bool> function_trace; Param<Tick> function_trace_start; @@ -95,6 +98,9 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(OzoneChecker) "terminate when any thread reaches this load count"), INIT_PARAM(max_loads_all_threads, "terminate when all threads have reached this load count"), + INIT_PARAM(stats_reset_inst, + "blah"), + INIT_PARAM_DFLT(progress_interval, "CPU Progress Interval", 0), #if FULL_SYSTEM INIT_PARAM(itb, "Instruction TLB"), @@ -110,6 +116,7 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(OzoneChecker) INIT_PARAM(defer_registration, "defer system registration (for sampling)"), INIT_PARAM(exitOnError, "exit on error"), + INIT_PARAM(updateOnError, "Update the checker with the main CPU's state on error"), INIT_PARAM_DFLT(warnOnlyOnLoadError, "warn, but don't exit, if a load " "result errors", false), INIT_PARAM(function_trace, "Enable function trace"), @@ -127,7 +134,9 @@ CREATE_SIM_OBJECT(OzoneChecker) params->max_insts_all_threads = 0; params->max_loads_any_thread = 0; params->max_loads_all_threads = 0; + params->stats_reset_inst = 0; params->exitOnError = exitOnError; + params->updateOnError = updateOnError; params->warnOnlyOnLoadError = warnOnlyOnLoadError; params->deferRegistration = defer_registration; params->functionTrace = function_trace; @@ -140,6 +149,10 @@ CREATE_SIM_OBJECT(OzoneChecker) temp = max_insts_all_threads; temp = max_loads_any_thread; temp = max_loads_all_threads; + temp = stats_reset_inst; + Tick temp2 = progress_interval; + temp2++; + params->progress_interval = 0; #if FULL_SYSTEM params->itb = itb; diff --git a/src/cpu/ozone/cpu.hh b/src/cpu/ozone/cpu.hh index e411c12bd..ece68282f 100644 --- a/src/cpu/ozone/cpu.hh +++ b/src/cpu/ozone/cpu.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005 The Regents of The University of Michigan + * Copyright (c) 2006 The Regents of The University of Michigan * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -81,13 +81,13 @@ template <class> class Checker; /** - * Declaration of Out-of-Order CPU class. Basically it is a SimpleCPU with - * simple out-of-order capabilities added to it. It is still a 1 CPI machine - * (?), but is capable of handling cache misses. Basically it models having - * a ROB/IQ by only allowing a certain amount of instructions to execute while - * the cache miss is outstanding. + * Light weight out of order CPU model that approximates an out of + * order CPU. It is separated into a front end and a back end, with + * the template parameter Impl describing the classes used for each. + * The goal is to be able to specify through the Impl the class to use + * for the front end and back end, with different classes used to + * model different levels of detail. */ - template <class Impl> class OzoneCPU : public BaseCPU { @@ -273,6 +273,7 @@ class OzoneCPU : public BaseCPU typedef OzoneThreadState<Impl> ImplState; private: + // Committed thread state for the OzoneCPU. OzoneThreadState<Impl> thread; public: @@ -310,12 +311,6 @@ class OzoneCPU : public BaseCPU tickEvent.squash(); } - private: - Trace::InstRecord *traceData; - - template<typename T> - void trace_data(T data); - public: enum Status { Running, @@ -326,8 +321,6 @@ class OzoneCPU : public BaseCPU Status _status; public: - bool checkInterrupts; - void post_interrupt(int int_num, int index); void zero_fill_64(Addr addr) { @@ -379,6 +372,7 @@ class OzoneCPU : public BaseCPU FrontEnd *frontEnd; BackEnd *backEnd; + private: Status status() const { return _status; } void setStatus(Status new_status) { _status = new_status; } @@ -410,12 +404,11 @@ class OzoneCPU : public BaseCPU // number of idle cycles Stats::Average<> notIdleFraction; Stats::Formula idleFraction; - public: + public: virtual void serialize(std::ostream &os); virtual void unserialize(Checkpoint *cp, const std::string §ion); - #if FULL_SYSTEM /** Translates instruction requestion. */ Fault translateInstReq(RequestPtr &req, OzoneThreadState<Impl> *thread) @@ -582,12 +575,9 @@ class OzoneCPU : public BaseCPU Fault copy(Addr dest); - InstSeqNum globalSeqNum; - public: void squashFromTC(); - // @todo: This can be a useful debug function. Implement it. void dumpInsts() { frontEnd->dumpInsts(); } #if FULL_SYSTEM @@ -605,7 +595,6 @@ class OzoneCPU : public BaseCPU ThreadContext *tcBase() { return tc; } - bool decoupledFrontEnd; struct CommStruct { InstSeqNum doneSeqNum; InstSeqNum nonSpecSeqNum; @@ -614,8 +603,13 @@ class OzoneCPU : public BaseCPU bool stall; }; + + InstSeqNum globalSeqNum; + TimeBuffer<CommStruct> comm; + bool decoupledFrontEnd; + bool lockFlag; Stats::Scalar<> quiesceCycles; diff --git a/src/cpu/ozone/cpu_builder.cc b/src/cpu/ozone/cpu_builder.cc index e239b7a94..e3e4ec433 100644 --- a/src/cpu/ozone/cpu_builder.cc +++ b/src/cpu/ozone/cpu_builder.cc @@ -63,6 +63,7 @@ SimObjectParam<System *> system; Param<int> cpu_id; SimObjectParam<AlphaITB *> itb; SimObjectParam<AlphaDTB *> dtb; +Param<Tick> profile; #else SimObjectVectorParam<Process *> workload; //SimObjectParam<PageTable *> page_table; @@ -76,16 +77,19 @@ Param<Counter> max_insts_any_thread; Param<Counter> max_insts_all_threads; Param<Counter> max_loads_any_thread; Param<Counter> max_loads_all_threads; +Param<Counter> stats_reset_inst; +Param<Tick> progress_interval; //SimObjectParam<BaseCache *> icache; //SimObjectParam<BaseCache *> dcache; Param<unsigned> cachePorts; Param<unsigned> width; +Param<unsigned> frontEndLatency; Param<unsigned> frontEndWidth; +Param<unsigned> backEndLatency; Param<unsigned> backEndWidth; Param<unsigned> backEndSquashLatency; -Param<unsigned> backEndLatency; Param<unsigned> maxInstBufferSize; Param<unsigned> numPhysicalRegs; Param<unsigned> maxOutstandingMemOps; @@ -140,6 +144,7 @@ Param<unsigned> RASSize; Param<unsigned> LQEntries; Param<unsigned> SQEntries; +Param<bool> lsqLimits; Param<unsigned> LFSTSize; Param<unsigned> SSITSize; @@ -181,6 +186,7 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivOzoneCPU) INIT_PARAM(cpu_id, "processor ID"), INIT_PARAM(itb, "Instruction translation buffer"), INIT_PARAM(dtb, "Data translation buffer"), + INIT_PARAM(profile, ""), #else INIT_PARAM(workload, "Processes to run"), // INIT_PARAM(page_table, "Page table"), @@ -204,16 +210,21 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivOzoneCPU) "Terminate when all threads have reached this load" "count", 0), + INIT_PARAM_DFLT(stats_reset_inst, + "blah", + 0), + INIT_PARAM_DFLT(progress_interval, "Progress interval", 0), // INIT_PARAM_DFLT(icache, "L1 instruction cache", NULL), // INIT_PARAM_DFLT(dcache, "L1 data cache", NULL), INIT_PARAM_DFLT(cachePorts, "Cache Ports", 200), INIT_PARAM_DFLT(width, "Width", 1), + INIT_PARAM_DFLT(frontEndLatency, "Front end latency", 1), INIT_PARAM_DFLT(frontEndWidth, "Front end width", 1), + INIT_PARAM_DFLT(backEndLatency, "Back end latency", 1), INIT_PARAM_DFLT(backEndWidth, "Back end width", 1), INIT_PARAM_DFLT(backEndSquashLatency, "Back end squash latency", 1), - INIT_PARAM_DFLT(backEndLatency, "Back end latency", 1), INIT_PARAM_DFLT(maxInstBufferSize, "Maximum instruction buffer size", 16), INIT_PARAM(numPhysicalRegs, "Number of physical registers"), INIT_PARAM_DFLT(maxOutstandingMemOps, "Maximum outstanding memory operations", 4), @@ -274,6 +285,7 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivOzoneCPU) INIT_PARAM(LQEntries, "Number of load queue entries"), INIT_PARAM(SQEntries, "Number of store queue entries"), + INIT_PARAM_DFLT(lsqLimits, "LSQ size limits dispatch", true), INIT_PARAM(LFSTSize, "Last fetched store table size"), INIT_PARAM(SSITSize, "Store set ID table size"), @@ -336,6 +348,7 @@ CREATE_SIM_OBJECT(DerivOzoneCPU) params->cpu_id = cpu_id; params->itb = itb; params->dtb = dtb; + params->profile = profile; #else params->workload = workload; // params->pTable = page_table; @@ -347,6 +360,8 @@ CREATE_SIM_OBJECT(DerivOzoneCPU) params->max_insts_all_threads = max_insts_all_threads; params->max_loads_any_thread = max_loads_any_thread; params->max_loads_all_threads = max_loads_all_threads; + params->stats_reset_inst = stats_reset_inst; + params->progress_interval = progress_interval; // // Caches @@ -357,6 +372,7 @@ CREATE_SIM_OBJECT(DerivOzoneCPU) params->width = width; params->frontEndWidth = frontEndWidth; + params->frontEndLatency = frontEndLatency; params->backEndWidth = backEndWidth; params->backEndSquashLatency = backEndSquashLatency; params->backEndLatency = backEndLatency; @@ -414,6 +430,7 @@ CREATE_SIM_OBJECT(DerivOzoneCPU) params->LQEntries = LQEntries; params->SQEntries = SQEntries; + params->lsqLimits = lsqLimits; params->SSITSize = SSITSize; params->LFSTSize = LFSTSize; diff --git a/src/cpu/ozone/cpu_impl.hh b/src/cpu/ozone/cpu_impl.hh index 80f18434c..5c8b5001d 100644 --- a/src/cpu/ozone/cpu_impl.hh +++ b/src/cpu/ozone/cpu_impl.hh @@ -50,7 +50,6 @@ #include "arch/alpha/types.hh" #include "arch/vtophys.hh" #include "base/callback.hh" -//#include "base/remote_gdb.hh" #include "cpu/profile.hh" #include "kern/kernel_stats.hh" #include "sim/faults.hh" @@ -68,15 +67,6 @@ using namespace TheISA; template <class Impl> -template<typename T> -void -OzoneCPU<Impl>::trace_data(T data) { - if (traceData) { - traceData->setData(data); - } -} - -template <class Impl> OzoneCPU<Impl>::TickEvent::TickEvent(OzoneCPU *c, int w) : Event(&mainEventQueue, CPU_Tick_Pri), cpu(c), width(w) { @@ -112,7 +102,7 @@ OzoneCPU<Impl>::OzoneCPU(Params *p) _status = Idle; if (p->checker) { -#if USE_CHECKER + BaseCPU *temp_checker = p->checker; checker = dynamic_cast<Checker<DynInstPtr> *>(temp_checker); checker->setMemory(mem); @@ -126,6 +116,8 @@ OzoneCPU<Impl>::OzoneCPU(Params *p) panic("Checker enabled but not compiled in!"); #endif } else { + // If checker is not being used, then the xcProxy points + // directly to the CPU's ExecContext. checker = NULL; thread.tc = &ozoneTC; tc = &ozoneTC; @@ -138,7 +130,7 @@ OzoneCPU<Impl>::OzoneCPU(Params *p) thread.setStatus(ThreadContext::Suspended); #if FULL_SYSTEM - /***** All thread state stuff *****/ + // Setup thread state stuff. thread.cpu = this; thread.setTid(0); @@ -187,12 +179,15 @@ OzoneCPU<Impl>::OzoneCPU(Params *p) frontEnd->setBackEnd(backEnd); backEnd->setFrontEnd(frontEnd); - decoupledFrontEnd = p->decoupledFrontEnd; - globalSeqNum = 1; +#if FULL_SYSTEM checkInterrupts = false; +#endif + + lockFlag = 0; + // Setup rename table, initializing all values to ready. for (int i = 0; i < TheISA::TotalNumRegs; ++i) { thread.renameTable[i] = new DynInst(this); thread.renameTable[i]->setResultReady(); @@ -233,8 +228,6 @@ OzoneCPU<Impl>::OzoneCPU(Params *p) thread.setVirtPort(virt_port); #endif - lockFlag = 0; - DPRINTF(OzoneCPU, "OzoneCPU: Created Ozone cpu object.\n"); } @@ -247,6 +240,7 @@ template <class Impl> void OzoneCPU<Impl>::switchOut() { + BaseCPU::switchOut(_sampler); switchCount = 0; // Front end needs state from back end, so switch out the back end first. backEnd->switchOut(); @@ -257,6 +251,8 @@ template <class Impl> void OzoneCPU<Impl>::signalSwitched() { + // Only complete the switchout when both the front end and back + // end have signalled they are ready to switch. if (++switchCount == 2) { backEnd->doSwitchOut(); frontEnd->doSwitchOut(); @@ -266,6 +262,17 @@ OzoneCPU<Impl>::signalSwitched() #endif _status = SwitchedOut; +#ifndef NDEBUG + // Loop through all registers + for (int i = 0; i < AlphaISA::TotalNumRegs; ++i) { + assert(thread.renameTable[i] == frontEnd->renameTable[i]); + + assert(thread.renameTable[i] == backEnd->renameTable[i]); + + DPRINTF(OzoneCPU, "Checking if register %i matches.\n", i); + } +#endif + if (tickEvent.scheduled()) tickEvent.squash(); } @@ -278,13 +285,25 @@ OzoneCPU<Impl>::takeOverFrom(BaseCPU *oldCPU) { BaseCPU::takeOverFrom(oldCPU); + thread.trapPending = false; + thread.inSyscall = false; + backEnd->takeOverFrom(); frontEnd->takeOverFrom(); + frontEnd->renameTable.copyFrom(thread.renameTable); + backEnd->renameTable.copyFrom(thread.renameTable); assert(!tickEvent.scheduled()); +#ifndef NDEBUG + // Check rename table. + for (int i = 0; i < TheISA::TotalNumRegs; ++i) { + assert(thread.renameTable[i]->isResultReady()); + } +#endif + // @todo: Fix hardcoded number // Clear out any old information in time buffer. - for (int i = 0; i < 6; ++i) { + for (int i = 0; i < 15; ++i) { comm.advance(); } @@ -316,6 +335,10 @@ OzoneCPU<Impl>::activateContext(int thread_num, int delay) notIdleFraction++; scheduleTickEvent(delay); _status = Running; +#if FULL_SYSTEM + if (thread.quiesceEvent && thread.quiesceEvent->scheduled()) + thread.quiesceEvent->deschedule(); +#endif thread.setStatus(ThreadContext::Active); frontEnd->wakeFromQuiesce(); } @@ -393,7 +416,7 @@ template <class Impl> void OzoneCPU<Impl>::resetStats() { - startNumInst = numInst; +// startNumInst = numInst; notIdleFraction = (_status != Idle); } @@ -441,6 +464,15 @@ OzoneCPU<Impl>::serialize(std::ostream &os) ozoneTC.serialize(os); nameOut(os, csprintf("%s.tickEvent", name())); tickEvent.serialize(os); + + // Use SimpleThread's ability to checkpoint to make it easier to + // write out the registers. Also make this static so it doesn't + // get instantiated multiple times (causes a panic in statistics). + static CPUExecContext temp; + + nameOut(os, csprintf("%s.xc.0", name())); + temp.copyXC(thread.getXCProxy()); + temp.serialize(os); } template <class Impl> @@ -451,6 +483,15 @@ OzoneCPU<Impl>::unserialize(Checkpoint *cp, const std::string §ion) UNSERIALIZE_ENUM(_status); ozoneTC.unserialize(cp, csprintf("%s.tc", section)); tickEvent.unserialize(cp, csprintf("%s.tickEvent", section)); + + // Use SimpleThread's ability to checkpoint to make it easier to + // read in the registers. Also make this static so it doesn't + // get instantiated multiple times (causes a panic in statistics). + static CPUExecContext temp; + + temp.copyXC(thread.getXCProxy()); + temp.unserialize(cp, csprintf("%s.xc.0", section)); + thread.getXCProxy()->copyArchRegs(temp.getProxy()); } template <class Impl> @@ -810,7 +851,9 @@ OzoneCPU<Impl>::OzoneTC::halt() template <class Impl> void OzoneCPU<Impl>::OzoneTC::dumpFuncProfile() -{ } +{ + thread->dumpFuncProfile(); +} #endif template <class Impl> @@ -829,6 +872,7 @@ OzoneCPU<Impl>::OzoneTC::takeOverFrom(ThreadContext *old_context) copyArchRegs(old_context); setCpuId(old_context->readCpuId()); + thread->inst = old_context->getInst(); #if !FULL_SYSTEM setFuncExeInst(old_context->readFuncExeInst()); #else @@ -842,6 +886,7 @@ OzoneCPU<Impl>::OzoneTC::takeOverFrom(ThreadContext *old_context) thread->quiesceEvent->tc = this; } + // Copy kernel stats pointer from old context. thread->kernelStats = old_context->getKernelStats(); // storeCondFailures = 0; cpu->lockFlag = false; @@ -863,7 +908,11 @@ OzoneCPU<Impl>::OzoneTC::regStats(const std::string &name) template <class Impl> void OzoneCPU<Impl>::OzoneTC::serialize(std::ostream &os) -{ } +{ + // Once serialization is added, serialize the quiesce event and + // kernel stats. Will need to make sure there aren't multiple + // things that serialize them. +} template <class Impl> void @@ -896,16 +945,14 @@ template <class Impl> void OzoneCPU<Impl>::OzoneTC::profileClear() { - if (thread->profile) - thread->profile->clear(); + thread->profileClear(); } template <class Impl> void OzoneCPU<Impl>::OzoneTC::profileSample() { - if (thread->profile) - thread->profile->sample(thread->profileNode, thread->profilePC); + thread->profileSample(); } #endif @@ -916,7 +963,6 @@ OzoneCPU<Impl>::OzoneTC::getThreadNum() return thread->readTid(); } -// Also somewhat obnoxious. Really only used for the TLB fault. template <class Impl> TheISA::MachInst OzoneCPU<Impl>::OzoneTC::getInst() @@ -934,14 +980,20 @@ OzoneCPU<Impl>::OzoneTC::copyArchRegs(ThreadContext *tc) cpu->frontEnd->setPC(thread->PC); cpu->frontEnd->setNextPC(thread->nextPC); - for (int i = 0; i < TheISA::TotalNumRegs; ++i) { - if (i < TheISA::FP_Base_DepTag) { - thread->renameTable[i]->setIntResult(tc->readIntReg(i)); - } else if (i < (TheISA::FP_Base_DepTag + TheISA::NumFloatRegs)) { - int fp_idx = i - TheISA::FP_Base_DepTag; - thread->renameTable[i]->setDoubleResult( - tc->readFloatReg(fp_idx, 64)); - } + // First loop through the integer registers. + for (int i = 0; i < TheISA::NumIntRegs; ++i) { +/* DPRINTF(OzoneCPU, "Copying over register %i, had data %lli, " + "now has data %lli.\n", + i, thread->renameTable[i]->readIntResult(), + tc->readIntReg(i)); +*/ + thread->renameTable[i]->setIntResult(tc->readIntReg(i)); + } + + // Then loop through the floating point registers. + for (int i = 0; i < TheISA::NumFloatRegs; ++i) { + int fp_idx = i + TheISA::FP_Base_DepTag; + thread->renameTable[fp_idx]->setIntResult(tc->readFloatRegBits(i)); } #if !FULL_SYSTEM diff --git a/src/cpu/ozone/front_end.hh b/src/cpu/ozone/front_end.hh index 3ed3c4d18..5ffd3666e 100644 --- a/src/cpu/ozone/front_end.hh +++ b/src/cpu/ozone/front_end.hh @@ -34,6 +34,7 @@ #include <deque> #include "arch/utility.hh" +#include "base/timebuf.hh" #include "cpu/inst_seq.hh" #include "cpu/o3/bpred_unit.hh" #include "cpu/ozone/rename_table.hh" @@ -246,15 +247,21 @@ class FrontEnd void dumpInsts(); private: + TimeBuffer<int> numInstsReady; + typedef typename std::deque<DynInstPtr> InstBuff; typedef typename InstBuff::iterator InstBuffIt; + InstBuff feBuffer; + InstBuff instBuffer; int instBufferSize; int maxInstBufferSize; + int latency; + int width; int freeRegs; diff --git a/src/cpu/ozone/front_end_impl.hh b/src/cpu/ozone/front_end_impl.hh index 1b120460a..d34716de6 100644 --- a/src/cpu/ozone/front_end_impl.hh +++ b/src/cpu/ozone/front_end_impl.hh @@ -92,8 +92,10 @@ FrontEnd<Impl>::FrontEnd(Params *params) : branchPred(params), icachePort(this), mem(params->mem), + numInstsReady(params->frontEndLatency, 0), instBufferSize(0), maxInstBufferSize(params->maxInstBufferSize), + latency(params->frontEndLatency), width(params->frontEndWidth), freeRegs(params->numPhysicalRegs), numPhysRegs(params->numPhysicalRegs), @@ -326,6 +328,18 @@ FrontEnd<Impl>::tick() if (switchedOut) return; + for (int insts_to_queue = numInstsReady[-latency]; + !instBuffer.empty() && insts_to_queue; + --insts_to_queue) + { + DPRINTF(FE, "Transferring instruction [sn:%lli] to the feBuffer\n", + instBuffer.front()->seqNum); + feBuffer.push_back(instBuffer.front()); + instBuffer.pop_front(); + } + + numInstsReady.advance(); + // @todo: Maybe I want to just have direct communication... if (fromCommit->doneSeqNum) { branchPred.update(fromCommit->doneSeqNum, 0); @@ -339,8 +353,8 @@ FrontEnd<Impl>::tick() cacheBlkValid = true; status = Running; - if (barrierInst) - status = SerializeBlocked; +// if (barrierInst) +// status = SerializeBlocked; if (freeRegs <= 0) status = RenameBlocked; checkBE(); @@ -414,11 +428,12 @@ FrontEnd<Impl>::tick() // latency instBuffer.push_back(inst); ++instBufferSize; + numInstsReady[0]++; ++num_inst; #if FULL_SYSTEM if (inst->isQuiesce()) { - warn("%lli: Quiesce instruction encountered, halting fetch!", curTick); +// warn("%lli: Quiesce instruction encountered, halting fetch!", curTick); status = QuiescePending; break; } @@ -572,10 +587,10 @@ FrontEnd<Impl>::processBarriers(DynInstPtr &inst) // Change status over to SerializeBlocked so that other stages know // what this is blocked on. - status = SerializeBlocked; +// status = SerializeBlocked; - barrierInst = inst; - return true; +// barrierInst = inst; +// return true; } else if ((inst->isStoreConditional() || inst->isSerializeAfter()) && !inst->isSerializeHandled()) { DPRINTF(FE, "Serialize after instruction encountered.\n"); @@ -620,6 +635,7 @@ FrontEnd<Impl>::handleFault(Fault &fault) instruction->fault = fault; instruction->setCanIssue(); instBuffer.push_back(instruction); + numInstsReady[0]++; ++instBufferSize; } @@ -649,6 +665,21 @@ FrontEnd<Impl>::squash(const InstSeqNum &squash_num, const Addr &next_PC, freeRegs+= inst->numDestRegs(); } + while (!feBuffer.empty() && + feBuffer.back()->seqNum > squash_num) { + DynInstPtr inst = feBuffer.back(); + + DPRINTF(FE, "Squashing instruction [sn:%lli] PC %#x\n", + inst->seqNum, inst->readPC()); + + inst->clearDependents(); + + feBuffer.pop_back(); + --instBufferSize; + + freeRegs+= inst->numDestRegs(); + } + // Copy over rename table from the back end. renameTable.copyFrom(backEnd->renameTable); @@ -666,12 +697,12 @@ FrontEnd<Impl>::squash(const InstSeqNum &squash_num, const Addr &next_PC, DPRINTF(FE, "Squashing outstanding Icache access.\n"); memReq = NULL; } - +/* if (status == SerializeBlocked) { assert(barrierInst->seqNum > squash_num); barrierInst = NULL; } - +*/ // Unless this squash originated from the front end, we're probably // in running mode now. // Actually might want to make this latency dependent. @@ -683,13 +714,22 @@ template <class Impl> typename Impl::DynInstPtr FrontEnd<Impl>::getInst() { - if (instBufferSize == 0) { + if (feBuffer.empty()) { return NULL; } - DynInstPtr inst = instBuffer.front(); + DynInstPtr inst = feBuffer.front(); - instBuffer.pop_front(); + if (inst->isSerializeBefore() || inst->isIprAccess()) { + DPRINTF(FE, "Back end is getting a serialize before inst\n"); + if (!backEnd->robEmpty()) { + DPRINTF(FE, "Rob is not empty yet, not returning inst\n"); + return NULL; + } + inst->clearSerializeBefore(); + } + + feBuffer.pop_front(); --instBufferSize; @@ -784,11 +824,11 @@ FrontEnd<Impl>::updateStatus() } if (status == BEBlocked && !be_block) { - if (barrierInst) { - status = SerializeBlocked; - } else { +// if (barrierInst) { +// status = SerializeBlocked; +// } else { status = Running; - } +// } ret_val = true; } return ret_val; @@ -810,6 +850,7 @@ template <class Impl> typename Impl::DynInstPtr FrontEnd<Impl>::getInstFromCacheline() { +/* if (status == SerializeComplete) { DynInstPtr inst = barrierInst; status = Running; @@ -817,7 +858,7 @@ FrontEnd<Impl>::getInstFromCacheline() inst->clearSerializeBefore(); return inst; } - +*/ InstSeqNum inst_seq; MachInst inst; // @todo: Fix this magic number used here to handle word offset (and @@ -932,6 +973,7 @@ FrontEnd<Impl>::doSwitchOut() squash(0, 0); instBuffer.clear(); instBufferSize = 0; + feBuffer.clear(); status = Idle; } diff --git a/src/cpu/ozone/inorder_back_end_impl.hh b/src/cpu/ozone/inorder_back_end_impl.hh index 701fc0ee9..16ebac163 100644 --- a/src/cpu/ozone/inorder_back_end_impl.hh +++ b/src/cpu/ozone/inorder_back_end_impl.hh @@ -284,7 +284,7 @@ InorderBackEnd<Impl>::executeInsts() } inst->setExecuted(); - inst->setCompleted(); + inst->setResultReady(); inst->setCanCommit(); instList.pop_front(); diff --git a/src/cpu/ozone/inst_queue_impl.hh b/src/cpu/ozone/inst_queue_impl.hh index f2d80e621..32a940241 100644 --- a/src/cpu/ozone/inst_queue_impl.hh +++ b/src/cpu/ozone/inst_queue_impl.hh @@ -850,13 +850,13 @@ template <class Impl> void InstQueue<Impl>::addReadyMemInst(DynInstPtr &ready_inst) { - OpClass op_class = ready_inst->opClass(); +// OpClass op_class = ready_inst->opClass(); readyInsts.push(ready_inst); DPRINTF(IQ, "Instruction is ready to issue, putting it onto " "the ready list, PC %#x opclass:%i [sn:%lli].\n", - ready_inst->readPC(), op_class, ready_inst->seqNum); + ready_inst->readPC(), ready_inst->opClass(), ready_inst->seqNum); } /* template <class Impl> @@ -1177,11 +1177,11 @@ InstQueue<Impl>::addIfReady(DynInstPtr &inst) return; } - OpClass op_class = inst->opClass(); +// OpClass op_class = inst->opClass(); DPRINTF(IQ, "Instruction is ready to issue, putting it onto " "the ready list, PC %#x opclass:%i [sn:%lli].\n", - inst->readPC(), op_class, inst->seqNum); + inst->readPC(), inst->opClass(), inst->seqNum); readyInsts.push(inst); } diff --git a/src/cpu/ozone/lw_back_end.hh b/src/cpu/ozone/lw_back_end.hh index d836ceebd..49c6a1ae2 100644 --- a/src/cpu/ozone/lw_back_end.hh +++ b/src/cpu/ozone/lw_back_end.hh @@ -80,7 +80,7 @@ class LWBackEnd TimeBuffer<IssueToExec> i2e; typename TimeBuffer<IssueToExec>::wire instsToExecute; TimeBuffer<ExecToCommit> e2c; - TimeBuffer<Writeback> numInstsToWB; + TimeBuffer<int> numInstsToWB; TimeBuffer<CommStruct> *comm; typename TimeBuffer<CommStruct>::wire toIEW; @@ -139,7 +139,7 @@ class LWBackEnd Tick lastCommitCycle; - bool robEmpty() { return instList.empty(); } + bool robEmpty() { return numInsts == 0; } bool isFull() { return numInsts >= numROBEntries; } bool isBlocked() { return status == Blocked || dispatchStatus == Blocked; } @@ -194,6 +194,7 @@ class LWBackEnd } void instToCommit(DynInstPtr &inst); + void readyInstsForCommit(); void switchOut(); void doSwitchOut(); @@ -255,12 +256,13 @@ class LWBackEnd RenameTable<Impl> renameTable; private: + int latency; + // General back end width. Used if the more specific isn't given. int width; // Dispatch width. int dispatchWidth; - int numDispatchEntries; int dispatchSize; int waitingInsts; @@ -285,6 +287,7 @@ class LWBackEnd int numROBEntries; int numInsts; + bool lsqLimits; std::set<InstSeqNum> waitingMemOps; typedef std::set<InstSeqNum>::iterator MemIt; @@ -295,9 +298,6 @@ class LWBackEnd InstSeqNum squashSeqNum; Addr squashNextPC; - Fault faultFromFetch; - bool fetchHasFault; - bool switchedOut; bool switchPending; @@ -321,8 +321,6 @@ class LWBackEnd std::list<DynInstPtr> replayList; std::list<DynInstPtr> writeback; - int latency; - int squashLatency; bool exactFullStall; @@ -331,37 +329,39 @@ class LWBackEnd /* Stats::Scalar<> dcacheStallCycles; Counter lastDcacheStall; */ - Stats::Vector<> rob_cap_events; - Stats::Vector<> rob_cap_inst_count; - Stats::Vector<> iq_cap_events; - Stats::Vector<> iq_cap_inst_count; + Stats::Vector<> robCapEvents; + Stats::Vector<> robCapInstCount; + Stats::Vector<> iqCapEvents; + Stats::Vector<> iqCapInstCount; // total number of instructions executed - Stats::Vector<> exe_inst; - Stats::Vector<> exe_swp; - Stats::Vector<> exe_nop; - Stats::Vector<> exe_refs; - Stats::Vector<> exe_loads; - Stats::Vector<> exe_branches; + Stats::Vector<> exeInst; + Stats::Vector<> exeSwp; + Stats::Vector<> exeNop; + Stats::Vector<> exeRefs; + Stats::Vector<> exeLoads; + Stats::Vector<> exeBranches; - Stats::Vector<> issued_ops; + Stats::Vector<> issuedOps; // total number of loads forwaded from LSQ stores - Stats::Vector<> lsq_forw_loads; + Stats::Vector<> lsqForwLoads; // total number of loads ignored due to invalid addresses - Stats::Vector<> inv_addr_loads; + Stats::Vector<> invAddrLoads; // total number of software prefetches ignored due to invalid addresses - Stats::Vector<> inv_addr_swpfs; + Stats::Vector<> invAddrSwpfs; // ready loads blocked due to memory disambiguation - Stats::Vector<> lsq_blocked_loads; + Stats::Vector<> lsqBlockedLoads; Stats::Scalar<> lsqInversion; - Stats::Vector<> n_issued_dist; - Stats::VectorDistribution<> issue_delay_dist; + Stats::Vector<> nIssuedDist; +/* + Stats::VectorDistribution<> issueDelayDist; - Stats::VectorDistribution<> queue_res_dist; + Stats::VectorDistribution<> queueResDist; +*/ /* Stats::Vector<> stat_fu_busy; Stats::Vector2d<> stat_fuBusy; @@ -379,37 +379,37 @@ class LWBackEnd Stats::Formula commit_ipb; Stats::Formula lsq_inv_rate; */ - Stats::Vector<> writeback_count; - Stats::Vector<> producer_inst; - Stats::Vector<> consumer_inst; - Stats::Vector<> wb_penalized; + Stats::Vector<> writebackCount; + Stats::Vector<> producerInst; + Stats::Vector<> consumerInst; + Stats::Vector<> wbPenalized; - Stats::Formula wb_rate; - Stats::Formula wb_fanout; - Stats::Formula wb_penalized_rate; + Stats::Formula wbRate; + Stats::Formula wbFanout; + Stats::Formula wbPenalizedRate; // total number of instructions committed - Stats::Vector<> stat_com_inst; - Stats::Vector<> stat_com_swp; - Stats::Vector<> stat_com_refs; - Stats::Vector<> stat_com_loads; - Stats::Vector<> stat_com_membars; - Stats::Vector<> stat_com_branches; + Stats::Vector<> statComInst; + Stats::Vector<> statComSwp; + Stats::Vector<> statComRefs; + Stats::Vector<> statComLoads; + Stats::Vector<> statComMembars; + Stats::Vector<> statComBranches; - Stats::Distribution<> n_committed_dist; + Stats::Distribution<> nCommittedDist; - Stats::Scalar<> commit_eligible_samples; - Stats::Vector<> commit_eligible; + Stats::Scalar<> commitEligibleSamples; + Stats::Vector<> commitEligible; Stats::Vector<> squashedInsts; Stats::Vector<> ROBSquashedInsts; - Stats::Scalar<> ROB_fcount; - Stats::Formula ROB_full_rate; + Stats::Scalar<> ROBFcount; + Stats::Formula ROBFullRate; - Stats::Vector<> ROB_count; // cumulative ROB occupancy - Stats::Formula ROB_occ_rate; - Stats::VectorDistribution<> ROB_occ_dist; + Stats::Vector<> ROBCount; // cumulative ROB occupancy + Stats::Formula ROBOccRate; +// Stats::VectorDistribution<> ROBOccDist; public: void dumpInsts(); diff --git a/src/cpu/ozone/lw_back_end_impl.hh b/src/cpu/ozone/lw_back_end_impl.hh index a4f1d805e..f87a2bc57 100644 --- a/src/cpu/ozone/lw_back_end_impl.hh +++ b/src/cpu/ozone/lw_back_end_impl.hh @@ -141,13 +141,14 @@ LWBackEnd<Impl>::replayMemInst(DynInstPtr &inst) template <class Impl> LWBackEnd<Impl>::LWBackEnd(Params *params) - : d2i(5, 5), i2e(5, 5), e2c(5, 5), numInstsToWB(5, 5), + : d2i(5, 5), i2e(5, 5), e2c(5, 5), numInstsToWB(params->backEndLatency, 0), trapSquash(false), tcSquash(false), - width(params->backEndWidth), exactFullStall(true) + latency(params->backEndLatency), + width(params->backEndWidth), lsqLimits(params->lsqLimits), + exactFullStall(true) { numROBEntries = params->numROBEntries; numInsts = 0; - numDispatchEntries = 32; maxOutstandingMemOps = params->maxOutstandingMemOps; numWaitingMemOps = 0; waitingInsts = 0; @@ -184,78 +185,79 @@ void LWBackEnd<Impl>::regStats() { using namespace Stats; - rob_cap_events + LSQ.regStats(); + + robCapEvents .init(cpu->number_of_threads) .name(name() + ".ROB:cap_events") .desc("number of cycles where ROB cap was active") .flags(total) ; - rob_cap_inst_count + robCapInstCount .init(cpu->number_of_threads) .name(name() + ".ROB:cap_inst") .desc("number of instructions held up by ROB cap") .flags(total) ; - iq_cap_events + iqCapEvents .init(cpu->number_of_threads) .name(name() +".IQ:cap_events" ) .desc("number of cycles where IQ cap was active") .flags(total) ; - iq_cap_inst_count + iqCapInstCount .init(cpu->number_of_threads) .name(name() + ".IQ:cap_inst") .desc("number of instructions held up by IQ cap") .flags(total) ; - - exe_inst + exeInst .init(cpu->number_of_threads) .name(name() + ".ISSUE:count") .desc("number of insts issued") .flags(total) ; - exe_swp + exeSwp .init(cpu->number_of_threads) .name(name() + ".ISSUE:swp") .desc("number of swp insts issued") .flags(total) ; - exe_nop + exeNop .init(cpu->number_of_threads) .name(name() + ".ISSUE:nop") .desc("number of nop insts issued") .flags(total) ; - exe_refs + exeRefs .init(cpu->number_of_threads) .name(name() + ".ISSUE:refs") .desc("number of memory reference insts issued") .flags(total) ; - exe_loads + exeLoads .init(cpu->number_of_threads) .name(name() + ".ISSUE:loads") .desc("number of load insts issued") .flags(total) ; - exe_branches + exeBranches .init(cpu->number_of_threads) .name(name() + ".ISSUE:branches") .desc("Number of branches issued") .flags(total) ; - issued_ops + issuedOps .init(cpu->number_of_threads) .name(name() + ".ISSUE:op_count") .desc("number of insts issued") @@ -272,28 +274,28 @@ LWBackEnd<Impl>::regStats() // // Other stats // - lsq_forw_loads + lsqForwLoads .init(cpu->number_of_threads) .name(name() + ".LSQ:forw_loads") .desc("number of loads forwarded via LSQ") .flags(total) ; - inv_addr_loads + invAddrLoads .init(cpu->number_of_threads) .name(name() + ".ISSUE:addr_loads") .desc("number of invalid-address loads") .flags(total) ; - inv_addr_swpfs + invAddrSwpfs .init(cpu->number_of_threads) .name(name() + ".ISSUE:addr_swpfs") .desc("number of invalid-address SW prefetches") .flags(total) ; - lsq_blocked_loads + lsqBlockedLoads .init(cpu->number_of_threads) .name(name() + ".LSQ:blocked_loads") .desc("number of ready loads not issued due to memory disambiguation") @@ -305,51 +307,52 @@ LWBackEnd<Impl>::regStats() .desc("Number of times LSQ instruction issued early") ; - n_issued_dist + nIssuedDist .init(issueWidth + 1) .name(name() + ".ISSUE:issued_per_cycle") .desc("Number of insts issued each cycle") .flags(total | pdf | dist) ; - issue_delay_dist +/* + issueDelayDist .init(Num_OpClasses,0,99,2) .name(name() + ".ISSUE:") .desc("cycles from operands ready to issue") .flags(pdf | cdf) ; - queue_res_dist + queueResDist .init(Num_OpClasses, 0, 99, 2) .name(name() + ".IQ:residence:") .desc("cycles from dispatch to issue") .flags(total | pdf | cdf ) ; for (int i = 0; i < Num_OpClasses; ++i) { - queue_res_dist.subname(i, opClassStrings[i]); + queueResDist.subname(i, opClassStrings[i]); } - - writeback_count +*/ + writebackCount .init(cpu->number_of_threads) .name(name() + ".WB:count") .desc("cumulative count of insts written-back") .flags(total) ; - producer_inst + producerInst .init(cpu->number_of_threads) .name(name() + ".WB:producers") .desc("num instructions producing a value") .flags(total) ; - consumer_inst + consumerInst .init(cpu->number_of_threads) .name(name() + ".WB:consumers") .desc("num instructions consuming a value") .flags(total) ; - wb_penalized + wbPenalized .init(cpu->number_of_threads) .name(name() + ".WB:penalized") .desc("number of instrctions required to write to 'other' IQ") @@ -357,71 +360,71 @@ LWBackEnd<Impl>::regStats() ; - wb_penalized_rate + wbPenalizedRate .name(name() + ".WB:penalized_rate") .desc ("fraction of instructions written-back that wrote to 'other' IQ") .flags(total) ; - wb_penalized_rate = wb_penalized / writeback_count; + wbPenalizedRate = wbPenalized / writebackCount; - wb_fanout + wbFanout .name(name() + ".WB:fanout") .desc("average fanout of values written-back") .flags(total) ; - wb_fanout = producer_inst / consumer_inst; + wbFanout = producerInst / consumerInst; - wb_rate + wbRate .name(name() + ".WB:rate") .desc("insts written-back per cycle") .flags(total) ; - wb_rate = writeback_count / cpu->numCycles; + wbRate = writebackCount / cpu->numCycles; - stat_com_inst + statComInst .init(cpu->number_of_threads) .name(name() + ".COM:count") .desc("Number of instructions committed") .flags(total) ; - stat_com_swp + statComSwp .init(cpu->number_of_threads) .name(name() + ".COM:swp_count") .desc("Number of s/w prefetches committed") .flags(total) ; - stat_com_refs + statComRefs .init(cpu->number_of_threads) .name(name() + ".COM:refs") .desc("Number of memory references committed") .flags(total) ; - stat_com_loads + statComLoads .init(cpu->number_of_threads) .name(name() + ".COM:loads") .desc("Number of loads committed") .flags(total) ; - stat_com_membars + statComMembars .init(cpu->number_of_threads) .name(name() + ".COM:membars") .desc("Number of memory barriers committed") .flags(total) ; - stat_com_branches + statComBranches .init(cpu->number_of_threads) .name(name() + ".COM:branches") .desc("Number of branches committed") .flags(total) ; - n_committed_dist + nCommittedDist .init(0,commitWidth,1) .name(name() + ".COM:committed_per_cycle") .desc("Number of insts commited each cycle") @@ -441,14 +444,14 @@ LWBackEnd<Impl>::regStats() // -> The standard deviation is computed only over cycles where // we reached the BW limit // - commit_eligible + commitEligible .init(cpu->number_of_threads) .name(name() + ".COM:bw_limited") .desc("number of insts not committed due to BW limits") .flags(total) ; - commit_eligible_samples + commitEligibleSamples .name(name() + ".COM:bw_lim_events") .desc("number cycles where commit BW limit reached") ; @@ -465,37 +468,38 @@ LWBackEnd<Impl>::regStats() .desc("Number of instructions removed from inst list when they reached the head of the ROB") ; - ROB_fcount + ROBFcount .name(name() + ".ROB:full_count") .desc("number of cycles where ROB was full") ; - ROB_count + ROBCount .init(cpu->number_of_threads) .name(name() + ".ROB:occupancy") .desc(name() + ".ROB occupancy (cumulative)") .flags(total) ; - ROB_full_rate + ROBFullRate .name(name() + ".ROB:full_rate") .desc("ROB full per cycle") ; - ROB_full_rate = ROB_fcount / cpu->numCycles; + ROBFullRate = ROBFcount / cpu->numCycles; - ROB_occ_rate + ROBOccRate .name(name() + ".ROB:occ_rate") .desc("ROB occupancy rate") .flags(total) ; - ROB_occ_rate = ROB_count / cpu->numCycles; - - ROB_occ_dist + ROBOccRate = ROBCount / cpu->numCycles; +/* + ROBOccDist .init(cpu->number_of_threads,0,numROBEntries,2) .name(name() + ".ROB:occ_dist") .desc("ROB Occupancy per cycle") .flags(total | cdf) ; +*/ } template <class Impl> @@ -588,17 +592,21 @@ LWBackEnd<Impl>::tick() { DPRINTF(BE, "Ticking back end\n"); + // Read in any done instruction information and update the IQ or LSQ. + updateStructures(); + if (switchPending && robEmpty() && !LSQ.hasStoresToWB()) { cpu->signalSwitched(); return; } - ROB_count[0]+= numInsts; + readyInstsForCommit(); - wbCycle = 0; + numInstsToWB.advance(); - // Read in any done instruction information and update the IQ or LSQ. - updateStructures(); + ROBCount[0]+= numInsts; + + wbCycle = 0; #if FULL_SYSTEM checkInterrupts(); @@ -674,6 +682,10 @@ LWBackEnd<Impl>::dispatchInsts() while (numInsts < numROBEntries && numWaitingMemOps < maxOutstandingMemOps) { // Get instruction from front of time buffer + if (lsqLimits && LSQ.isFull()) { + break; + } + DynInstPtr inst = frontEnd->getInst(); if (!inst) { break; @@ -732,6 +744,7 @@ LWBackEnd<Impl>::dispatchInsts() inst->setIssued(); inst->setExecuted(); inst->setCanCommit(); + numInstsToWB[0]++; } else { DPRINTF(BE, "Instruction [sn:%lli] ready, addding to " "exeList.\n", @@ -866,8 +879,17 @@ LWBackEnd<Impl>::executeInsts() if (inst->isLoad()) { LSQ.executeLoad(inst); } else if (inst->isStore()) { - LSQ.executeStore(inst); - if (inst->req && !(inst->req->getFlags() & LOCKED)) { + Fault fault = LSQ.executeStore(inst); + + if (!inst->isStoreConditional() && fault == NoFault) { + inst->setExecuted(); + + instToCommit(inst); + } else if (fault != NoFault) { + // If the instruction faulted, then we need to send it along to commit + // without the instruction completing. + // Send this instruction to commit, also make sure iew stage + // realizes there is activity. inst->setExecuted(); instToCommit(inst); @@ -908,36 +930,54 @@ LWBackEnd<Impl>::executeInsts() } } - issued_ops[0]+= num_executed; - n_issued_dist[num_executed]++; + issuedOps[0]+= num_executed; + nIssuedDist[num_executed]++; } template<class Impl> void LWBackEnd<Impl>::instToCommit(DynInstPtr &inst) { - DPRINTF(BE, "Sending instructions to commit [sn:%lli] PC %#x.\n", inst->seqNum, inst->readPC()); if (!inst->isSquashed()) { - DPRINTF(BE, "Writing back instruction [sn:%lli] PC %#x.\n", - inst->seqNum, inst->readPC()); - - inst->setCanCommit(); - if (inst->isExecuted()) { inst->setResultReady(); int dependents = wakeDependents(inst); if (dependents) { - producer_inst[0]++; - consumer_inst[0]+= dependents; + producerInst[0]++; + consumerInst[0]+= dependents; } } } - writeback_count[0]++; + writeback.push_back(inst); + + numInstsToWB[0]++; + + writebackCount[0]++; +} + +template <class Impl> +void +LWBackEnd<Impl>::readyInstsForCommit() +{ + for (int i = numInstsToWB[-latency]; + !writeback.empty() && i; + --i) + { + DynInstPtr inst = writeback.front(); + writeback.pop_front(); + if (!inst->isSquashed()) { + DPRINTF(BE, "Writing back instruction [sn:%lli] PC %#x.\n", + inst->seqNum, inst->readPC()); + + inst->setCanCommit(); + } + } } + #if 0 template <class Impl> void @@ -1010,7 +1050,7 @@ LWBackEnd<Impl>::commitInst(int inst_num) // or store inst. Signal backwards that it should be executed. if (!inst->isExecuted()) { if (inst->isNonSpeculative() || - inst->isStoreConditional() || + (inst->isStoreConditional() && inst->getFault() == NoFault) || inst->isMemBarrier() || inst->isWriteBarrier()) { #if !FULL_SYSTEM @@ -1151,6 +1191,20 @@ LWBackEnd<Impl>::commitInst(int inst_num) ++freed_regs; } +#if FULL_SYSTEM + if (thread->profile) { +// bool usermode = +// (xc->readMiscReg(AlphaISA::IPR_DTB_CM) & 0x18) != 0; +// thread->profilePC = usermode ? 1 : inst->readPC(); + thread->profilePC = inst->readPC(); + ProfileNode *node = thread->profile->consume(thread->getXCProxy(), + inst->staticInst); + + if (node) + thread->profileNode = node; + } +#endif + if (inst->traceData) { inst->traceData->setFetchSeq(inst->seqNum); inst->traceData->setCPSeq(thread->numInst); @@ -1158,6 +1212,9 @@ LWBackEnd<Impl>::commitInst(int inst_num) inst->traceData = NULL; } + if (inst->isCopy()) + panic("Should not commit any copy instructions!"); + inst->clearDependents(); frontEnd->addFreeRegs(freed_regs); @@ -1207,9 +1264,9 @@ LWBackEnd<Impl>::commitInsts() while (!instList.empty() && inst_num < commitWidth) { if (instList.back()->isSquashed()) { instList.back()->clearDependents(); + ROBSquashedInsts[instList.back()->threadNumber]++; instList.pop_back(); --numInsts; - ROBSquashedInsts[instList.back()->threadNumber]++; continue; } @@ -1221,7 +1278,7 @@ LWBackEnd<Impl>::commitInsts() break; } } - n_committed_dist.sample(inst_num); + nCommittedDist.sample(inst_num); } template <class Impl> @@ -1231,10 +1288,10 @@ LWBackEnd<Impl>::squash(const InstSeqNum &sn) LSQ.squash(sn); int freed_regs = 0; - InstListIt waiting_list_end = waitingList.end(); + InstListIt insts_end_it = waitingList.end(); InstListIt insts_it = waitingList.begin(); - while (insts_it != waiting_list_end && (*insts_it)->seqNum > sn) + while (insts_it != insts_end_it && (*insts_it)->seqNum > sn) { if ((*insts_it)->isSquashed()) { ++insts_it; @@ -1260,6 +1317,7 @@ LWBackEnd<Impl>::squash(const InstSeqNum &sn) while (!instList.empty() && (*insts_it)->seqNum > sn) { if ((*insts_it)->isSquashed()) { + panic("Instruction should not be already squashed and on list!"); ++insts_it; continue; } @@ -1291,18 +1349,6 @@ LWBackEnd<Impl>::squash(const InstSeqNum &sn) --numInsts; } - insts_it = waitingList.begin(); - while (!waitingList.empty() && insts_it != waitingList.end()) { - if ((*insts_it)->seqNum < sn) { - ++insts_it; - continue; - } - assert((*insts_it)->isSquashed()); - - waitingList.erase(insts_it++); - waitingInsts--; - } - while (memBarrier && memBarrier->seqNum > sn) { DPRINTF(BE, "[sn:%lli] Memory barrier squashed (or previously " "squashed)\n", memBarrier->seqNum); @@ -1320,6 +1366,18 @@ LWBackEnd<Impl>::squash(const InstSeqNum &sn) } } + insts_it = replayList.begin(); + insts_end_it = replayList.end(); + while (!replayList.empty() && insts_it != insts_end_it) { + if ((*insts_it)->seqNum < sn) { + ++insts_it; + continue; + } + assert((*insts_it)->isSquashed()); + + replayList.erase(insts_it++); + } + frontEnd->addFreeRegs(freed_regs); } @@ -1392,14 +1450,6 @@ LWBackEnd<Impl>::squashDueToMemBlocked(DynInstPtr &inst) template <class Impl> void -LWBackEnd<Impl>::fetchFault(Fault &fault) -{ - faultFromFetch = fault; - fetchHasFault = true; -} - -template <class Impl> -void LWBackEnd<Impl>::switchOut() { switchPending = true; @@ -1416,17 +1466,25 @@ LWBackEnd<Impl>::doSwitchOut() // yet written back. assert(robEmpty()); assert(!LSQ.hasStoresToWB()); - + writeback.clear(); + for (int i = 0; i < numInstsToWB.getSize() + 1; ++i) + numInstsToWB.advance(); + +// squash(0); + assert(waitingList.empty()); + assert(instList.empty()); + assert(replayList.empty()); + assert(writeback.empty()); LSQ.switchOut(); - - squash(0); } template <class Impl> void LWBackEnd<Impl>::takeOverFrom(ThreadContext *old_tc) { - switchedOut = false; + assert(!squashPending); + squashSeqNum = 0; + squashNextPC = 0; tcSquash = false; trapSquash = false; @@ -1451,27 +1509,27 @@ LWBackEnd<Impl>::updateExeInstStats(DynInstPtr &inst) // #ifdef TARGET_ALPHA if (inst->isDataPrefetch()) - exe_swp[thread_number]++; + exeSwp[thread_number]++; else - exe_inst[thread_number]++; + exeInst[thread_number]++; #else - exe_inst[thread_number]++; + exeInst[thread_number]++; #endif // // Control operations // if (inst->isControl()) - exe_branches[thread_number]++; + exeBranches[thread_number]++; // // Memory operations // if (inst->isMemRef()) { - exe_refs[thread_number]++; + exeRefs[thread_number]++; if (inst->isLoad()) - exe_loads[thread_number]++; + exeLoads[thread_number]++; } } @@ -1491,33 +1549,33 @@ LWBackEnd<Impl>::updateComInstStats(DynInstPtr &inst) // #ifdef TARGET_ALPHA if (inst->isDataPrefetch()) { - stat_com_swp[tid]++; + statComSwp[tid]++; } else { - stat_com_inst[tid]++; + statComInst[tid]++; } #else - stat_com_inst[tid]++; + statComInst[tid]++; #endif // // Control Instructions // if (inst->isControl()) - stat_com_branches[tid]++; + statComBranches[tid]++; // // Memory references // if (inst->isMemRef()) { - stat_com_refs[tid]++; + statComRefs[tid]++; if (inst->isLoad()) { - stat_com_loads[tid]++; + statComLoads[tid]++; } } if (inst->isMemBarrier()) { - stat_com_membars[tid]++; + statComMembars[tid]++; } } @@ -1569,6 +1627,45 @@ LWBackEnd<Impl>::dumpInsts() ++num; } + inst_list_it = --(writeback.end()); + + cprintf("Writeback list size: %i\n", writeback.size()); + + while (inst_list_it != writeback.end()) + { + cprintf("Instruction:%i\n", + num); + if (!(*inst_list_it)->isSquashed()) { + if (!(*inst_list_it)->isIssued()) { + ++valid_num; + cprintf("Count:%i\n", valid_num); + } else if ((*inst_list_it)->isMemRef() && + !(*inst_list_it)->memOpDone) { + // Loads that have not been marked as executed still count + // towards the total instructions. + ++valid_num; + cprintf("Count:%i\n", valid_num); + } + } + + cprintf("PC:%#x\n[sn:%lli]\n[tid:%i]\n" + "Issued:%i\nSquashed:%i\n", + (*inst_list_it)->readPC(), + (*inst_list_it)->seqNum, + (*inst_list_it)->threadNumber, + (*inst_list_it)->isIssued(), + (*inst_list_it)->isSquashed()); + + if ((*inst_list_it)->isMemRef()) { + cprintf("MemOpDone:%i\n", (*inst_list_it)->memOpDone); + } + + cprintf("\n"); + + inst_list_it--; + ++num; + } + cprintf("Waiting list size: %i\n", waitingList.size()); inst_list_it = --(waitingList.end()); diff --git a/src/cpu/ozone/lw_lsq.hh b/src/cpu/ozone/lw_lsq.hh index 9a21a9d01..6640a9f34 100644 --- a/src/cpu/ozone/lw_lsq.hh +++ b/src/cpu/ozone/lw_lsq.hh @@ -84,6 +84,8 @@ class OzoneLWLSQ { /** Returns the name of the LSQ unit. */ std::string name() const; + void regStats(); + /** Sets the CPU pointer. */ void setCPU(OzoneCPU *cpu_ptr); @@ -179,7 +181,7 @@ class OzoneLWLSQ { int numLoads() { return loads; } /** Returns the number of stores in the SQ. */ - int numStores() { return stores; } + int numStores() { return stores + storesInFlight; } /** Returns if either the LQ or SQ is full. */ bool isFull() { return lqFull() || sqFull(); } @@ -188,7 +190,7 @@ class OzoneLWLSQ { bool lqFull() { return loads >= (LQEntries - 1); } /** Returns if the SQ is full. */ - bool sqFull() { return stores >= (SQEntries - 1); } + bool sqFull() { return (stores + storesInFlight) >= (SQEntries - 1); } /** Debugging function to dump instructions in the LSQ. */ void dumpInsts(); @@ -223,7 +225,9 @@ class OzoneLWLSQ { void storePostSend(Packet *pkt, DynInstPtr &inst); /** Completes the store at the specified index. */ - void completeStore(int store_idx); + void completeStore(DynInstPtr &inst); + + void removeStore(int store_idx); /** Handles doing the retry. */ void recvRetry(); @@ -394,6 +398,10 @@ class OzoneLWLSQ { int storesToWB; + public: + int storesInFlight; + + private: /// @todo Consider moving to a more advanced model with write vs read ports /** The number of cache ports available each cycle. */ int cachePorts; @@ -403,6 +411,9 @@ class OzoneLWLSQ { //list<InstSeqNum> mshrSeqNums; + /** Tota number of memory ordering violations. */ + Stats::Scalar<> lsqMemOrderViolation; + //Stats::Scalar<> dcacheStallCycles; Counter lastDcacheStall; @@ -525,7 +536,7 @@ OzoneLWLSQ<Impl>::read(RequestPtr req, T &data, int load_idx) store_size = (*sq_it).size; - if (store_size == 0) { + if (store_size == 0 || (*sq_it).committed) { sq_it++; continue; } diff --git a/src/cpu/ozone/lw_lsq_impl.hh b/src/cpu/ozone/lw_lsq_impl.hh index 7eef4b11f..31ffa9d67 100644 --- a/src/cpu/ozone/lw_lsq_impl.hh +++ b/src/cpu/ozone/lw_lsq_impl.hh @@ -132,7 +132,7 @@ OzoneLWLSQ<Impl>::completeDataAccess(PacketPtr pkt) template <class Impl> OzoneLWLSQ<Impl>::OzoneLWLSQ() : switchedOut(false), dcachePort(this), loads(0), stores(0), - storesToWB(0), stalled(false), isStoreBlocked(false), + storesToWB(0), storesInFlight(0), stalled(false), isStoreBlocked(false), isLoadBlocked(false), loadBlockedHandled(false) { } @@ -173,6 +173,11 @@ OzoneLWLSQ<Impl>::name() const template<class Impl> void +OzoneLWLSQ<Impl>::regStats() +{ + lsqMemOrderViolation + .name(name() + ".memOrderViolation") + .desc("Number of memory ordering violations"); OzoneLWLSQ<Impl>::setCPU(OzoneCPU *cpu_ptr) { cpu = cpu_ptr; @@ -321,7 +326,7 @@ unsigned OzoneLWLSQ<Impl>::numFreeEntries() { unsigned free_lq_entries = LQEntries - loads; - unsigned free_sq_entries = SQEntries - stores; + unsigned free_sq_entries = SQEntries - (stores + storesInFlight); // Both the LQ and SQ entries have an extra dummy entry to differentiate // empty/full conditions. Subtract 1 from the free entries. @@ -385,6 +390,9 @@ OzoneLWLSQ<Impl>::executeLoad(DynInstPtr &inst) // Actually probably want the oldest faulting load if (load_fault != NoFault) { DPRINTF(OzoneLSQ, "Load [sn:%lli] has a fault\n", inst->seqNum); + if (!(inst->req->flags & UNCACHEABLE && !inst->isAtCommit())) { + inst->setExecuted(); + } // Maybe just set it as can commit here, although that might cause // some other problems with sending traps to the ROB too quickly. be->instToCommit(inst); @@ -461,6 +469,7 @@ OzoneLWLSQ<Impl>::executeStore(DynInstPtr &store_inst) // A load incorrectly passed this store. Squash and refetch. // For now return a fault to show that it was unsuccessful. memDepViolator = (*lq_it); + ++lsqMemOrderViolation; return TheISA::genMachineCheckFault(); } @@ -553,8 +562,8 @@ OzoneLWLSQ<Impl>::writebackStores() if ((*sq_it).size == 0 && !(*sq_it).completed) { sq_it--; - completeStore(inst->sqIdx); - + removeStore(inst->sqIdx); + completeStore(inst); continue; } @@ -626,6 +635,8 @@ OzoneLWLSQ<Impl>::writebackStores() inst->sqIdx,inst->readPC(), req->paddr, *(req->data), inst->seqNum); + DPRINTF(OzoneLSQ, "StoresInFlight: %i\n", + storesInFlight + 1); if (dcacheInterface) { assert(!req->completionEvent); @@ -687,6 +698,8 @@ OzoneLWLSQ<Impl>::writebackStores() } sq_it--; } + ++storesInFlight; +// removeStore(inst->sqIdx); } else { panic("Must HAVE DCACHE!!!!!\n"); } @@ -704,7 +717,7 @@ void OzoneLWLSQ<Impl>::squash(const InstSeqNum &squashed_num) { DPRINTF(OzoneLSQ, "Squashing until [sn:%lli]!" - "(Loads:%i Stores:%i)\n",squashed_num,loads,stores); + "(Loads:%i Stores:%i)\n",squashed_num,loads,stores+storesInFlight); LQIt lq_it = loadQueue.begin(); @@ -881,7 +894,7 @@ OzoneLWLSQ<Impl>::writeback(DynInstPtr &inst, PacketPtr pkt) template <class Impl> void -OzoneLWLSQ<Impl>::completeStore(int store_idx) +OzoneLWLSQ<Impl>::removeStore(int store_idx) { SQHashIt sq_hash_it = SQItHash.find(store_idx); assert(sq_hash_it != SQItHash.end()); @@ -891,8 +904,6 @@ OzoneLWLSQ<Impl>::completeStore(int store_idx) (*sq_it).completed = true; DynInstPtr inst = (*sq_it).inst; - --storesToWB; - if (isStalled() && inst->seqNum == stallingStoreIsn) { DPRINTF(OzoneLSQ, "Unstalling, stalling store [sn:%lli] " @@ -910,6 +921,13 @@ OzoneLWLSQ<Impl>::completeStore(int store_idx) SQItHash.erase(sq_hash_it); SQIndices.push(inst->sqIdx); storeQueue.erase(sq_it); +} + +template <class Impl> +void +OzoneLWLSQ<Impl>::completeStore(DynInstPtr &inst) +{ + --storesToWB; --stores; inst->setCompleted(); @@ -935,9 +953,14 @@ OzoneLWLSQ<Impl>::switchOut() switchedOut = true; // Clear the queue to free up resources + assert(stores == 0); + assert(storeQueue.empty()); + assert(loads == 0); + assert(loadQueue.empty()); + assert(storesInFlight == 0); storeQueue.clear(); loadQueue.clear(); - loads = stores = storesToWB = 0; + loads = stores = storesToWB = storesInFlight = 0; } template <class Impl> diff --git a/src/cpu/ozone/simple_params.hh b/src/cpu/ozone/simple_params.hh index 11cee716f..3f63d2e1d 100644 --- a/src/cpu/ozone/simple_params.hh +++ b/src/cpu/ozone/simple_params.hh @@ -71,10 +71,11 @@ class SimpleParams : public BaseCPU::Params unsigned cachePorts; unsigned width; + unsigned frontEndLatency; unsigned frontEndWidth; + unsigned backEndLatency; unsigned backEndWidth; unsigned backEndSquashLatency; - unsigned backEndLatency; unsigned maxInstBufferSize; unsigned numPhysicalRegs; unsigned maxOutstandingMemOps; @@ -150,6 +151,7 @@ class SimpleParams : public BaseCPU::Params // unsigned LQEntries; unsigned SQEntries; + bool lsqLimits; // // Memory dependence diff --git a/src/cpu/ozone/thread_state.hh b/src/cpu/ozone/thread_state.hh index 8234cf938..adaa8e71b 100644 --- a/src/cpu/ozone/thread_state.hh +++ b/src/cpu/ozone/thread_state.hh @@ -34,9 +34,12 @@ #include "arch/faults.hh" #include "arch/types.hh" #include "arch/regfile.hh" +#include "base/callback.hh" +#include "base/output.hh" #include "cpu/thread_context.hh" #include "cpu/thread_state.hh" #include "sim/process.hh" +#include "sim/sim_exit.hh" class Event; //class Process; @@ -65,8 +68,21 @@ struct OzoneThreadState : public ThreadState { #if FULL_SYSTEM OzoneThreadState(CPUType *_cpu, int _thread_num) : ThreadState(-1, _thread_num), - intrflag(0), inSyscall(0), trapPending(0) + cpu(_cpu), intrflag(0), inSyscall(0), trapPending(0) { + if (cpu->params->profile) { + profile = new FunctionProfile(cpu->params->system->kernelSymtab); + Callback *cb = + new MakeCallback<OzoneThreadState, + &OzoneThreadState::dumpFuncProfile>(this); + registerExitCallback(cb); + } + + // let's fill with a dummy node for now so we don't get a segfault + // on the first cycle when there's no node available. + static ProfileNode dummyNode; + profileNode = &dummyNode; + profilePC = 3; miscRegFile.clear(); } #else @@ -130,6 +146,14 @@ struct OzoneThreadState : public ThreadState { void setNextPC(uint64_t val) { nextPC = val; } + +#if FULL_SYSTEM + void dumpFuncProfile() + { + std::ostream *os = simout.create(csprintf("profile.%s.dat", cpu->name())); + profile->dump(xcProxy, *os); + } +#endif }; #endif // __CPU_OZONE_THREAD_STATE_HH__ diff --git a/src/cpu/simple/base.cc b/src/cpu/simple/base.cc index f801b93fa..522fe79aa 100644 --- a/src/cpu/simple/base.cc +++ b/src/cpu/simple/base.cc @@ -170,7 +170,7 @@ BaseSimpleCPU::regStats() void BaseSimpleCPU::resetStats() { - startNumInst = numInst; +// startNumInst = numInst; // notIdleFraction = (_status != Idle); } diff --git a/src/cpu/simple_thread.cc b/src/cpu/simple_thread.cc index 5f86cf2b7..4fc47c982 100644 --- a/src/cpu/simple_thread.cc +++ b/src/cpu/simple_thread.cc @@ -162,6 +162,11 @@ SimpleThread::takeOverFrom(ThreadContext *oldContext) if (quiesceEvent) { quiesceEvent->tc = tc; } + + Kernel::Statistics *stats = oldContext->getKernelStats(); + if (stats) { + kernelStats = stats; + } #endif storeCondFailures = 0; diff --git a/src/cpu/thread_state.hh b/src/cpu/thread_state.hh index 6e985054f..5479f8478 100644 --- a/src/cpu/thread_state.hh +++ b/src/cpu/thread_state.hh @@ -32,6 +32,7 @@ #define __CPU_THREAD_STATE_HH__ #include "arch/types.hh" +#include "cpu/profile.hh" #include "cpu/thread_context.hh" #if !FULL_SYSTEM @@ -191,6 +192,21 @@ struct ThreadState { // simulation only; all functional memory accesses should use // one of the FunctionalMemory pointers above. short asid; + +#endif + +#if FULL_SYSTEM + void profileClear() + { + if (profile) + profile->clear(); + } + + void profileSample() + { + if (profile) + profile->sample(profileNode, profilePC); + } #endif /** Current instruction the thread is committing. Only set and |