diff options
Diffstat (limited to 'src/cpu/ozone')
-rw-r--r-- | src/cpu/ozone/checker_builder.cc | 13 | ||||
-rw-r--r-- | src/cpu/ozone/cpu.hh | 36 | ||||
-rw-r--r-- | src/cpu/ozone/cpu_builder.cc | 21 | ||||
-rw-r--r-- | src/cpu/ozone/cpu_impl.hh | 118 | ||||
-rw-r--r-- | src/cpu/ozone/front_end.hh | 7 | ||||
-rw-r--r-- | src/cpu/ozone/front_end_impl.hh | 74 | ||||
-rw-r--r-- | src/cpu/ozone/inorder_back_end_impl.hh | 2 | ||||
-rw-r--r-- | src/cpu/ozone/inst_queue_impl.hh | 8 | ||||
-rw-r--r-- | src/cpu/ozone/lw_back_end.hh | 94 | ||||
-rw-r--r-- | src/cpu/ozone/lw_back_end_impl.hh | 317 | ||||
-rw-r--r-- | src/cpu/ozone/lw_lsq.hh | 19 | ||||
-rw-r--r-- | src/cpu/ozone/lw_lsq_impl.hh | 41 | ||||
-rw-r--r-- | src/cpu/ozone/simple_params.hh | 4 | ||||
-rw-r--r-- | src/cpu/ozone/thread_state.hh | 26 |
14 files changed, 531 insertions, 249 deletions
diff --git a/src/cpu/ozone/checker_builder.cc b/src/cpu/ozone/checker_builder.cc index c372e51d6..99ba3e308 100644 --- a/src/cpu/ozone/checker_builder.cc +++ b/src/cpu/ozone/checker_builder.cc @@ -65,6 +65,8 @@ BEGIN_DECLARE_SIM_OBJECT_PARAMS(OzoneChecker) Param<Counter> max_insts_all_threads; Param<Counter> max_loads_any_thread; Param<Counter> max_loads_all_threads; + Param<Counter> stats_reset_inst; + Param<Tick> progress_interval; #if FULL_SYSTEM SimObjectParam<AlphaITB *> itb; @@ -79,6 +81,7 @@ BEGIN_DECLARE_SIM_OBJECT_PARAMS(OzoneChecker) Param<bool> defer_registration; Param<bool> exitOnError; + Param<bool> updateOnError; Param<bool> warnOnlyOnLoadError; Param<bool> function_trace; Param<Tick> function_trace_start; @@ -95,6 +98,9 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(OzoneChecker) "terminate when any thread reaches this load count"), INIT_PARAM(max_loads_all_threads, "terminate when all threads have reached this load count"), + INIT_PARAM(stats_reset_inst, + "blah"), + INIT_PARAM_DFLT(progress_interval, "CPU Progress Interval", 0), #if FULL_SYSTEM INIT_PARAM(itb, "Instruction TLB"), @@ -110,6 +116,7 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(OzoneChecker) INIT_PARAM(defer_registration, "defer system registration (for sampling)"), INIT_PARAM(exitOnError, "exit on error"), + INIT_PARAM(updateOnError, "Update the checker with the main CPU's state on error"), INIT_PARAM_DFLT(warnOnlyOnLoadError, "warn, but don't exit, if a load " "result errors", false), INIT_PARAM(function_trace, "Enable function trace"), @@ -127,7 +134,9 @@ CREATE_SIM_OBJECT(OzoneChecker) params->max_insts_all_threads = 0; params->max_loads_any_thread = 0; params->max_loads_all_threads = 0; + params->stats_reset_inst = 0; params->exitOnError = exitOnError; + params->updateOnError = updateOnError; params->warnOnlyOnLoadError = warnOnlyOnLoadError; params->deferRegistration = defer_registration; params->functionTrace = function_trace; @@ -140,6 +149,10 @@ CREATE_SIM_OBJECT(OzoneChecker) temp = max_insts_all_threads; temp = max_loads_any_thread; temp = max_loads_all_threads; + temp = stats_reset_inst; + Tick temp2 = progress_interval; + temp2++; + params->progress_interval = 0; #if FULL_SYSTEM params->itb = itb; diff --git a/src/cpu/ozone/cpu.hh b/src/cpu/ozone/cpu.hh index e411c12bd..ece68282f 100644 --- a/src/cpu/ozone/cpu.hh +++ b/src/cpu/ozone/cpu.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005 The Regents of The University of Michigan + * Copyright (c) 2006 The Regents of The University of Michigan * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -81,13 +81,13 @@ template <class> class Checker; /** - * Declaration of Out-of-Order CPU class. Basically it is a SimpleCPU with - * simple out-of-order capabilities added to it. It is still a 1 CPI machine - * (?), but is capable of handling cache misses. Basically it models having - * a ROB/IQ by only allowing a certain amount of instructions to execute while - * the cache miss is outstanding. + * Light weight out of order CPU model that approximates an out of + * order CPU. It is separated into a front end and a back end, with + * the template parameter Impl describing the classes used for each. + * The goal is to be able to specify through the Impl the class to use + * for the front end and back end, with different classes used to + * model different levels of detail. */ - template <class Impl> class OzoneCPU : public BaseCPU { @@ -273,6 +273,7 @@ class OzoneCPU : public BaseCPU typedef OzoneThreadState<Impl> ImplState; private: + // Committed thread state for the OzoneCPU. OzoneThreadState<Impl> thread; public: @@ -310,12 +311,6 @@ class OzoneCPU : public BaseCPU tickEvent.squash(); } - private: - Trace::InstRecord *traceData; - - template<typename T> - void trace_data(T data); - public: enum Status { Running, @@ -326,8 +321,6 @@ class OzoneCPU : public BaseCPU Status _status; public: - bool checkInterrupts; - void post_interrupt(int int_num, int index); void zero_fill_64(Addr addr) { @@ -379,6 +372,7 @@ class OzoneCPU : public BaseCPU FrontEnd *frontEnd; BackEnd *backEnd; + private: Status status() const { return _status; } void setStatus(Status new_status) { _status = new_status; } @@ -410,12 +404,11 @@ class OzoneCPU : public BaseCPU // number of idle cycles Stats::Average<> notIdleFraction; Stats::Formula idleFraction; - public: + public: virtual void serialize(std::ostream &os); virtual void unserialize(Checkpoint *cp, const std::string §ion); - #if FULL_SYSTEM /** Translates instruction requestion. */ Fault translateInstReq(RequestPtr &req, OzoneThreadState<Impl> *thread) @@ -582,12 +575,9 @@ class OzoneCPU : public BaseCPU Fault copy(Addr dest); - InstSeqNum globalSeqNum; - public: void squashFromTC(); - // @todo: This can be a useful debug function. Implement it. void dumpInsts() { frontEnd->dumpInsts(); } #if FULL_SYSTEM @@ -605,7 +595,6 @@ class OzoneCPU : public BaseCPU ThreadContext *tcBase() { return tc; } - bool decoupledFrontEnd; struct CommStruct { InstSeqNum doneSeqNum; InstSeqNum nonSpecSeqNum; @@ -614,8 +603,13 @@ class OzoneCPU : public BaseCPU bool stall; }; + + InstSeqNum globalSeqNum; + TimeBuffer<CommStruct> comm; + bool decoupledFrontEnd; + bool lockFlag; Stats::Scalar<> quiesceCycles; diff --git a/src/cpu/ozone/cpu_builder.cc b/src/cpu/ozone/cpu_builder.cc index e239b7a94..e3e4ec433 100644 --- a/src/cpu/ozone/cpu_builder.cc +++ b/src/cpu/ozone/cpu_builder.cc @@ -63,6 +63,7 @@ SimObjectParam<System *> system; Param<int> cpu_id; SimObjectParam<AlphaITB *> itb; SimObjectParam<AlphaDTB *> dtb; +Param<Tick> profile; #else SimObjectVectorParam<Process *> workload; //SimObjectParam<PageTable *> page_table; @@ -76,16 +77,19 @@ Param<Counter> max_insts_any_thread; Param<Counter> max_insts_all_threads; Param<Counter> max_loads_any_thread; Param<Counter> max_loads_all_threads; +Param<Counter> stats_reset_inst; +Param<Tick> progress_interval; //SimObjectParam<BaseCache *> icache; //SimObjectParam<BaseCache *> dcache; Param<unsigned> cachePorts; Param<unsigned> width; +Param<unsigned> frontEndLatency; Param<unsigned> frontEndWidth; +Param<unsigned> backEndLatency; Param<unsigned> backEndWidth; Param<unsigned> backEndSquashLatency; -Param<unsigned> backEndLatency; Param<unsigned> maxInstBufferSize; Param<unsigned> numPhysicalRegs; Param<unsigned> maxOutstandingMemOps; @@ -140,6 +144,7 @@ Param<unsigned> RASSize; Param<unsigned> LQEntries; Param<unsigned> SQEntries; +Param<bool> lsqLimits; Param<unsigned> LFSTSize; Param<unsigned> SSITSize; @@ -181,6 +186,7 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivOzoneCPU) INIT_PARAM(cpu_id, "processor ID"), INIT_PARAM(itb, "Instruction translation buffer"), INIT_PARAM(dtb, "Data translation buffer"), + INIT_PARAM(profile, ""), #else INIT_PARAM(workload, "Processes to run"), // INIT_PARAM(page_table, "Page table"), @@ -204,16 +210,21 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivOzoneCPU) "Terminate when all threads have reached this load" "count", 0), + INIT_PARAM_DFLT(stats_reset_inst, + "blah", + 0), + INIT_PARAM_DFLT(progress_interval, "Progress interval", 0), // INIT_PARAM_DFLT(icache, "L1 instruction cache", NULL), // INIT_PARAM_DFLT(dcache, "L1 data cache", NULL), INIT_PARAM_DFLT(cachePorts, "Cache Ports", 200), INIT_PARAM_DFLT(width, "Width", 1), + INIT_PARAM_DFLT(frontEndLatency, "Front end latency", 1), INIT_PARAM_DFLT(frontEndWidth, "Front end width", 1), + INIT_PARAM_DFLT(backEndLatency, "Back end latency", 1), INIT_PARAM_DFLT(backEndWidth, "Back end width", 1), INIT_PARAM_DFLT(backEndSquashLatency, "Back end squash latency", 1), - INIT_PARAM_DFLT(backEndLatency, "Back end latency", 1), INIT_PARAM_DFLT(maxInstBufferSize, "Maximum instruction buffer size", 16), INIT_PARAM(numPhysicalRegs, "Number of physical registers"), INIT_PARAM_DFLT(maxOutstandingMemOps, "Maximum outstanding memory operations", 4), @@ -274,6 +285,7 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivOzoneCPU) INIT_PARAM(LQEntries, "Number of load queue entries"), INIT_PARAM(SQEntries, "Number of store queue entries"), + INIT_PARAM_DFLT(lsqLimits, "LSQ size limits dispatch", true), INIT_PARAM(LFSTSize, "Last fetched store table size"), INIT_PARAM(SSITSize, "Store set ID table size"), @@ -336,6 +348,7 @@ CREATE_SIM_OBJECT(DerivOzoneCPU) params->cpu_id = cpu_id; params->itb = itb; params->dtb = dtb; + params->profile = profile; #else params->workload = workload; // params->pTable = page_table; @@ -347,6 +360,8 @@ CREATE_SIM_OBJECT(DerivOzoneCPU) params->max_insts_all_threads = max_insts_all_threads; params->max_loads_any_thread = max_loads_any_thread; params->max_loads_all_threads = max_loads_all_threads; + params->stats_reset_inst = stats_reset_inst; + params->progress_interval = progress_interval; // // Caches @@ -357,6 +372,7 @@ CREATE_SIM_OBJECT(DerivOzoneCPU) params->width = width; params->frontEndWidth = frontEndWidth; + params->frontEndLatency = frontEndLatency; params->backEndWidth = backEndWidth; params->backEndSquashLatency = backEndSquashLatency; params->backEndLatency = backEndLatency; @@ -414,6 +430,7 @@ CREATE_SIM_OBJECT(DerivOzoneCPU) params->LQEntries = LQEntries; params->SQEntries = SQEntries; + params->lsqLimits = lsqLimits; params->SSITSize = SSITSize; params->LFSTSize = LFSTSize; diff --git a/src/cpu/ozone/cpu_impl.hh b/src/cpu/ozone/cpu_impl.hh index 80f18434c..5c8b5001d 100644 --- a/src/cpu/ozone/cpu_impl.hh +++ b/src/cpu/ozone/cpu_impl.hh @@ -50,7 +50,6 @@ #include "arch/alpha/types.hh" #include "arch/vtophys.hh" #include "base/callback.hh" -//#include "base/remote_gdb.hh" #include "cpu/profile.hh" #include "kern/kernel_stats.hh" #include "sim/faults.hh" @@ -68,15 +67,6 @@ using namespace TheISA; template <class Impl> -template<typename T> -void -OzoneCPU<Impl>::trace_data(T data) { - if (traceData) { - traceData->setData(data); - } -} - -template <class Impl> OzoneCPU<Impl>::TickEvent::TickEvent(OzoneCPU *c, int w) : Event(&mainEventQueue, CPU_Tick_Pri), cpu(c), width(w) { @@ -112,7 +102,7 @@ OzoneCPU<Impl>::OzoneCPU(Params *p) _status = Idle; if (p->checker) { -#if USE_CHECKER + BaseCPU *temp_checker = p->checker; checker = dynamic_cast<Checker<DynInstPtr> *>(temp_checker); checker->setMemory(mem); @@ -126,6 +116,8 @@ OzoneCPU<Impl>::OzoneCPU(Params *p) panic("Checker enabled but not compiled in!"); #endif } else { + // If checker is not being used, then the xcProxy points + // directly to the CPU's ExecContext. checker = NULL; thread.tc = &ozoneTC; tc = &ozoneTC; @@ -138,7 +130,7 @@ OzoneCPU<Impl>::OzoneCPU(Params *p) thread.setStatus(ThreadContext::Suspended); #if FULL_SYSTEM - /***** All thread state stuff *****/ + // Setup thread state stuff. thread.cpu = this; thread.setTid(0); @@ -187,12 +179,15 @@ OzoneCPU<Impl>::OzoneCPU(Params *p) frontEnd->setBackEnd(backEnd); backEnd->setFrontEnd(frontEnd); - decoupledFrontEnd = p->decoupledFrontEnd; - globalSeqNum = 1; +#if FULL_SYSTEM checkInterrupts = false; +#endif + + lockFlag = 0; + // Setup rename table, initializing all values to ready. for (int i = 0; i < TheISA::TotalNumRegs; ++i) { thread.renameTable[i] = new DynInst(this); thread.renameTable[i]->setResultReady(); @@ -233,8 +228,6 @@ OzoneCPU<Impl>::OzoneCPU(Params *p) thread.setVirtPort(virt_port); #endif - lockFlag = 0; - DPRINTF(OzoneCPU, "OzoneCPU: Created Ozone cpu object.\n"); } @@ -247,6 +240,7 @@ template <class Impl> void OzoneCPU<Impl>::switchOut() { + BaseCPU::switchOut(_sampler); switchCount = 0; // Front end needs state from back end, so switch out the back end first. backEnd->switchOut(); @@ -257,6 +251,8 @@ template <class Impl> void OzoneCPU<Impl>::signalSwitched() { + // Only complete the switchout when both the front end and back + // end have signalled they are ready to switch. if (++switchCount == 2) { backEnd->doSwitchOut(); frontEnd->doSwitchOut(); @@ -266,6 +262,17 @@ OzoneCPU<Impl>::signalSwitched() #endif _status = SwitchedOut; +#ifndef NDEBUG + // Loop through all registers + for (int i = 0; i < AlphaISA::TotalNumRegs; ++i) { + assert(thread.renameTable[i] == frontEnd->renameTable[i]); + + assert(thread.renameTable[i] == backEnd->renameTable[i]); + + DPRINTF(OzoneCPU, "Checking if register %i matches.\n", i); + } +#endif + if (tickEvent.scheduled()) tickEvent.squash(); } @@ -278,13 +285,25 @@ OzoneCPU<Impl>::takeOverFrom(BaseCPU *oldCPU) { BaseCPU::takeOverFrom(oldCPU); + thread.trapPending = false; + thread.inSyscall = false; + backEnd->takeOverFrom(); frontEnd->takeOverFrom(); + frontEnd->renameTable.copyFrom(thread.renameTable); + backEnd->renameTable.copyFrom(thread.renameTable); assert(!tickEvent.scheduled()); +#ifndef NDEBUG + // Check rename table. + for (int i = 0; i < TheISA::TotalNumRegs; ++i) { + assert(thread.renameTable[i]->isResultReady()); + } +#endif + // @todo: Fix hardcoded number // Clear out any old information in time buffer. - for (int i = 0; i < 6; ++i) { + for (int i = 0; i < 15; ++i) { comm.advance(); } @@ -316,6 +335,10 @@ OzoneCPU<Impl>::activateContext(int thread_num, int delay) notIdleFraction++; scheduleTickEvent(delay); _status = Running; +#if FULL_SYSTEM + if (thread.quiesceEvent && thread.quiesceEvent->scheduled()) + thread.quiesceEvent->deschedule(); +#endif thread.setStatus(ThreadContext::Active); frontEnd->wakeFromQuiesce(); } @@ -393,7 +416,7 @@ template <class Impl> void OzoneCPU<Impl>::resetStats() { - startNumInst = numInst; +// startNumInst = numInst; notIdleFraction = (_status != Idle); } @@ -441,6 +464,15 @@ OzoneCPU<Impl>::serialize(std::ostream &os) ozoneTC.serialize(os); nameOut(os, csprintf("%s.tickEvent", name())); tickEvent.serialize(os); + + // Use SimpleThread's ability to checkpoint to make it easier to + // write out the registers. Also make this static so it doesn't + // get instantiated multiple times (causes a panic in statistics). + static CPUExecContext temp; + + nameOut(os, csprintf("%s.xc.0", name())); + temp.copyXC(thread.getXCProxy()); + temp.serialize(os); } template <class Impl> @@ -451,6 +483,15 @@ OzoneCPU<Impl>::unserialize(Checkpoint *cp, const std::string §ion) UNSERIALIZE_ENUM(_status); ozoneTC.unserialize(cp, csprintf("%s.tc", section)); tickEvent.unserialize(cp, csprintf("%s.tickEvent", section)); + + // Use SimpleThread's ability to checkpoint to make it easier to + // read in the registers. Also make this static so it doesn't + // get instantiated multiple times (causes a panic in statistics). + static CPUExecContext temp; + + temp.copyXC(thread.getXCProxy()); + temp.unserialize(cp, csprintf("%s.xc.0", section)); + thread.getXCProxy()->copyArchRegs(temp.getProxy()); } template <class Impl> @@ -810,7 +851,9 @@ OzoneCPU<Impl>::OzoneTC::halt() template <class Impl> void OzoneCPU<Impl>::OzoneTC::dumpFuncProfile() -{ } +{ + thread->dumpFuncProfile(); +} #endif template <class Impl> @@ -829,6 +872,7 @@ OzoneCPU<Impl>::OzoneTC::takeOverFrom(ThreadContext *old_context) copyArchRegs(old_context); setCpuId(old_context->readCpuId()); + thread->inst = old_context->getInst(); #if !FULL_SYSTEM setFuncExeInst(old_context->readFuncExeInst()); #else @@ -842,6 +886,7 @@ OzoneCPU<Impl>::OzoneTC::takeOverFrom(ThreadContext *old_context) thread->quiesceEvent->tc = this; } + // Copy kernel stats pointer from old context. thread->kernelStats = old_context->getKernelStats(); // storeCondFailures = 0; cpu->lockFlag = false; @@ -863,7 +908,11 @@ OzoneCPU<Impl>::OzoneTC::regStats(const std::string &name) template <class Impl> void OzoneCPU<Impl>::OzoneTC::serialize(std::ostream &os) -{ } +{ + // Once serialization is added, serialize the quiesce event and + // kernel stats. Will need to make sure there aren't multiple + // things that serialize them. +} template <class Impl> void @@ -896,16 +945,14 @@ template <class Impl> void OzoneCPU<Impl>::OzoneTC::profileClear() { - if (thread->profile) - thread->profile->clear(); + thread->profileClear(); } template <class Impl> void OzoneCPU<Impl>::OzoneTC::profileSample() { - if (thread->profile) - thread->profile->sample(thread->profileNode, thread->profilePC); + thread->profileSample(); } #endif @@ -916,7 +963,6 @@ OzoneCPU<Impl>::OzoneTC::getThreadNum() return thread->readTid(); } -// Also somewhat obnoxious. Really only used for the TLB fault. template <class Impl> TheISA::MachInst OzoneCPU<Impl>::OzoneTC::getInst() @@ -934,14 +980,20 @@ OzoneCPU<Impl>::OzoneTC::copyArchRegs(ThreadContext *tc) cpu->frontEnd->setPC(thread->PC); cpu->frontEnd->setNextPC(thread->nextPC); - for (int i = 0; i < TheISA::TotalNumRegs; ++i) { - if (i < TheISA::FP_Base_DepTag) { - thread->renameTable[i]->setIntResult(tc->readIntReg(i)); - } else if (i < (TheISA::FP_Base_DepTag + TheISA::NumFloatRegs)) { - int fp_idx = i - TheISA::FP_Base_DepTag; - thread->renameTable[i]->setDoubleResult( - tc->readFloatReg(fp_idx, 64)); - } + // First loop through the integer registers. + for (int i = 0; i < TheISA::NumIntRegs; ++i) { +/* DPRINTF(OzoneCPU, "Copying over register %i, had data %lli, " + "now has data %lli.\n", + i, thread->renameTable[i]->readIntResult(), + tc->readIntReg(i)); +*/ + thread->renameTable[i]->setIntResult(tc->readIntReg(i)); + } + + // Then loop through the floating point registers. + for (int i = 0; i < TheISA::NumFloatRegs; ++i) { + int fp_idx = i + TheISA::FP_Base_DepTag; + thread->renameTable[fp_idx]->setIntResult(tc->readFloatRegBits(i)); } #if !FULL_SYSTEM diff --git a/src/cpu/ozone/front_end.hh b/src/cpu/ozone/front_end.hh index 3ed3c4d18..5ffd3666e 100644 --- a/src/cpu/ozone/front_end.hh +++ b/src/cpu/ozone/front_end.hh @@ -34,6 +34,7 @@ #include <deque> #include "arch/utility.hh" +#include "base/timebuf.hh" #include "cpu/inst_seq.hh" #include "cpu/o3/bpred_unit.hh" #include "cpu/ozone/rename_table.hh" @@ -246,15 +247,21 @@ class FrontEnd void dumpInsts(); private: + TimeBuffer<int> numInstsReady; + typedef typename std::deque<DynInstPtr> InstBuff; typedef typename InstBuff::iterator InstBuffIt; + InstBuff feBuffer; + InstBuff instBuffer; int instBufferSize; int maxInstBufferSize; + int latency; + int width; int freeRegs; diff --git a/src/cpu/ozone/front_end_impl.hh b/src/cpu/ozone/front_end_impl.hh index 1b120460a..d34716de6 100644 --- a/src/cpu/ozone/front_end_impl.hh +++ b/src/cpu/ozone/front_end_impl.hh @@ -92,8 +92,10 @@ FrontEnd<Impl>::FrontEnd(Params *params) : branchPred(params), icachePort(this), mem(params->mem), + numInstsReady(params->frontEndLatency, 0), instBufferSize(0), maxInstBufferSize(params->maxInstBufferSize), + latency(params->frontEndLatency), width(params->frontEndWidth), freeRegs(params->numPhysicalRegs), numPhysRegs(params->numPhysicalRegs), @@ -326,6 +328,18 @@ FrontEnd<Impl>::tick() if (switchedOut) return; + for (int insts_to_queue = numInstsReady[-latency]; + !instBuffer.empty() && insts_to_queue; + --insts_to_queue) + { + DPRINTF(FE, "Transferring instruction [sn:%lli] to the feBuffer\n", + instBuffer.front()->seqNum); + feBuffer.push_back(instBuffer.front()); + instBuffer.pop_front(); + } + + numInstsReady.advance(); + // @todo: Maybe I want to just have direct communication... if (fromCommit->doneSeqNum) { branchPred.update(fromCommit->doneSeqNum, 0); @@ -339,8 +353,8 @@ FrontEnd<Impl>::tick() cacheBlkValid = true; status = Running; - if (barrierInst) - status = SerializeBlocked; +// if (barrierInst) +// status = SerializeBlocked; if (freeRegs <= 0) status = RenameBlocked; checkBE(); @@ -414,11 +428,12 @@ FrontEnd<Impl>::tick() // latency instBuffer.push_back(inst); ++instBufferSize; + numInstsReady[0]++; ++num_inst; #if FULL_SYSTEM if (inst->isQuiesce()) { - warn("%lli: Quiesce instruction encountered, halting fetch!", curTick); +// warn("%lli: Quiesce instruction encountered, halting fetch!", curTick); status = QuiescePending; break; } @@ -572,10 +587,10 @@ FrontEnd<Impl>::processBarriers(DynInstPtr &inst) // Change status over to SerializeBlocked so that other stages know // what this is blocked on. - status = SerializeBlocked; +// status = SerializeBlocked; - barrierInst = inst; - return true; +// barrierInst = inst; +// return true; } else if ((inst->isStoreConditional() || inst->isSerializeAfter()) && !inst->isSerializeHandled()) { DPRINTF(FE, "Serialize after instruction encountered.\n"); @@ -620,6 +635,7 @@ FrontEnd<Impl>::handleFault(Fault &fault) instruction->fault = fault; instruction->setCanIssue(); instBuffer.push_back(instruction); + numInstsReady[0]++; ++instBufferSize; } @@ -649,6 +665,21 @@ FrontEnd<Impl>::squash(const InstSeqNum &squash_num, const Addr &next_PC, freeRegs+= inst->numDestRegs(); } + while (!feBuffer.empty() && + feBuffer.back()->seqNum > squash_num) { + DynInstPtr inst = feBuffer.back(); + + DPRINTF(FE, "Squashing instruction [sn:%lli] PC %#x\n", + inst->seqNum, inst->readPC()); + + inst->clearDependents(); + + feBuffer.pop_back(); + --instBufferSize; + + freeRegs+= inst->numDestRegs(); + } + // Copy over rename table from the back end. renameTable.copyFrom(backEnd->renameTable); @@ -666,12 +697,12 @@ FrontEnd<Impl>::squash(const InstSeqNum &squash_num, const Addr &next_PC, DPRINTF(FE, "Squashing outstanding Icache access.\n"); memReq = NULL; } - +/* if (status == SerializeBlocked) { assert(barrierInst->seqNum > squash_num); barrierInst = NULL; } - +*/ // Unless this squash originated from the front end, we're probably // in running mode now. // Actually might want to make this latency dependent. @@ -683,13 +714,22 @@ template <class Impl> typename Impl::DynInstPtr FrontEnd<Impl>::getInst() { - if (instBufferSize == 0) { + if (feBuffer.empty()) { return NULL; } - DynInstPtr inst = instBuffer.front(); + DynInstPtr inst = feBuffer.front(); - instBuffer.pop_front(); + if (inst->isSerializeBefore() || inst->isIprAccess()) { + DPRINTF(FE, "Back end is getting a serialize before inst\n"); + if (!backEnd->robEmpty()) { + DPRINTF(FE, "Rob is not empty yet, not returning inst\n"); + return NULL; + } + inst->clearSerializeBefore(); + } + + feBuffer.pop_front(); --instBufferSize; @@ -784,11 +824,11 @@ FrontEnd<Impl>::updateStatus() } if (status == BEBlocked && !be_block) { - if (barrierInst) { - status = SerializeBlocked; - } else { +// if (barrierInst) { +// status = SerializeBlocked; +// } else { status = Running; - } +// } ret_val = true; } return ret_val; @@ -810,6 +850,7 @@ template <class Impl> typename Impl::DynInstPtr FrontEnd<Impl>::getInstFromCacheline() { +/* if (status == SerializeComplete) { DynInstPtr inst = barrierInst; status = Running; @@ -817,7 +858,7 @@ FrontEnd<Impl>::getInstFromCacheline() inst->clearSerializeBefore(); return inst; } - +*/ InstSeqNum inst_seq; MachInst inst; // @todo: Fix this magic number used here to handle word offset (and @@ -932,6 +973,7 @@ FrontEnd<Impl>::doSwitchOut() squash(0, 0); instBuffer.clear(); instBufferSize = 0; + feBuffer.clear(); status = Idle; } diff --git a/src/cpu/ozone/inorder_back_end_impl.hh b/src/cpu/ozone/inorder_back_end_impl.hh index 701fc0ee9..16ebac163 100644 --- a/src/cpu/ozone/inorder_back_end_impl.hh +++ b/src/cpu/ozone/inorder_back_end_impl.hh @@ -284,7 +284,7 @@ InorderBackEnd<Impl>::executeInsts() } inst->setExecuted(); - inst->setCompleted(); + inst->setResultReady(); inst->setCanCommit(); instList.pop_front(); diff --git a/src/cpu/ozone/inst_queue_impl.hh b/src/cpu/ozone/inst_queue_impl.hh index f2d80e621..32a940241 100644 --- a/src/cpu/ozone/inst_queue_impl.hh +++ b/src/cpu/ozone/inst_queue_impl.hh @@ -850,13 +850,13 @@ template <class Impl> void InstQueue<Impl>::addReadyMemInst(DynInstPtr &ready_inst) { - OpClass op_class = ready_inst->opClass(); +// OpClass op_class = ready_inst->opClass(); readyInsts.push(ready_inst); DPRINTF(IQ, "Instruction is ready to issue, putting it onto " "the ready list, PC %#x opclass:%i [sn:%lli].\n", - ready_inst->readPC(), op_class, ready_inst->seqNum); + ready_inst->readPC(), ready_inst->opClass(), ready_inst->seqNum); } /* template <class Impl> @@ -1177,11 +1177,11 @@ InstQueue<Impl>::addIfReady(DynInstPtr &inst) return; } - OpClass op_class = inst->opClass(); +// OpClass op_class = inst->opClass(); DPRINTF(IQ, "Instruction is ready to issue, putting it onto " "the ready list, PC %#x opclass:%i [sn:%lli].\n", - inst->readPC(), op_class, inst->seqNum); + inst->readPC(), inst->opClass(), inst->seqNum); readyInsts.push(inst); } diff --git a/src/cpu/ozone/lw_back_end.hh b/src/cpu/ozone/lw_back_end.hh index d836ceebd..49c6a1ae2 100644 --- a/src/cpu/ozone/lw_back_end.hh +++ b/src/cpu/ozone/lw_back_end.hh @@ -80,7 +80,7 @@ class LWBackEnd TimeBuffer<IssueToExec> i2e; typename TimeBuffer<IssueToExec>::wire instsToExecute; TimeBuffer<ExecToCommit> e2c; - TimeBuffer<Writeback> numInstsToWB; + TimeBuffer<int> numInstsToWB; TimeBuffer<CommStruct> *comm; typename TimeBuffer<CommStruct>::wire toIEW; @@ -139,7 +139,7 @@ class LWBackEnd Tick lastCommitCycle; - bool robEmpty() { return instList.empty(); } + bool robEmpty() { return numInsts == 0; } bool isFull() { return numInsts >= numROBEntries; } bool isBlocked() { return status == Blocked || dispatchStatus == Blocked; } @@ -194,6 +194,7 @@ class LWBackEnd } void instToCommit(DynInstPtr &inst); + void readyInstsForCommit(); void switchOut(); void doSwitchOut(); @@ -255,12 +256,13 @@ class LWBackEnd RenameTable<Impl> renameTable; private: + int latency; + // General back end width. Used if the more specific isn't given. int width; // Dispatch width. int dispatchWidth; - int numDispatchEntries; int dispatchSize; int waitingInsts; @@ -285,6 +287,7 @@ class LWBackEnd int numROBEntries; int numInsts; + bool lsqLimits; std::set<InstSeqNum> waitingMemOps; typedef std::set<InstSeqNum>::iterator MemIt; @@ -295,9 +298,6 @@ class LWBackEnd InstSeqNum squashSeqNum; Addr squashNextPC; - Fault faultFromFetch; - bool fetchHasFault; - bool switchedOut; bool switchPending; @@ -321,8 +321,6 @@ class LWBackEnd std::list<DynInstPtr> replayList; std::list<DynInstPtr> writeback; - int latency; - int squashLatency; bool exactFullStall; @@ -331,37 +329,39 @@ class LWBackEnd /* Stats::Scalar<> dcacheStallCycles; Counter lastDcacheStall; */ - Stats::Vector<> rob_cap_events; - Stats::Vector<> rob_cap_inst_count; - Stats::Vector<> iq_cap_events; - Stats::Vector<> iq_cap_inst_count; + Stats::Vector<> robCapEvents; + Stats::Vector<> robCapInstCount; + Stats::Vector<> iqCapEvents; + Stats::Vector<> iqCapInstCount; // total number of instructions executed - Stats::Vector<> exe_inst; - Stats::Vector<> exe_swp; - Stats::Vector<> exe_nop; - Stats::Vector<> exe_refs; - Stats::Vector<> exe_loads; - Stats::Vector<> exe_branches; + Stats::Vector<> exeInst; + Stats::Vector<> exeSwp; + Stats::Vector<> exeNop; + Stats::Vector<> exeRefs; + Stats::Vector<> exeLoads; + Stats::Vector<> exeBranches; - Stats::Vector<> issued_ops; + Stats::Vector<> issuedOps; // total number of loads forwaded from LSQ stores - Stats::Vector<> lsq_forw_loads; + Stats::Vector<> lsqForwLoads; // total number of loads ignored due to invalid addresses - Stats::Vector<> inv_addr_loads; + Stats::Vector<> invAddrLoads; // total number of software prefetches ignored due to invalid addresses - Stats::Vector<> inv_addr_swpfs; + Stats::Vector<> invAddrSwpfs; // ready loads blocked due to memory disambiguation - Stats::Vector<> lsq_blocked_loads; + Stats::Vector<> lsqBlockedLoads; Stats::Scalar<> lsqInversion; - Stats::Vector<> n_issued_dist; - Stats::VectorDistribution<> issue_delay_dist; + Stats::Vector<> nIssuedDist; +/* + Stats::VectorDistribution<> issueDelayDist; - Stats::VectorDistribution<> queue_res_dist; + Stats::VectorDistribution<> queueResDist; +*/ /* Stats::Vector<> stat_fu_busy; Stats::Vector2d<> stat_fuBusy; @@ -379,37 +379,37 @@ class LWBackEnd Stats::Formula commit_ipb; Stats::Formula lsq_inv_rate; */ - Stats::Vector<> writeback_count; - Stats::Vector<> producer_inst; - Stats::Vector<> consumer_inst; - Stats::Vector<> wb_penalized; + Stats::Vector<> writebackCount; + Stats::Vector<> producerInst; + Stats::Vector<> consumerInst; + Stats::Vector<> wbPenalized; - Stats::Formula wb_rate; - Stats::Formula wb_fanout; - Stats::Formula wb_penalized_rate; + Stats::Formula wbRate; + Stats::Formula wbFanout; + Stats::Formula wbPenalizedRate; // total number of instructions committed - Stats::Vector<> stat_com_inst; - Stats::Vector<> stat_com_swp; - Stats::Vector<> stat_com_refs; - Stats::Vector<> stat_com_loads; - Stats::Vector<> stat_com_membars; - Stats::Vector<> stat_com_branches; + Stats::Vector<> statComInst; + Stats::Vector<> statComSwp; + Stats::Vector<> statComRefs; + Stats::Vector<> statComLoads; + Stats::Vector<> statComMembars; + Stats::Vector<> statComBranches; - Stats::Distribution<> n_committed_dist; + Stats::Distribution<> nCommittedDist; - Stats::Scalar<> commit_eligible_samples; - Stats::Vector<> commit_eligible; + Stats::Scalar<> commitEligibleSamples; + Stats::Vector<> commitEligible; Stats::Vector<> squashedInsts; Stats::Vector<> ROBSquashedInsts; - Stats::Scalar<> ROB_fcount; - Stats::Formula ROB_full_rate; + Stats::Scalar<> ROBFcount; + Stats::Formula ROBFullRate; - Stats::Vector<> ROB_count; // cumulative ROB occupancy - Stats::Formula ROB_occ_rate; - Stats::VectorDistribution<> ROB_occ_dist; + Stats::Vector<> ROBCount; // cumulative ROB occupancy + Stats::Formula ROBOccRate; +// Stats::VectorDistribution<> ROBOccDist; public: void dumpInsts(); diff --git a/src/cpu/ozone/lw_back_end_impl.hh b/src/cpu/ozone/lw_back_end_impl.hh index a4f1d805e..f87a2bc57 100644 --- a/src/cpu/ozone/lw_back_end_impl.hh +++ b/src/cpu/ozone/lw_back_end_impl.hh @@ -141,13 +141,14 @@ LWBackEnd<Impl>::replayMemInst(DynInstPtr &inst) template <class Impl> LWBackEnd<Impl>::LWBackEnd(Params *params) - : d2i(5, 5), i2e(5, 5), e2c(5, 5), numInstsToWB(5, 5), + : d2i(5, 5), i2e(5, 5), e2c(5, 5), numInstsToWB(params->backEndLatency, 0), trapSquash(false), tcSquash(false), - width(params->backEndWidth), exactFullStall(true) + latency(params->backEndLatency), + width(params->backEndWidth), lsqLimits(params->lsqLimits), + exactFullStall(true) { numROBEntries = params->numROBEntries; numInsts = 0; - numDispatchEntries = 32; maxOutstandingMemOps = params->maxOutstandingMemOps; numWaitingMemOps = 0; waitingInsts = 0; @@ -184,78 +185,79 @@ void LWBackEnd<Impl>::regStats() { using namespace Stats; - rob_cap_events + LSQ.regStats(); + + robCapEvents .init(cpu->number_of_threads) .name(name() + ".ROB:cap_events") .desc("number of cycles where ROB cap was active") .flags(total) ; - rob_cap_inst_count + robCapInstCount .init(cpu->number_of_threads) .name(name() + ".ROB:cap_inst") .desc("number of instructions held up by ROB cap") .flags(total) ; - iq_cap_events + iqCapEvents .init(cpu->number_of_threads) .name(name() +".IQ:cap_events" ) .desc("number of cycles where IQ cap was active") .flags(total) ; - iq_cap_inst_count + iqCapInstCount .init(cpu->number_of_threads) .name(name() + ".IQ:cap_inst") .desc("number of instructions held up by IQ cap") .flags(total) ; - - exe_inst + exeInst .init(cpu->number_of_threads) .name(name() + ".ISSUE:count") .desc("number of insts issued") .flags(total) ; - exe_swp + exeSwp .init(cpu->number_of_threads) .name(name() + ".ISSUE:swp") .desc("number of swp insts issued") .flags(total) ; - exe_nop + exeNop .init(cpu->number_of_threads) .name(name() + ".ISSUE:nop") .desc("number of nop insts issued") .flags(total) ; - exe_refs + exeRefs .init(cpu->number_of_threads) .name(name() + ".ISSUE:refs") .desc("number of memory reference insts issued") .flags(total) ; - exe_loads + exeLoads .init(cpu->number_of_threads) .name(name() + ".ISSUE:loads") .desc("number of load insts issued") .flags(total) ; - exe_branches + exeBranches .init(cpu->number_of_threads) .name(name() + ".ISSUE:branches") .desc("Number of branches issued") .flags(total) ; - issued_ops + issuedOps .init(cpu->number_of_threads) .name(name() + ".ISSUE:op_count") .desc("number of insts issued") @@ -272,28 +274,28 @@ LWBackEnd<Impl>::regStats() // // Other stats // - lsq_forw_loads + lsqForwLoads .init(cpu->number_of_threads) .name(name() + ".LSQ:forw_loads") .desc("number of loads forwarded via LSQ") .flags(total) ; - inv_addr_loads + invAddrLoads .init(cpu->number_of_threads) .name(name() + ".ISSUE:addr_loads") .desc("number of invalid-address loads") .flags(total) ; - inv_addr_swpfs + invAddrSwpfs .init(cpu->number_of_threads) .name(name() + ".ISSUE:addr_swpfs") .desc("number of invalid-address SW prefetches") .flags(total) ; - lsq_blocked_loads + lsqBlockedLoads .init(cpu->number_of_threads) .name(name() + ".LSQ:blocked_loads") .desc("number of ready loads not issued due to memory disambiguation") @@ -305,51 +307,52 @@ LWBackEnd<Impl>::regStats() .desc("Number of times LSQ instruction issued early") ; - n_issued_dist + nIssuedDist .init(issueWidth + 1) .name(name() + ".ISSUE:issued_per_cycle") .desc("Number of insts issued each cycle") .flags(total | pdf | dist) ; - issue_delay_dist +/* + issueDelayDist .init(Num_OpClasses,0,99,2) .name(name() + ".ISSUE:") .desc("cycles from operands ready to issue") .flags(pdf | cdf) ; - queue_res_dist + queueResDist .init(Num_OpClasses, 0, 99, 2) .name(name() + ".IQ:residence:") .desc("cycles from dispatch to issue") .flags(total | pdf | cdf ) ; for (int i = 0; i < Num_OpClasses; ++i) { - queue_res_dist.subname(i, opClassStrings[i]); + queueResDist.subname(i, opClassStrings[i]); } - - writeback_count +*/ + writebackCount .init(cpu->number_of_threads) .name(name() + ".WB:count") .desc("cumulative count of insts written-back") .flags(total) ; - producer_inst + producerInst .init(cpu->number_of_threads) .name(name() + ".WB:producers") .desc("num instructions producing a value") .flags(total) ; - consumer_inst + consumerInst .init(cpu->number_of_threads) .name(name() + ".WB:consumers") .desc("num instructions consuming a value") .flags(total) ; - wb_penalized + wbPenalized .init(cpu->number_of_threads) .name(name() + ".WB:penalized") .desc("number of instrctions required to write to 'other' IQ") @@ -357,71 +360,71 @@ LWBackEnd<Impl>::regStats() ; - wb_penalized_rate + wbPenalizedRate .name(name() + ".WB:penalized_rate") .desc ("fraction of instructions written-back that wrote to 'other' IQ") .flags(total) ; - wb_penalized_rate = wb_penalized / writeback_count; + wbPenalizedRate = wbPenalized / writebackCount; - wb_fanout + wbFanout .name(name() + ".WB:fanout") .desc("average fanout of values written-back") .flags(total) ; - wb_fanout = producer_inst / consumer_inst; + wbFanout = producerInst / consumerInst; - wb_rate + wbRate .name(name() + ".WB:rate") .desc("insts written-back per cycle") .flags(total) ; - wb_rate = writeback_count / cpu->numCycles; + wbRate = writebackCount / cpu->numCycles; - stat_com_inst + statComInst .init(cpu->number_of_threads) .name(name() + ".COM:count") .desc("Number of instructions committed") .flags(total) ; - stat_com_swp + statComSwp .init(cpu->number_of_threads) .name(name() + ".COM:swp_count") .desc("Number of s/w prefetches committed") .flags(total) ; - stat_com_refs + statComRefs .init(cpu->number_of_threads) .name(name() + ".COM:refs") .desc("Number of memory references committed") .flags(total) ; - stat_com_loads + statComLoads .init(cpu->number_of_threads) .name(name() + ".COM:loads") .desc("Number of loads committed") .flags(total) ; - stat_com_membars + statComMembars .init(cpu->number_of_threads) .name(name() + ".COM:membars") .desc("Number of memory barriers committed") .flags(total) ; - stat_com_branches + statComBranches .init(cpu->number_of_threads) .name(name() + ".COM:branches") .desc("Number of branches committed") .flags(total) ; - n_committed_dist + nCommittedDist .init(0,commitWidth,1) .name(name() + ".COM:committed_per_cycle") .desc("Number of insts commited each cycle") @@ -441,14 +444,14 @@ LWBackEnd<Impl>::regStats() // -> The standard deviation is computed only over cycles where // we reached the BW limit // - commit_eligible + commitEligible .init(cpu->number_of_threads) .name(name() + ".COM:bw_limited") .desc("number of insts not committed due to BW limits") .flags(total) ; - commit_eligible_samples + commitEligibleSamples .name(name() + ".COM:bw_lim_events") .desc("number cycles where commit BW limit reached") ; @@ -465,37 +468,38 @@ LWBackEnd<Impl>::regStats() .desc("Number of instructions removed from inst list when they reached the head of the ROB") ; - ROB_fcount + ROBFcount .name(name() + ".ROB:full_count") .desc("number of cycles where ROB was full") ; - ROB_count + ROBCount .init(cpu->number_of_threads) .name(name() + ".ROB:occupancy") .desc(name() + ".ROB occupancy (cumulative)") .flags(total) ; - ROB_full_rate + ROBFullRate .name(name() + ".ROB:full_rate") .desc("ROB full per cycle") ; - ROB_full_rate = ROB_fcount / cpu->numCycles; + ROBFullRate = ROBFcount / cpu->numCycles; - ROB_occ_rate + ROBOccRate .name(name() + ".ROB:occ_rate") .desc("ROB occupancy rate") .flags(total) ; - ROB_occ_rate = ROB_count / cpu->numCycles; - - ROB_occ_dist + ROBOccRate = ROBCount / cpu->numCycles; +/* + ROBOccDist .init(cpu->number_of_threads,0,numROBEntries,2) .name(name() + ".ROB:occ_dist") .desc("ROB Occupancy per cycle") .flags(total | cdf) ; +*/ } template <class Impl> @@ -588,17 +592,21 @@ LWBackEnd<Impl>::tick() { DPRINTF(BE, "Ticking back end\n"); + // Read in any done instruction information and update the IQ or LSQ. + updateStructures(); + if (switchPending && robEmpty() && !LSQ.hasStoresToWB()) { cpu->signalSwitched(); return; } - ROB_count[0]+= numInsts; + readyInstsForCommit(); - wbCycle = 0; + numInstsToWB.advance(); - // Read in any done instruction information and update the IQ or LSQ. - updateStructures(); + ROBCount[0]+= numInsts; + + wbCycle = 0; #if FULL_SYSTEM checkInterrupts(); @@ -674,6 +682,10 @@ LWBackEnd<Impl>::dispatchInsts() while (numInsts < numROBEntries && numWaitingMemOps < maxOutstandingMemOps) { // Get instruction from front of time buffer + if (lsqLimits && LSQ.isFull()) { + break; + } + DynInstPtr inst = frontEnd->getInst(); if (!inst) { break; @@ -732,6 +744,7 @@ LWBackEnd<Impl>::dispatchInsts() inst->setIssued(); inst->setExecuted(); inst->setCanCommit(); + numInstsToWB[0]++; } else { DPRINTF(BE, "Instruction [sn:%lli] ready, addding to " "exeList.\n", @@ -866,8 +879,17 @@ LWBackEnd<Impl>::executeInsts() if (inst->isLoad()) { LSQ.executeLoad(inst); } else if (inst->isStore()) { - LSQ.executeStore(inst); - if (inst->req && !(inst->req->getFlags() & LOCKED)) { + Fault fault = LSQ.executeStore(inst); + + if (!inst->isStoreConditional() && fault == NoFault) { + inst->setExecuted(); + + instToCommit(inst); + } else if (fault != NoFault) { + // If the instruction faulted, then we need to send it along to commit + // without the instruction completing. + // Send this instruction to commit, also make sure iew stage + // realizes there is activity. inst->setExecuted(); instToCommit(inst); @@ -908,36 +930,54 @@ LWBackEnd<Impl>::executeInsts() } } - issued_ops[0]+= num_executed; - n_issued_dist[num_executed]++; + issuedOps[0]+= num_executed; + nIssuedDist[num_executed]++; } template<class Impl> void LWBackEnd<Impl>::instToCommit(DynInstPtr &inst) { - DPRINTF(BE, "Sending instructions to commit [sn:%lli] PC %#x.\n", inst->seqNum, inst->readPC()); if (!inst->isSquashed()) { - DPRINTF(BE, "Writing back instruction [sn:%lli] PC %#x.\n", - inst->seqNum, inst->readPC()); - - inst->setCanCommit(); - if (inst->isExecuted()) { inst->setResultReady(); int dependents = wakeDependents(inst); if (dependents) { - producer_inst[0]++; - consumer_inst[0]+= dependents; + producerInst[0]++; + consumerInst[0]+= dependents; } } } - writeback_count[0]++; + writeback.push_back(inst); + + numInstsToWB[0]++; + + writebackCount[0]++; +} + +template <class Impl> +void +LWBackEnd<Impl>::readyInstsForCommit() +{ + for (int i = numInstsToWB[-latency]; + !writeback.empty() && i; + --i) + { + DynInstPtr inst = writeback.front(); + writeback.pop_front(); + if (!inst->isSquashed()) { + DPRINTF(BE, "Writing back instruction [sn:%lli] PC %#x.\n", + inst->seqNum, inst->readPC()); + + inst->setCanCommit(); + } + } } + #if 0 template <class Impl> void @@ -1010,7 +1050,7 @@ LWBackEnd<Impl>::commitInst(int inst_num) // or store inst. Signal backwards that it should be executed. if (!inst->isExecuted()) { if (inst->isNonSpeculative() || - inst->isStoreConditional() || + (inst->isStoreConditional() && inst->getFault() == NoFault) || inst->isMemBarrier() || inst->isWriteBarrier()) { #if !FULL_SYSTEM @@ -1151,6 +1191,20 @@ LWBackEnd<Impl>::commitInst(int inst_num) ++freed_regs; } +#if FULL_SYSTEM + if (thread->profile) { +// bool usermode = +// (xc->readMiscReg(AlphaISA::IPR_DTB_CM) & 0x18) != 0; +// thread->profilePC = usermode ? 1 : inst->readPC(); + thread->profilePC = inst->readPC(); + ProfileNode *node = thread->profile->consume(thread->getXCProxy(), + inst->staticInst); + + if (node) + thread->profileNode = node; + } +#endif + if (inst->traceData) { inst->traceData->setFetchSeq(inst->seqNum); inst->traceData->setCPSeq(thread->numInst); @@ -1158,6 +1212,9 @@ LWBackEnd<Impl>::commitInst(int inst_num) inst->traceData = NULL; } + if (inst->isCopy()) + panic("Should not commit any copy instructions!"); + inst->clearDependents(); frontEnd->addFreeRegs(freed_regs); @@ -1207,9 +1264,9 @@ LWBackEnd<Impl>::commitInsts() while (!instList.empty() && inst_num < commitWidth) { if (instList.back()->isSquashed()) { instList.back()->clearDependents(); + ROBSquashedInsts[instList.back()->threadNumber]++; instList.pop_back(); --numInsts; - ROBSquashedInsts[instList.back()->threadNumber]++; continue; } @@ -1221,7 +1278,7 @@ LWBackEnd<Impl>::commitInsts() break; } } - n_committed_dist.sample(inst_num); + nCommittedDist.sample(inst_num); } template <class Impl> @@ -1231,10 +1288,10 @@ LWBackEnd<Impl>::squash(const InstSeqNum &sn) LSQ.squash(sn); int freed_regs = 0; - InstListIt waiting_list_end = waitingList.end(); + InstListIt insts_end_it = waitingList.end(); InstListIt insts_it = waitingList.begin(); - while (insts_it != waiting_list_end && (*insts_it)->seqNum > sn) + while (insts_it != insts_end_it && (*insts_it)->seqNum > sn) { if ((*insts_it)->isSquashed()) { ++insts_it; @@ -1260,6 +1317,7 @@ LWBackEnd<Impl>::squash(const InstSeqNum &sn) while (!instList.empty() && (*insts_it)->seqNum > sn) { if ((*insts_it)->isSquashed()) { + panic("Instruction should not be already squashed and on list!"); ++insts_it; continue; } @@ -1291,18 +1349,6 @@ LWBackEnd<Impl>::squash(const InstSeqNum &sn) --numInsts; } - insts_it = waitingList.begin(); - while (!waitingList.empty() && insts_it != waitingList.end()) { - if ((*insts_it)->seqNum < sn) { - ++insts_it; - continue; - } - assert((*insts_it)->isSquashed()); - - waitingList.erase(insts_it++); - waitingInsts--; - } - while (memBarrier && memBarrier->seqNum > sn) { DPRINTF(BE, "[sn:%lli] Memory barrier squashed (or previously " "squashed)\n", memBarrier->seqNum); @@ -1320,6 +1366,18 @@ LWBackEnd<Impl>::squash(const InstSeqNum &sn) } } + insts_it = replayList.begin(); + insts_end_it = replayList.end(); + while (!replayList.empty() && insts_it != insts_end_it) { + if ((*insts_it)->seqNum < sn) { + ++insts_it; + continue; + } + assert((*insts_it)->isSquashed()); + + replayList.erase(insts_it++); + } + frontEnd->addFreeRegs(freed_regs); } @@ -1392,14 +1450,6 @@ LWBackEnd<Impl>::squashDueToMemBlocked(DynInstPtr &inst) template <class Impl> void -LWBackEnd<Impl>::fetchFault(Fault &fault) -{ - faultFromFetch = fault; - fetchHasFault = true; -} - -template <class Impl> -void LWBackEnd<Impl>::switchOut() { switchPending = true; @@ -1416,17 +1466,25 @@ LWBackEnd<Impl>::doSwitchOut() // yet written back. assert(robEmpty()); assert(!LSQ.hasStoresToWB()); - + writeback.clear(); + for (int i = 0; i < numInstsToWB.getSize() + 1; ++i) + numInstsToWB.advance(); + +// squash(0); + assert(waitingList.empty()); + assert(instList.empty()); + assert(replayList.empty()); + assert(writeback.empty()); LSQ.switchOut(); - - squash(0); } template <class Impl> void LWBackEnd<Impl>::takeOverFrom(ThreadContext *old_tc) { - switchedOut = false; + assert(!squashPending); + squashSeqNum = 0; + squashNextPC = 0; tcSquash = false; trapSquash = false; @@ -1451,27 +1509,27 @@ LWBackEnd<Impl>::updateExeInstStats(DynInstPtr &inst) // #ifdef TARGET_ALPHA if (inst->isDataPrefetch()) - exe_swp[thread_number]++; + exeSwp[thread_number]++; else - exe_inst[thread_number]++; + exeInst[thread_number]++; #else - exe_inst[thread_number]++; + exeInst[thread_number]++; #endif // // Control operations // if (inst->isControl()) - exe_branches[thread_number]++; + exeBranches[thread_number]++; // // Memory operations // if (inst->isMemRef()) { - exe_refs[thread_number]++; + exeRefs[thread_number]++; if (inst->isLoad()) - exe_loads[thread_number]++; + exeLoads[thread_number]++; } } @@ -1491,33 +1549,33 @@ LWBackEnd<Impl>::updateComInstStats(DynInstPtr &inst) // #ifdef TARGET_ALPHA if (inst->isDataPrefetch()) { - stat_com_swp[tid]++; + statComSwp[tid]++; } else { - stat_com_inst[tid]++; + statComInst[tid]++; } #else - stat_com_inst[tid]++; + statComInst[tid]++; #endif // // Control Instructions // if (inst->isControl()) - stat_com_branches[tid]++; + statComBranches[tid]++; // // Memory references // if (inst->isMemRef()) { - stat_com_refs[tid]++; + statComRefs[tid]++; if (inst->isLoad()) { - stat_com_loads[tid]++; + statComLoads[tid]++; } } if (inst->isMemBarrier()) { - stat_com_membars[tid]++; + statComMembars[tid]++; } } @@ -1569,6 +1627,45 @@ LWBackEnd<Impl>::dumpInsts() ++num; } + inst_list_it = --(writeback.end()); + + cprintf("Writeback list size: %i\n", writeback.size()); + + while (inst_list_it != writeback.end()) + { + cprintf("Instruction:%i\n", + num); + if (!(*inst_list_it)->isSquashed()) { + if (!(*inst_list_it)->isIssued()) { + ++valid_num; + cprintf("Count:%i\n", valid_num); + } else if ((*inst_list_it)->isMemRef() && + !(*inst_list_it)->memOpDone) { + // Loads that have not been marked as executed still count + // towards the total instructions. + ++valid_num; + cprintf("Count:%i\n", valid_num); + } + } + + cprintf("PC:%#x\n[sn:%lli]\n[tid:%i]\n" + "Issued:%i\nSquashed:%i\n", + (*inst_list_it)->readPC(), + (*inst_list_it)->seqNum, + (*inst_list_it)->threadNumber, + (*inst_list_it)->isIssued(), + (*inst_list_it)->isSquashed()); + + if ((*inst_list_it)->isMemRef()) { + cprintf("MemOpDone:%i\n", (*inst_list_it)->memOpDone); + } + + cprintf("\n"); + + inst_list_it--; + ++num; + } + cprintf("Waiting list size: %i\n", waitingList.size()); inst_list_it = --(waitingList.end()); diff --git a/src/cpu/ozone/lw_lsq.hh b/src/cpu/ozone/lw_lsq.hh index 9a21a9d01..6640a9f34 100644 --- a/src/cpu/ozone/lw_lsq.hh +++ b/src/cpu/ozone/lw_lsq.hh @@ -84,6 +84,8 @@ class OzoneLWLSQ { /** Returns the name of the LSQ unit. */ std::string name() const; + void regStats(); + /** Sets the CPU pointer. */ void setCPU(OzoneCPU *cpu_ptr); @@ -179,7 +181,7 @@ class OzoneLWLSQ { int numLoads() { return loads; } /** Returns the number of stores in the SQ. */ - int numStores() { return stores; } + int numStores() { return stores + storesInFlight; } /** Returns if either the LQ or SQ is full. */ bool isFull() { return lqFull() || sqFull(); } @@ -188,7 +190,7 @@ class OzoneLWLSQ { bool lqFull() { return loads >= (LQEntries - 1); } /** Returns if the SQ is full. */ - bool sqFull() { return stores >= (SQEntries - 1); } + bool sqFull() { return (stores + storesInFlight) >= (SQEntries - 1); } /** Debugging function to dump instructions in the LSQ. */ void dumpInsts(); @@ -223,7 +225,9 @@ class OzoneLWLSQ { void storePostSend(Packet *pkt, DynInstPtr &inst); /** Completes the store at the specified index. */ - void completeStore(int store_idx); + void completeStore(DynInstPtr &inst); + + void removeStore(int store_idx); /** Handles doing the retry. */ void recvRetry(); @@ -394,6 +398,10 @@ class OzoneLWLSQ { int storesToWB; + public: + int storesInFlight; + + private: /// @todo Consider moving to a more advanced model with write vs read ports /** The number of cache ports available each cycle. */ int cachePorts; @@ -403,6 +411,9 @@ class OzoneLWLSQ { //list<InstSeqNum> mshrSeqNums; + /** Tota number of memory ordering violations. */ + Stats::Scalar<> lsqMemOrderViolation; + //Stats::Scalar<> dcacheStallCycles; Counter lastDcacheStall; @@ -525,7 +536,7 @@ OzoneLWLSQ<Impl>::read(RequestPtr req, T &data, int load_idx) store_size = (*sq_it).size; - if (store_size == 0) { + if (store_size == 0 || (*sq_it).committed) { sq_it++; continue; } diff --git a/src/cpu/ozone/lw_lsq_impl.hh b/src/cpu/ozone/lw_lsq_impl.hh index 7eef4b11f..31ffa9d67 100644 --- a/src/cpu/ozone/lw_lsq_impl.hh +++ b/src/cpu/ozone/lw_lsq_impl.hh @@ -132,7 +132,7 @@ OzoneLWLSQ<Impl>::completeDataAccess(PacketPtr pkt) template <class Impl> OzoneLWLSQ<Impl>::OzoneLWLSQ() : switchedOut(false), dcachePort(this), loads(0), stores(0), - storesToWB(0), stalled(false), isStoreBlocked(false), + storesToWB(0), storesInFlight(0), stalled(false), isStoreBlocked(false), isLoadBlocked(false), loadBlockedHandled(false) { } @@ -173,6 +173,11 @@ OzoneLWLSQ<Impl>::name() const template<class Impl> void +OzoneLWLSQ<Impl>::regStats() +{ + lsqMemOrderViolation + .name(name() + ".memOrderViolation") + .desc("Number of memory ordering violations"); OzoneLWLSQ<Impl>::setCPU(OzoneCPU *cpu_ptr) { cpu = cpu_ptr; @@ -321,7 +326,7 @@ unsigned OzoneLWLSQ<Impl>::numFreeEntries() { unsigned free_lq_entries = LQEntries - loads; - unsigned free_sq_entries = SQEntries - stores; + unsigned free_sq_entries = SQEntries - (stores + storesInFlight); // Both the LQ and SQ entries have an extra dummy entry to differentiate // empty/full conditions. Subtract 1 from the free entries. @@ -385,6 +390,9 @@ OzoneLWLSQ<Impl>::executeLoad(DynInstPtr &inst) // Actually probably want the oldest faulting load if (load_fault != NoFault) { DPRINTF(OzoneLSQ, "Load [sn:%lli] has a fault\n", inst->seqNum); + if (!(inst->req->flags & UNCACHEABLE && !inst->isAtCommit())) { + inst->setExecuted(); + } // Maybe just set it as can commit here, although that might cause // some other problems with sending traps to the ROB too quickly. be->instToCommit(inst); @@ -461,6 +469,7 @@ OzoneLWLSQ<Impl>::executeStore(DynInstPtr &store_inst) // A load incorrectly passed this store. Squash and refetch. // For now return a fault to show that it was unsuccessful. memDepViolator = (*lq_it); + ++lsqMemOrderViolation; return TheISA::genMachineCheckFault(); } @@ -553,8 +562,8 @@ OzoneLWLSQ<Impl>::writebackStores() if ((*sq_it).size == 0 && !(*sq_it).completed) { sq_it--; - completeStore(inst->sqIdx); - + removeStore(inst->sqIdx); + completeStore(inst); continue; } @@ -626,6 +635,8 @@ OzoneLWLSQ<Impl>::writebackStores() inst->sqIdx,inst->readPC(), req->paddr, *(req->data), inst->seqNum); + DPRINTF(OzoneLSQ, "StoresInFlight: %i\n", + storesInFlight + 1); if (dcacheInterface) { assert(!req->completionEvent); @@ -687,6 +698,8 @@ OzoneLWLSQ<Impl>::writebackStores() } sq_it--; } + ++storesInFlight; +// removeStore(inst->sqIdx); } else { panic("Must HAVE DCACHE!!!!!\n"); } @@ -704,7 +717,7 @@ void OzoneLWLSQ<Impl>::squash(const InstSeqNum &squashed_num) { DPRINTF(OzoneLSQ, "Squashing until [sn:%lli]!" - "(Loads:%i Stores:%i)\n",squashed_num,loads,stores); + "(Loads:%i Stores:%i)\n",squashed_num,loads,stores+storesInFlight); LQIt lq_it = loadQueue.begin(); @@ -881,7 +894,7 @@ OzoneLWLSQ<Impl>::writeback(DynInstPtr &inst, PacketPtr pkt) template <class Impl> void -OzoneLWLSQ<Impl>::completeStore(int store_idx) +OzoneLWLSQ<Impl>::removeStore(int store_idx) { SQHashIt sq_hash_it = SQItHash.find(store_idx); assert(sq_hash_it != SQItHash.end()); @@ -891,8 +904,6 @@ OzoneLWLSQ<Impl>::completeStore(int store_idx) (*sq_it).completed = true; DynInstPtr inst = (*sq_it).inst; - --storesToWB; - if (isStalled() && inst->seqNum == stallingStoreIsn) { DPRINTF(OzoneLSQ, "Unstalling, stalling store [sn:%lli] " @@ -910,6 +921,13 @@ OzoneLWLSQ<Impl>::completeStore(int store_idx) SQItHash.erase(sq_hash_it); SQIndices.push(inst->sqIdx); storeQueue.erase(sq_it); +} + +template <class Impl> +void +OzoneLWLSQ<Impl>::completeStore(DynInstPtr &inst) +{ + --storesToWB; --stores; inst->setCompleted(); @@ -935,9 +953,14 @@ OzoneLWLSQ<Impl>::switchOut() switchedOut = true; // Clear the queue to free up resources + assert(stores == 0); + assert(storeQueue.empty()); + assert(loads == 0); + assert(loadQueue.empty()); + assert(storesInFlight == 0); storeQueue.clear(); loadQueue.clear(); - loads = stores = storesToWB = 0; + loads = stores = storesToWB = storesInFlight = 0; } template <class Impl> diff --git a/src/cpu/ozone/simple_params.hh b/src/cpu/ozone/simple_params.hh index 11cee716f..3f63d2e1d 100644 --- a/src/cpu/ozone/simple_params.hh +++ b/src/cpu/ozone/simple_params.hh @@ -71,10 +71,11 @@ class SimpleParams : public BaseCPU::Params unsigned cachePorts; unsigned width; + unsigned frontEndLatency; unsigned frontEndWidth; + unsigned backEndLatency; unsigned backEndWidth; unsigned backEndSquashLatency; - unsigned backEndLatency; unsigned maxInstBufferSize; unsigned numPhysicalRegs; unsigned maxOutstandingMemOps; @@ -150,6 +151,7 @@ class SimpleParams : public BaseCPU::Params // unsigned LQEntries; unsigned SQEntries; + bool lsqLimits; // // Memory dependence diff --git a/src/cpu/ozone/thread_state.hh b/src/cpu/ozone/thread_state.hh index 8234cf938..adaa8e71b 100644 --- a/src/cpu/ozone/thread_state.hh +++ b/src/cpu/ozone/thread_state.hh @@ -34,9 +34,12 @@ #include "arch/faults.hh" #include "arch/types.hh" #include "arch/regfile.hh" +#include "base/callback.hh" +#include "base/output.hh" #include "cpu/thread_context.hh" #include "cpu/thread_state.hh" #include "sim/process.hh" +#include "sim/sim_exit.hh" class Event; //class Process; @@ -65,8 +68,21 @@ struct OzoneThreadState : public ThreadState { #if FULL_SYSTEM OzoneThreadState(CPUType *_cpu, int _thread_num) : ThreadState(-1, _thread_num), - intrflag(0), inSyscall(0), trapPending(0) + cpu(_cpu), intrflag(0), inSyscall(0), trapPending(0) { + if (cpu->params->profile) { + profile = new FunctionProfile(cpu->params->system->kernelSymtab); + Callback *cb = + new MakeCallback<OzoneThreadState, + &OzoneThreadState::dumpFuncProfile>(this); + registerExitCallback(cb); + } + + // let's fill with a dummy node for now so we don't get a segfault + // on the first cycle when there's no node available. + static ProfileNode dummyNode; + profileNode = &dummyNode; + profilePC = 3; miscRegFile.clear(); } #else @@ -130,6 +146,14 @@ struct OzoneThreadState : public ThreadState { void setNextPC(uint64_t val) { nextPC = val; } + +#if FULL_SYSTEM + void dumpFuncProfile() + { + std::ostream *os = simout.create(csprintf("profile.%s.dat", cpu->name())); + profile->dump(xcProxy, *os); + } +#endif }; #endif // __CPU_OZONE_THREAD_STATE_HH__ |