summaryrefslogtreecommitdiff
path: root/src/cpu
diff options
context:
space:
mode:
Diffstat (limited to 'src/cpu')
-rw-r--r--src/cpu/base.cc54
-rw-r--r--src/cpu/base.hh20
-rw-r--r--src/cpu/base_dyn_inst.hh7
-rw-r--r--src/cpu/checker/cpu.hh19
-rw-r--r--src/cpu/checker/cpu_impl.hh25
-rw-r--r--src/cpu/o3/alpha/cpu_builder.cc11
-rw-r--r--src/cpu/o3/checker_builder.cc13
-rw-r--r--src/cpu/o3/commit_impl.hh19
-rw-r--r--src/cpu/o3/cpu.cc43
-rw-r--r--src/cpu/o3/fetch_impl.hh9
-rw-r--r--src/cpu/o3/iew.hh8
-rw-r--r--src/cpu/o3/iew_impl.hh45
-rw-r--r--src/cpu/o3/inst_queue.hh4
-rw-r--r--src/cpu/o3/inst_queue_impl.hh28
-rw-r--r--src/cpu/o3/lsq_impl.hh10
-rw-r--r--src/cpu/o3/lsq_unit.hh11
-rw-r--r--src/cpu/o3/lsq_unit_impl.hh14
-rw-r--r--src/cpu/o3/mem_dep_unit_impl.hh3
-rw-r--r--src/cpu/o3/rename.hh2
-rw-r--r--src/cpu/o3/rename_impl.hh13
-rw-r--r--src/cpu/o3/thread_state.hh29
-rw-r--r--src/cpu/o3/tournament_pred.cc10
-rw-r--r--src/cpu/o3/tournament_pred.hh3
-rw-r--r--src/cpu/ozone/checker_builder.cc13
-rw-r--r--src/cpu/ozone/cpu.hh36
-rw-r--r--src/cpu/ozone/cpu_builder.cc21
-rw-r--r--src/cpu/ozone/cpu_impl.hh118
-rw-r--r--src/cpu/ozone/front_end.hh7
-rw-r--r--src/cpu/ozone/front_end_impl.hh74
-rw-r--r--src/cpu/ozone/inorder_back_end_impl.hh2
-rw-r--r--src/cpu/ozone/inst_queue_impl.hh8
-rw-r--r--src/cpu/ozone/lw_back_end.hh94
-rw-r--r--src/cpu/ozone/lw_back_end_impl.hh317
-rw-r--r--src/cpu/ozone/lw_lsq.hh19
-rw-r--r--src/cpu/ozone/lw_lsq_impl.hh41
-rw-r--r--src/cpu/ozone/simple_params.hh4
-rw-r--r--src/cpu/ozone/thread_state.hh26
-rw-r--r--src/cpu/simple/base.cc2
-rw-r--r--src/cpu/simple_thread.cc5
-rw-r--r--src/cpu/thread_state.hh16
40 files changed, 897 insertions, 306 deletions
diff --git a/src/cpu/base.cc b/src/cpu/base.cc
index ce440aeff..f00dad7d6 100644
--- a/src/cpu/base.cc
+++ b/src/cpu/base.cc
@@ -48,6 +48,9 @@
#include "base/trace.hh"
+// Hack
+#include "sim/stat_control.hh"
+
using namespace std;
vector<BaseCPU *> BaseCPU::cpuList;
@@ -57,6 +60,30 @@ vector<BaseCPU *> BaseCPU::cpuList;
// been initialized
int maxThreadsPerCPU = 1;
+void
+CPUProgressEvent::process()
+{
+ Counter temp = cpu->totalInstructions();
+#ifndef NDEBUG
+ double ipc = double(temp - lastNumInst) / (interval / cpu->cycles(1));
+
+ DPRINTFN("%s progress event, instructions committed: %lli, IPC: %0.8d\n",
+ cpu->name(), temp - lastNumInst, ipc);
+ ipc = 0.0;
+#else
+ cprintf("%lli: %s progress event, instructions committed: %lli\n",
+ curTick, cpu->name(), temp - lastNumInst);
+#endif
+ lastNumInst = temp;
+ schedule(curTick + interval);
+}
+
+const char *
+CPUProgressEvent::description()
+{
+ return "CPU Progress event";
+}
+
#if FULL_SYSTEM
BaseCPU::BaseCPU(Params *p)
: MemObject(p->name), clock(p->clock), checkInterrupts(true),
@@ -67,6 +94,7 @@ BaseCPU::BaseCPU(Params *p)
number_of_threads(p->numberOfThreads), system(p->system)
#endif
{
+// currentTick = curTick;
DPRINTF(FullCPU, "BaseCPU: Creating object, mem address %#x.\n", this);
// add self to global list of CPUs
@@ -128,6 +156,12 @@ BaseCPU::BaseCPU(Params *p)
p->max_loads_all_threads, *counter);
}
+ if (p->stats_reset_inst != 0) {
+ Stats::SetupEvent(Stats::Reset, p->stats_reset_inst, 0, comInstEventQueue[0]);
+ cprintf("Stats reset event scheduled for %lli insts\n",
+ p->stats_reset_inst);
+ }
+
#if FULL_SYSTEM
memset(interrupts, 0, sizeof(interrupts));
intstatus = 0;
@@ -153,7 +187,6 @@ BaseCPU::BaseCPU(Params *p)
if (params->profile)
profileEvent = new ProfileEvent(this, params->profile);
#endif
-
}
BaseCPU::Params::Params()
@@ -188,6 +221,11 @@ BaseCPU::startup()
if (!params->deferRegistration && profileEvent)
profileEvent->schedule(curTick);
#endif
+
+ if (params->progress_interval) {
+ new CPUProgressEvent(&mainEventQueue, params->progress_interval,
+ this);
+ }
}
@@ -238,7 +276,11 @@ BaseCPU::registerThreadContexts()
void
BaseCPU::switchOut()
{
- panic("This CPU doesn't support sampling!");
+// panic("This CPU doesn't support sampling!");
+#if FULL_SYSTEM
+ if (profileEvent && profileEvent->scheduled())
+ profileEvent->deschedule();
+#endif
}
void
@@ -261,18 +303,22 @@ BaseCPU::takeOverFrom(BaseCPU *oldCPU)
assert(newTC->getProcessPtr() == oldTC->getProcessPtr());
newTC->getProcessPtr()->replaceThreadContext(newTC, newTC->readCpuId());
#endif
+
+// TheISA::compareXCs(oldXC, newXC);
}
#if FULL_SYSTEM
for (int i = 0; i < TheISA::NumInterruptLevels; ++i)
interrupts[i] = oldCPU->interrupts[i];
intstatus = oldCPU->intstatus;
+ checkInterrupts = oldCPU->checkInterrupts;
for (int i = 0; i < threadContexts.size(); ++i)
threadContexts[i]->profileClear();
- if (profileEvent)
- profileEvent->schedule(curTick);
+ // The Sampler must take care of this!
+// if (profileEvent)
+// profileEvent->schedule(curTick);
#endif
}
diff --git a/src/cpu/base.hh b/src/cpu/base.hh
index 2be6e4e81..2a3fd9b56 100644
--- a/src/cpu/base.hh
+++ b/src/cpu/base.hh
@@ -46,6 +46,23 @@ class ThreadContext;
class System;
class Port;
+class CPUProgressEvent : public Event
+{
+ protected:
+ Tick interval;
+ Counter lastNumInst;
+ BaseCPU *cpu;
+
+ public:
+ CPUProgressEvent(EventQueue *q, Tick ival, BaseCPU *_cpu)
+ : Event(q, Event::Stat_Event_Pri), interval(ival), lastNumInst(0), cpu(_cpu)
+ { schedule(curTick + interval); }
+
+ void process();
+
+ virtual const char *description();
+};
+
class BaseCPU : public MemObject
{
protected:
@@ -53,6 +70,7 @@ class BaseCPU : public MemObject
Tick clock;
public:
+// Tick currentTick;
inline Tick frequency() const { return Clock::Frequency / clock; }
inline Tick cycles(int numCycles) const { return clock * numCycles; }
inline Tick curCycle() const { return curTick / clock; }
@@ -120,6 +138,7 @@ class BaseCPU : public MemObject
Counter max_insts_all_threads;
Counter max_loads_any_thread;
Counter max_loads_all_threads;
+ Counter stats_reset_inst;
Tick clock;
bool functionTrace;
Tick functionTraceStart;
@@ -128,6 +147,7 @@ class BaseCPU : public MemObject
int cpu_id;
Tick profile;
#endif
+ Tick progress_interval;
BaseCPU *checker;
Params();
diff --git a/src/cpu/base_dyn_inst.hh b/src/cpu/base_dyn_inst.hh
index 3158aa9cf..926bfcbb2 100644
--- a/src/cpu/base_dyn_inst.hh
+++ b/src/cpu/base_dyn_inst.hh
@@ -197,7 +197,7 @@ class BaseDynInst : public FastAlloc, public RefCounted
union Result {
uint64_t integer;
- float fp;
+// float fp;
double dbl;
};
@@ -394,7 +394,7 @@ class BaseDynInst : public FastAlloc, public RefCounted
uint64_t readIntResult() { return instResult.integer; }
/** Returns the result of a floating point instruction. */
- float readFloatResult() { return instResult.fp; }
+ float readFloatResult() { return (float)instResult.dbl; }
/** Returns the result of a floating point (double) instruction. */
double readDoubleResult() { return instResult.dbl; }
@@ -419,7 +419,8 @@ class BaseDynInst : public FastAlloc, public RefCounted
/** Records an fp register being set to a value. */
void setFloatReg(const StaticInst *si, int idx, FloatReg val)
{
- instResult.fp = val;
+// instResult.fp = val;
+ instResult.dbl = (double)val;
}
/** Records an fp register being set to an integer value. */
diff --git a/src/cpu/checker/cpu.hh b/src/cpu/checker/cpu.hh
index 6d6ae1e0a..737b4b5d4 100644
--- a/src/cpu/checker/cpu.hh
+++ b/src/cpu/checker/cpu.hh
@@ -102,6 +102,7 @@ class CheckerCPU : public BaseCPU
Process *process;
#endif
bool exitOnError;
+ bool updateOnError;
bool warnOnlyOnLoadError;
};
@@ -148,7 +149,7 @@ class CheckerCPU : public BaseCPU
union Result {
uint64_t integer;
- float fp;
+// float fp;
double dbl;
};
@@ -269,7 +270,7 @@ class CheckerCPU : public BaseCPU
{
int reg_idx = si->destRegIdx(idx) - TheISA::FP_Base_DepTag;
thread->setFloatReg(reg_idx, val);
- result.fp = val;
+ result.dbl = (double)val;
}
void setFloatRegBits(const StaticInst *si, int idx, FloatRegBits val,
@@ -318,7 +319,7 @@ class CheckerCPU : public BaseCPU
return thread->setMiscRegWithEffect(misc_reg, val);
}
- void recordPCChange(uint64_t val) { changedPC = true; }
+ void recordPCChange(uint64_t val) { changedPC = true; newPC = val; }
void recordNextPCChange(uint64_t val) { changedNextPC = true; }
bool translateInstReq(Request *req);
@@ -360,6 +361,7 @@ class CheckerCPU : public BaseCPU
uint64_t newPC;
bool changedNextPC;
bool exitOnError;
+ bool updateOnError;
bool warnOnlyOnLoadError;
InstSeqNum youngestSN;
@@ -376,7 +378,7 @@ class Checker : public CheckerCPU
{
public:
Checker(Params *p)
- : CheckerCPU(p)
+ : CheckerCPU(p), updateThisCycle(false), unverifiedInst(NULL)
{ }
void switchOut();
@@ -393,12 +395,19 @@ class Checker : public CheckerCPU
private:
void handleError(DynInstPtr &inst)
{
- if (exitOnError)
+ if (exitOnError) {
dumpAndExit(inst);
+ } else if (updateOnError) {
+ updateThisCycle = true;
+ }
}
void dumpAndExit(DynInstPtr &inst);
+ bool updateThisCycle;
+
+ DynInstPtr unverifiedInst;
+
std::list<DynInstPtr> instList;
typedef typename std::list<DynInstPtr>::iterator InstListIt;
void dumpInsts();
diff --git a/src/cpu/checker/cpu_impl.hh b/src/cpu/checker/cpu_impl.hh
index 81f97726c..3bb81c4b9 100644
--- a/src/cpu/checker/cpu_impl.hh
+++ b/src/cpu/checker/cpu_impl.hh
@@ -94,6 +94,8 @@ Checker<DynInstPtr>::verify(DynInstPtr &completed_inst)
}
}
+ unverifiedInst = inst;
+
// Try to check all instructions that are completed, ending if we
// run out of instructions to check or if an instruction is not
// yet completed.
@@ -171,7 +173,7 @@ Checker<DynInstPtr>::verify(DynInstPtr &completed_inst)
thread->setPC(thread->readNextPC());
thread->setNextPC(thread->readNextPC() + sizeof(MachInst));
- return;
+ break;
} else {
// The instruction is carrying an ITB fault. Handle
// the fault and see if our results match the CPU on
@@ -220,7 +222,8 @@ Checker<DynInstPtr>::verify(DynInstPtr &completed_inst)
thread->funcExeInst++;
- fault = curStaticInst->execute(this, NULL);
+ if (!inst->isUnverifiable())
+ fault = curStaticInst->execute(this, NULL);
// Checks to make sure instrution results are correct.
validateExecution(inst);
@@ -289,6 +292,7 @@ Checker<DynInstPtr>::verify(DynInstPtr &completed_inst)
break;
}
}
+ unverifiedInst = NULL;
}
template <class DynInstPtr>
@@ -395,6 +399,23 @@ template <class DynInstPtr>
void
Checker<DynInstPtr>::validateState()
{
+ if (updateThisCycle) {
+ warn("%lli: Instruction PC %#x results didn't match up, copying all "
+ "registers from main CPU", curTick, unverifiedInst->readPC());
+ // Heavy-weight copying of all registers
+ cpuXC->copyArchRegs(unverifiedInst->xcBase());
+ // Also advance the PC. Hopefully no PC-based events happened.
+#if THE_ISA != MIPS_ISA
+ // go to the next instruction
+ cpuXC->setPC(cpuXC->readNextPC());
+ cpuXC->setNextPC(cpuXC->readNextPC() + sizeof(MachInst));
+#else
+ // go to the next instruction
+ cpuXC->setPC(cpuXC->readNextPC());
+ cpuXC->setNextPC(cpuXC->readNextNPC());
+ cpuXC->setNextNPC(cpuXC->readNextNPC() + sizeof(MachInst));
+#endif
+ updateThisCycle = false;
}
template <class DynInstPtr>
diff --git a/src/cpu/o3/alpha/cpu_builder.cc b/src/cpu/o3/alpha/cpu_builder.cc
index 5e767655d..fbf1f342c 100644
--- a/src/cpu/o3/alpha/cpu_builder.cc
+++ b/src/cpu/o3/alpha/cpu_builder.cc
@@ -56,6 +56,7 @@ SimObjectParam<System *> system;
Param<int> cpu_id;
SimObjectParam<AlphaITB *> itb;
SimObjectParam<AlphaDTB *> dtb;
+Param<Tick> profile;
#else
SimObjectVectorParam<Process *> workload;
#endif // FULL_SYSTEM
@@ -68,6 +69,8 @@ Param<Counter> max_insts_any_thread;
Param<Counter> max_insts_all_threads;
Param<Counter> max_loads_any_thread;
Param<Counter> max_loads_all_threads;
+Param<Counter> stats_reset_inst;
+Param<Tick> progress_interval;
Param<unsigned> cachePorts;
@@ -162,6 +165,7 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivO3CPU)
INIT_PARAM(cpu_id, "processor ID"),
INIT_PARAM(itb, "Instruction translation buffer"),
INIT_PARAM(dtb, "Data translation buffer"),
+ INIT_PARAM(profile, ""),
#else
INIT_PARAM(workload, "Processes to run"),
#endif // FULL_SYSTEM
@@ -184,6 +188,10 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivO3CPU)
"Terminate when all threads have reached this load"
"count",
0),
+ INIT_PARAM_DFLT(stats_reset_inst,
+ "blah",
+ 0),
+ INIT_PARAM_DFLT(progress_interval, "Progress interval", 0),
INIT_PARAM_DFLT(cachePorts, "Cache Ports", 200),
@@ -305,6 +313,7 @@ CREATE_SIM_OBJECT(DerivO3CPU)
params->cpu_id = cpu_id;
params->itb = itb;
params->dtb = dtb;
+ params->profile = profile;
#else
params->workload = workload;
#endif // FULL_SYSTEM
@@ -317,6 +326,8 @@ CREATE_SIM_OBJECT(DerivO3CPU)
params->max_insts_all_threads = max_insts_all_threads;
params->max_loads_any_thread = max_loads_any_thread;
params->max_loads_all_threads = max_loads_all_threads;
+ params->stats_reset_inst = stats_reset_inst;
+ params->progress_interval = progress_interval;
//
// Caches
diff --git a/src/cpu/o3/checker_builder.cc b/src/cpu/o3/checker_builder.cc
index 782d963b0..ad83ec57a 100644
--- a/src/cpu/o3/checker_builder.cc
+++ b/src/cpu/o3/checker_builder.cc
@@ -64,6 +64,8 @@ BEGIN_DECLARE_SIM_OBJECT_PARAMS(O3Checker)
Param<Counter> max_insts_all_threads;
Param<Counter> max_loads_any_thread;
Param<Counter> max_loads_all_threads;
+ Param<Counter> stats_reset_inst;
+ Param<Tick> progress_interval;
#if FULL_SYSTEM
SimObjectParam<AlphaITB *> itb;
@@ -78,6 +80,7 @@ BEGIN_DECLARE_SIM_OBJECT_PARAMS(O3Checker)
Param<bool> defer_registration;
Param<bool> exitOnError;
+ Param<bool> updateOnError;
Param<bool> warnOnlyOnLoadError;
Param<bool> function_trace;
Param<Tick> function_trace_start;
@@ -94,6 +97,9 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(O3Checker)
"terminate when any thread reaches this load count"),
INIT_PARAM(max_loads_all_threads,
"terminate when all threads have reached this load count"),
+ INIT_PARAM(stats_reset_inst,
+ "blah"),
+ INIT_PARAM_DFLT(progress_interval, "CPU Progress Interval", 0),
#if FULL_SYSTEM
INIT_PARAM(itb, "Instruction TLB"),
@@ -109,6 +115,7 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(O3Checker)
INIT_PARAM(defer_registration, "defer system registration (for sampling)"),
INIT_PARAM(exitOnError, "exit on error"),
+ INIT_PARAM(updateOnError, "Update the checker with the main CPU's state on error"),
INIT_PARAM_DFLT(warnOnlyOnLoadError, "warn, but don't exit, if a load "
"result errors", false),
INIT_PARAM(function_trace, "Enable function trace"),
@@ -126,7 +133,9 @@ CREATE_SIM_OBJECT(O3Checker)
params->max_insts_all_threads = 0;
params->max_loads_any_thread = 0;
params->max_loads_all_threads = 0;
+ params->stats_reset_inst = 0;
params->exitOnError = exitOnError;
+ params->updateOnError = updateOnError;
params->warnOnlyOnLoadError = warnOnlyOnLoadError;
params->deferRegistration = defer_registration;
params->functionTrace = function_trace;
@@ -139,6 +148,10 @@ CREATE_SIM_OBJECT(O3Checker)
temp = max_insts_all_threads;
temp = max_loads_any_thread;
temp = max_loads_all_threads;
+ temp = stats_reset_inst;
+ Tick temp2 = progress_interval;
+ params->progress_interval = 0;
+ temp2++;
#if FULL_SYSTEM
params->itb = itb;
diff --git a/src/cpu/o3/commit_impl.hh b/src/cpu/o3/commit_impl.hh
index 34f487e2c..6ae01ae67 100644
--- a/src/cpu/o3/commit_impl.hh
+++ b/src/cpu/o3/commit_impl.hh
@@ -1083,12 +1083,26 @@ DefaultCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num)
// Generate trap squash event.
generateTrapEvent(tid);
-
+// warn("%lli fault (%d) handled @ PC %08p", curTick, inst_fault->name(), head_inst->readPC());
return false;
}
updateComInstStats(head_inst);
+#if FULL_SYSTEM
+ if (thread[tid]->profile) {
+// bool usermode =
+// (cpu->readMiscReg(AlphaISA::IPR_DTB_CM, tid) & 0x18) != 0;
+// thread[tid]->profilePC = usermode ? 1 : head_inst->readPC();
+ thread[tid]->profilePC = head_inst->readPC();
+ ProfileNode *node = thread[tid]->profile->consume(thread[tid]->getXCProxy(),
+ head_inst->staticInst);
+
+ if (node)
+ thread[tid]->profileNode = node;
+ }
+#endif
+
if (head_inst->traceData) {
head_inst->traceData->setFetchSeq(head_inst->seqNum);
head_inst->traceData->setCPSeq(thread[tid]->numInst);
@@ -1102,6 +1116,9 @@ DefaultCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num)
head_inst->renamedDestRegIdx(i));
}
+ if (head_inst->isCopy())
+ panic("Should not commit any copy instructions!");
+
// Finally clear the head ROB entry.
rob->retireHead(tid);
diff --git a/src/cpu/o3/cpu.cc b/src/cpu/o3/cpu.cc
index 19ab7f4c5..4279df6f7 100644
--- a/src/cpu/o3/cpu.cc
+++ b/src/cpu/o3/cpu.cc
@@ -33,6 +33,7 @@
#include "config/use_checker.hh"
#if FULL_SYSTEM
+#include "cpu/quiesce_event.hh"
#include "sim/system.hh"
#else
#include "sim/process.hh"
@@ -793,6 +794,8 @@ template <class Impl>
unsigned int
FullO3CPU<Impl>::drain(Event *drain_event)
{
+ DPRINTF(O3CPU, "Switching out\n");
+ BaseCPU::switchOut(_sampler);
drainCount = 0;
fetch.drain();
decode.drain();
@@ -863,6 +866,7 @@ FullO3CPU<Impl>::switchOut()
{
fetch.switchOut();
rename.switchOut();
+ iew.switchOut();
commit.switchOut();
instList.clear();
while (!removeList.empty()) {
@@ -931,6 +935,45 @@ FullO3CPU<Impl>::takeOverFrom(BaseCPU *oldCPU)
}
template <class Impl>
+void
+FullO3CPU<Impl>::serialize(std::ostream &os)
+{
+ BaseCPU::serialize(os);
+ nameOut(os, csprintf("%s.tickEvent", name()));
+ tickEvent.serialize(os);
+
+ // Use SimpleThread's ability to checkpoint to make it easier to
+ // write out the registers. Also make this static so it doesn't
+ // get instantiated multiple times (causes a panic in statistics).
+ static CPUExecContext temp;
+
+ for (int i = 0; i < thread.size(); i++) {
+ nameOut(os, csprintf("%s.xc.%i", name(), i));
+ temp.copyXC(thread[i]->getXCProxy());
+ temp.serialize(os);
+ }
+}
+
+template <class Impl>
+void
+FullO3CPU<Impl>::unserialize(Checkpoint *cp, const std::string &section)
+{
+ BaseCPU::unserialize(cp, section);
+ tickEvent.unserialize(cp, csprintf("%s.tickEvent", section));
+
+ // Use SimpleThread's ability to checkpoint to make it easier to
+ // read in the registers. Also make this static so it doesn't
+ // get instantiated multiple times (causes a panic in statistics).
+ static CPUExecContext temp;
+
+ for (int i = 0; i < thread.size(); i++) {
+ temp.copyXC(thread[i]->getXCProxy());
+ temp.unserialize(cp, csprintf("%s.xc.%i", section, i));
+ thread[i]->getXCProxy()->copyArchRegs(temp.getProxy());
+ }
+}
+
+template <class Impl>
uint64_t
FullO3CPU<Impl>::readIntReg(int reg_idx)
{
diff --git a/src/cpu/o3/fetch_impl.hh b/src/cpu/o3/fetch_impl.hh
index 1e080181c..2d447bfe5 100644
--- a/src/cpu/o3/fetch_impl.hh
+++ b/src/cpu/o3/fetch_impl.hh
@@ -442,6 +442,7 @@ DefaultFetch<Impl>::takeOverFrom()
wroteToTimeBuffer = false;
_status = Inactive;
switchedOut = false;
+ interruptPending = false;
branchPred.takeOverFrom();
}
@@ -563,7 +564,7 @@ DefaultFetch<Impl>::fetchCacheLine(Addr fetch_PC, Fault &ret_fault, unsigned tid
unsigned flags = 0;
#endif // FULL_SYSTEM
- if (cacheBlocked || (interruptPending && flags == 0)) {
+ if (cacheBlocked || isSwitchedOut() || (interruptPending && flags == 0)) {
// Hold off fetch from getting new instructions when:
// Cache is blocked, or
// while an interrupt is pending and we're not in PAL mode, or
@@ -1152,8 +1153,8 @@ DefaultFetch<Impl>::fetch(bool &status_change)
fetch_PC = next_PC;
if (instruction->isQuiesce()) {
- warn("cycle %lli: Quiesce instruction encountered, halting fetch!",
- curTick);
+// warn("%lli: Quiesce instruction encountered, halting fetch!",
+// curTick);
fetchStatus[tid] = QuiescePending;
++numInst;
status_change = true;
@@ -1268,7 +1269,7 @@ DefaultFetch<Impl>::fetch(bool &status_change)
fetchStatus[tid] = TrapPending;
status_change = true;
- warn("cycle %lli: fault (%s) detected @ PC %08p", curTick, fault->name(), PC[tid]);
+// warn("%lli fault (%d) detected @ PC %08p", curTick, fault, PC[tid]);
#else // !FULL_SYSTEM
warn("cycle %lli: fault (%s) detected @ PC %08p", curTick, fault->name(), PC[tid]);
#endif // FULL_SYSTEM
diff --git a/src/cpu/o3/iew.hh b/src/cpu/o3/iew.hh
index 76fa008ee..a400c9fa8 100644
--- a/src/cpu/o3/iew.hh
+++ b/src/cpu/o3/iew.hh
@@ -216,6 +216,7 @@ class DefaultIEW
if (++wbOutstanding == wbMax)
ableToIssue = false;
DPRINTF(IEW, "wbOutstanding: %i\n", wbOutstanding);
+ assert(wbOutstanding <= wbMax);
#ifdef DEBUG
wbList.insert(sn);
#endif
@@ -226,6 +227,7 @@ class DefaultIEW
if (wbOutstanding-- == wbMax)
ableToIssue = true;
DPRINTF(IEW, "wbOutstanding: %i\n", wbOutstanding);
+ assert(wbOutstanding >= 0);
#ifdef DEBUG
assert(wbList.find(sn) != wbList.end());
wbList.erase(sn);
@@ -450,7 +452,9 @@ class DefaultIEW
unsigned wbCycle;
/** Number of instructions in flight that will writeback. */
- unsigned wbOutstanding;
+
+ /** Number of instructions in flight that will writeback. */
+ int wbOutstanding;
/** Writeback width. */
unsigned wbWidth;
@@ -507,6 +511,8 @@ class DefaultIEW
Stats::Scalar<> iewExecutedInsts;
/** Stat for total number of executed load instructions. */
Stats::Vector<> iewExecLoadInsts;
+ /** Stat for total number of executed store instructions. */
+// Stats::Scalar<> iewExecStoreInsts;
/** Stat for total number of squashed instructions skipped at execute. */
Stats::Scalar<> iewExecSquashedInsts;
/** Number of executed software prefetches. */
diff --git a/src/cpu/o3/iew_impl.hh b/src/cpu/o3/iew_impl.hh
index e9b24a6d4..c82f6dd21 100644
--- a/src/cpu/o3/iew_impl.hh
+++ b/src/cpu/o3/iew_impl.hh
@@ -162,17 +162,17 @@ DefaultIEW<Impl>::regStats()
branchMispredicts = predictedTakenIncorrect + predictedNotTakenIncorrect;
iewExecutedInsts
- .name(name() + ".EXEC:insts")
+ .name(name() + ".iewExecutedInsts")
.desc("Number of executed instructions");
iewExecLoadInsts
.init(cpu->number_of_threads)
- .name(name() + ".EXEC:loads")
+ .name(name() + ".iewExecLoadInsts")
.desc("Number of load instructions executed")
.flags(total);
iewExecSquashedInsts
- .name(name() + ".EXEC:squashedInsts")
+ .name(name() + ".iewExecSquashedInsts")
.desc("Number of squashed instructions skipped in execute");
iewExecutedSwp
@@ -372,6 +372,8 @@ DefaultIEW<Impl>::switchOut()
{
// Clear any state.
switchedOut = true;
+ assert(insts[0].empty());
+ assert(skidBuffer[0].empty());
instQueue.switchOut();
ldstQueue.switchOut();
@@ -410,7 +412,6 @@ DefaultIEW<Impl>::takeOverFrom()
updateLSQNextCycle = false;
- // @todo: Fix hardcoded number
for (int i = 0; i < issueToExecQueue.getSize(); ++i) {
issueToExecQueue.advance();
}
@@ -611,9 +612,11 @@ DefaultIEW<Impl>::instToCommit(DynInstPtr &inst)
wbNumInst = 0;
}
- assert((wbCycle * wbWidth + wbNumInst) < wbMax);
+ assert((wbCycle * wbWidth + wbNumInst) <= wbMax);
}
+ DPRINTF(IEW, "Current wb cycle: %i, width: %i, numInst: %i\nwbActual:%i\n",
+ wbCycle, wbWidth, wbNumInst, wbCycle * wbWidth + wbNumInst);
// Add finished instruction to queue to commit.
(*iewQueue)[wbCycle].insts[wbNumInst] = inst;
(*iewQueue)[wbCycle].size++;
@@ -903,6 +906,22 @@ DefaultIEW<Impl>::emptyRenameInsts(unsigned tid)
template <class Impl>
void
+DefaultIEW<Impl>::emptyRenameInsts(unsigned tid)
+{
+ while (!insts[tid].empty()) {
+ if (insts[tid].front()->isLoad() ||
+ insts[tid].front()->isStore() ) {
+ toRename->iewInfo[tid].dispatchedToLSQ++;
+ }
+
+ toRename->iewInfo[tid].dispatched++;
+
+ insts[tid].pop();
+ }
+}
+
+template <class Impl>
+void
DefaultIEW<Impl>::wakeCPU()
{
cpu->wakeCPU();
@@ -1273,13 +1292,23 @@ DefaultIEW<Impl>::executeInsts()
// event adds the instruction to the queue to commit
fault = ldstQueue.executeLoad(inst);
} else if (inst->isStore()) {
- ldstQueue.executeStore(inst);
+ fault = ldstQueue.executeStore(inst);
// If the store had a fault then it may not have a mem req
- if (inst->req && !(inst->req->getFlags() & LOCKED)) {
+ if (!inst->isStoreConditional() && fault == NoFault) {
+ inst->setExecuted();
+
+ instToCommit(inst);
+ } else if (fault != NoFault) {
+ // If the instruction faulted, then we need to send it along to commit
+ // without the instruction completing.
+
+ // Send this instruction to commit, also make sure iew stage
+ // realizes there is activity.
inst->setExecuted();
instToCommit(inst);
+ activityThisCycle();
}
// Store conditionals will mark themselves as
@@ -1404,7 +1433,7 @@ DefaultIEW<Impl>::writebackInsts()
// E.g. Uncached loads have not actually executed when they
// are first sent to commit. Instead commit must tell the LSQ
// when it's ready to execute the uncached load.
- if (!inst->isSquashed() && inst->isExecuted()) {
+ if (!inst->isSquashed() && inst->isExecuted() && inst->getFault() == NoFault) {
int dependents = instQueue.wakeDependents(inst);
for (int i = 0; i < inst->numDestRegs(); i++) {
diff --git a/src/cpu/o3/inst_queue.hh b/src/cpu/o3/inst_queue.hh
index d745faf7b..3dd4dc658 100644
--- a/src/cpu/o3/inst_queue.hh
+++ b/src/cpu/o3/inst_queue.hh
@@ -479,13 +479,13 @@ class InstructionQueue
/** Distribution of number of instructions in the queue.
* @todo: Need to create struct to track the entry time for each
* instruction. */
- Stats::VectorDistribution<> queueResDist;
+// Stats::VectorDistribution<> queueResDist;
/** Distribution of the number of instructions issued. */
Stats::Distribution<> numIssuedDist;
/** Distribution of the cycles it takes to issue an instruction.
* @todo: Need to create struct to track the ready time for each
* instruction. */
- Stats::VectorDistribution<> issueDelayDist;
+// Stats::VectorDistribution<> issueDelayDist;
/** Number of times an instruction could not be issued because a
* FU was busy.
diff --git a/src/cpu/o3/inst_queue_impl.hh b/src/cpu/o3/inst_queue_impl.hh
index 47634f645..6edb528a9 100644
--- a/src/cpu/o3/inst_queue_impl.hh
+++ b/src/cpu/o3/inst_queue_impl.hh
@@ -230,7 +230,7 @@ InstructionQueue<Impl>::regStats()
.name(name() + ".iqSquashedNonSpecRemoved")
.desc("Number of squashed non-spec instructions that were removed")
.prereq(iqSquashedNonSpecRemoved);
-
+/*
queueResDist
.init(Num_OpClasses, 0, 99, 2)
.name(name() + ".IQ:residence:")
@@ -240,6 +240,7 @@ InstructionQueue<Impl>::regStats()
for (int i = 0; i < Num_OpClasses; ++i) {
queueResDist.subname(i, opClassStrings[i]);
}
+*/
numIssuedDist
.init(0,totalWidth,1)
.name(name() + ".ISSUE:issued_per_cycle")
@@ -268,7 +269,7 @@ InstructionQueue<Impl>::regStats()
//
// How long did instructions for a particular FU type wait prior to issue
//
-
+/*
issueDelayDist
.init(Num_OpClasses,0,99,2)
.name(name() + ".ISSUE:")
@@ -281,7 +282,7 @@ InstructionQueue<Impl>::regStats()
subname << opClassStrings[i] << "_delay";
issueDelayDist.subname(i, subname.str());
}
-
+*/
issueRate
.name(name() + ".ISSUE:rate")
.desc("Inst issue rate")
@@ -385,8 +386,16 @@ template <class Impl>
void
InstructionQueue<Impl>::switchOut()
{
+/*
+ if (!instList[0].empty() || (numEntries != freeEntries) ||
+ !readyInsts[0].empty() || !nonSpecInsts.empty() || !listOrder.empty()) {
+ dumpInsts();
+// assert(0);
+ }
+*/
resetState();
dependGraph.reset();
+ instsToExecute.clear();
switchedOut = true;
for (int i = 0; i < numThreads; ++i) {
memDepUnit[i].switchOut();
@@ -642,9 +651,12 @@ template <class Impl>
void
InstructionQueue<Impl>::processFUCompletion(DynInstPtr &inst, int fu_idx)
{
+ DPRINTF(IQ, "Processing FU completion [sn:%lli]\n", inst->seqNum);
// The CPU could have been sleeping until this op completed (*extremely*
// long latency op). Wake it if it was. This may be overkill.
if (isSwitchedOut()) {
+ DPRINTF(IQ, "FU completion not processed, IQ is switched out [sn:%lli]\n",
+ inst->seqNum);
return;
}
@@ -1036,6 +1048,10 @@ InstructionQueue<Impl>::doSquash(unsigned tid)
(squashed_inst->isMemRef() &&
!squashed_inst->memOpDone)) {
+ DPRINTF(IQ, "[tid:%i]: Instruction [sn:%lli] PC %#x "
+ "squashed.\n",
+ tid, squashed_inst->seqNum, squashed_inst->readPC());
+
// Remove the instruction from the dependency list.
if (!squashed_inst->isNonSpeculative() &&
!squashed_inst->isStoreConditional() &&
@@ -1066,7 +1082,7 @@ InstructionQueue<Impl>::doSquash(unsigned tid)
++iqSquashedOperandsExamined;
}
- } else {
+ } else if (!squashed_inst->isStoreConditional() || !squashed_inst->isCompleted()) {
NonSpecMapIt ns_inst_it =
nonSpecInsts.find(squashed_inst->seqNum);
assert(ns_inst_it != nonSpecInsts.end());
@@ -1093,10 +1109,6 @@ InstructionQueue<Impl>::doSquash(unsigned tid)
count[squashed_inst->threadNumber]--;
++freeEntries;
-
- DPRINTF(IQ, "[tid:%i]: Instruction [sn:%lli] PC %#x "
- "squashed.\n",
- tid, squashed_inst->seqNum, squashed_inst->readPC());
}
instList[tid].erase(squash_it--);
diff --git a/src/cpu/o3/lsq_impl.hh b/src/cpu/o3/lsq_impl.hh
index 2bbab71f0..a1ac5adb8 100644
--- a/src/cpu/o3/lsq_impl.hh
+++ b/src/cpu/o3/lsq_impl.hh
@@ -167,6 +167,16 @@ LSQ<Impl>::regStats()
template<class Impl>
void
+LSQ<Impl>::regStats()
+{
+ //Initialize LSQs
+ for (int tid=0; tid < numThreads; tid++) {
+ thread[tid].regStats();
+ }
+}
+
+template<class Impl>
+void
LSQ<Impl>::setActiveThreads(std::list<unsigned> *at_ptr)
{
activeThreads = at_ptr;
diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh
index 1358a3699..8537e9dd7 100644
--- a/src/cpu/o3/lsq_unit.hh
+++ b/src/cpu/o3/lsq_unit.hh
@@ -407,20 +407,9 @@ class LSQUnit {
// Will also need how many read/write ports the Dcache has. Or keep track
// of that in stage that is one level up, and only call executeLoad/Store
// the appropriate number of times.
-
/** Total number of loads forwaded from LSQ stores. */
Stats::Scalar<> lsqForwLoads;
- /** Total number of loads ignored due to invalid addresses. */
- Stats::Scalar<> invAddrLoads;
-
- /** Total number of squashed loads. */
- Stats::Scalar<> lsqSquashedLoads;
-
- /** Total number of responses from the memory system that are
- * ignored due to the instruction already being squashed. */
- Stats::Scalar<> lsqIgnoredResponses;
-
/** Total number of squashed stores. */
Stats::Scalar<> lsqSquashedStores;
diff --git a/src/cpu/o3/lsq_unit_impl.hh b/src/cpu/o3/lsq_unit_impl.hh
index fa716c712..2922b81bd 100644
--- a/src/cpu/o3/lsq_unit_impl.hh
+++ b/src/cpu/o3/lsq_unit_impl.hh
@@ -180,6 +180,10 @@ LSQUnit<Impl>::regStats()
.name(name() + ".ignoredResponses")
.desc("Number of memory responses ignored because the instruction is squashed");
+ lsqMemOrderViolation
+ .name(name() + ".memOrderViolation")
+ .desc("Number of memory ordering violations");
+
lsqSquashedStores
.name(name() + ".squashedStores")
.desc("Number of stores squashed");
@@ -220,8 +224,10 @@ void
LSQUnit<Impl>::switchOut()
{
switchedOut = true;
- for (int i = 0; i < loadQueue.size(); ++i)
+ for (int i = 0; i < loadQueue.size(); ++i) {
+ assert(!loadQueue[i]);
loadQueue[i] = NULL;
+ }
assert(storesToWB == 0);
}
@@ -408,6 +414,11 @@ LSQUnit<Impl>::executeLoad(DynInstPtr &inst)
if (load_fault != NoFault) {
// Send this instruction to commit, also make sure iew stage
// realizes there is activity.
+ // Mark it as executed unless it is an uncached load that
+ // needs to hit the head of commit.
+ if (!(inst->req->flags & UNCACHEABLE) || inst->isAtCommit()) {
+ inst->setExecuted();
+ }
iewStage->instToCommit(inst);
iewStage->activityThisCycle();
}
@@ -467,6 +478,7 @@ LSQUnit<Impl>::executeStore(DynInstPtr &store_inst)
// A load incorrectly passed this store. Squash and refetch.
// For now return a fault to show that it was unsuccessful.
memDepViolator = loadQueue[load_idx];
+ ++lsqMemOrderViolation;
return genMachineCheckFault();
}
diff --git a/src/cpu/o3/mem_dep_unit_impl.hh b/src/cpu/o3/mem_dep_unit_impl.hh
index 16f67a4e0..c649ca385 100644
--- a/src/cpu/o3/mem_dep_unit_impl.hh
+++ b/src/cpu/o3/mem_dep_unit_impl.hh
@@ -109,6 +109,9 @@ template <class MemDepPred, class Impl>
void
MemDepUnit<MemDepPred, Impl>::switchOut()
{
+ assert(instList[0].empty());
+ assert(instsToReplay.empty());
+ assert(memDepHash.empty());
// Clear any state.
for (int i = 0; i < Impl::MaxThreads; ++i) {
instList[i].clear();
diff --git a/src/cpu/o3/rename.hh b/src/cpu/o3/rename.hh
index ba26a01dd..177b9cb87 100644
--- a/src/cpu/o3/rename.hh
+++ b/src/cpu/o3/rename.hh
@@ -417,6 +417,8 @@ class DefaultRename
/** The maximum skid buffer size. */
unsigned skidBufferMax;
+ PhysRegIndex maxPhysicalRegs;
+
/** Enum to record the source of a structure full stall. Can come from
* either ROB, IQ, LSQ, and it is priortized in that order.
*/
diff --git a/src/cpu/o3/rename_impl.hh b/src/cpu/o3/rename_impl.hh
index 782c0fe5f..248d7deb6 100644
--- a/src/cpu/o3/rename_impl.hh
+++ b/src/cpu/o3/rename_impl.hh
@@ -41,7 +41,8 @@ DefaultRename<Impl>::DefaultRename(Params *params)
commitToRenameDelay(params->commitToRenameDelay),
renameWidth(params->renameWidth),
commitWidth(params->commitWidth),
- numThreads(params->numberOfThreads)
+ numThreads(params->numberOfThreads),
+ maxPhysicalRegs(params->numPhysIntRegs + params->numPhysFloatRegs)
{
_status = Inactive;
@@ -286,6 +287,11 @@ DefaultRename<Impl>::switchOut()
// Put the renamed physical register back on the free list.
freeList->addReg(hb_it->newPhysReg);
+ // Be sure to mark its register as ready if it's a misc register.
+ if (hb_it->newPhysReg >= maxPhysicalRegs) {
+ scoreboard->setReg(hb_it->newPhysReg);
+ }
+
historyBuffer[i].erase(hb_it++);
}
insts[i].clear();
@@ -889,6 +895,11 @@ DefaultRename<Impl>::doSquash(const InstSeqNum &squashed_seq_num, unsigned tid)
// Put the renamed physical register back on the free list.
freeList->addReg(hb_it->newPhysReg);
+ // Be sure to mark its register as ready if it's a misc register.
+ if (hb_it->newPhysReg >= maxPhysicalRegs) {
+ scoreboard->setReg(hb_it->newPhysReg);
+ }
+
historyBuffer[tid].erase(hb_it++);
++renameUndoneMaps;
diff --git a/src/cpu/o3/thread_state.hh b/src/cpu/o3/thread_state.hh
index b6f2e14c0..0247deb52 100644
--- a/src/cpu/o3/thread_state.hh
+++ b/src/cpu/o3/thread_state.hh
@@ -31,8 +31,11 @@
#ifndef __CPU_O3_THREAD_STATE_HH__
#define __CPU_O3_THREAD_STATE_HH__
+#include "base/callback.hh"
+#include "base/output.hh"
#include "cpu/thread_context.hh"
#include "cpu/thread_state.hh"
+#include "sim/sim_exit.hh"
class Event;
class Process;
@@ -75,8 +78,22 @@ struct O3ThreadState : public ThreadState {
#if FULL_SYSTEM
O3ThreadState(O3CPU *_cpu, int _thread_num)
: ThreadState(-1, _thread_num),
- inSyscall(0), trapPending(0)
- { }
+ cpu(_cpu), inSyscall(0), trapPending(0)
+ {
+ if (cpu->params->profile) {
+ profile = new FunctionProfile(cpu->params->system->kernelSymtab);
+ Callback *cb =
+ new MakeCallback<O3ThreadState,
+ &O3ThreadState::dumpFuncProfile>(this);
+ registerExitCallback(cb);
+ }
+
+ // let's fill with a dummy node for now so we don't get a segfault
+ // on the first cycle when there's no node available.
+ static ProfileNode dummyNode;
+ profileNode = &dummyNode;
+ profilePC = 3;
+ }
#else
O3ThreadState(O3CPU *_cpu, int _thread_num, Process *_process, int _asid,
MemObject *mem)
@@ -95,6 +112,14 @@ struct O3ThreadState : public ThreadState {
/** Handles the syscall. */
void syscall(int64_t callnum) { process->syscall(callnum, tc); }
#endif
+
+#if FULL_SYSTEM
+ void dumpFuncProfile()
+ {
+ std::ostream *os = simout.create(csprintf("profile.%s.dat", cpu->name()));
+ profile->dump(xcProxy, *os);
+ }
+#endif
};
#endif // __CPU_O3_THREAD_STATE_HH__
diff --git a/src/cpu/o3/tournament_pred.cc b/src/cpu/o3/tournament_pred.cc
index 7cf78dcb1..ffb941c77 100644
--- a/src/cpu/o3/tournament_pred.cc
+++ b/src/cpu/o3/tournament_pred.cc
@@ -62,6 +62,8 @@ TournamentBP::TournamentBP(unsigned _localPredictorSize,
for (int i = 0; i < localPredictorSize; ++i)
localCtrs[i].setBits(localCtrBits);
+ localPredictorMask = floorPow2(localPredictorSize) - 1;
+
if (!isPowerOf2(localHistoryTableSize)) {
fatal("Invalid local history table size!\n");
}
@@ -158,7 +160,7 @@ TournamentBP::lookup(Addr &branch_addr, void * &bp_history)
//Lookup in the local predictor to get its branch prediction
local_history_idx = calcLocHistIdx(branch_addr);
local_predictor_idx = localHistoryTable[local_history_idx]
- & localHistoryMask;
+ & localPredictorMask;
local_prediction = localCtrs[local_predictor_idx].read() > threshold;
//Lookup in the global predictor to get its branch prediction
@@ -176,7 +178,8 @@ TournamentBP::lookup(Addr &branch_addr, void * &bp_history)
bp_history = (void *)history;
assert(globalHistory < globalPredictorSize &&
- local_history_idx < localPredictorSize);
+ local_history_idx < localHistoryTableSize &&
+ local_predictor_idx < localPredictorSize);
// Commented code is for doing speculative update of counters and
// all histories.
@@ -234,7 +237,7 @@ TournamentBP::update(Addr &branch_addr, bool taken, void *bp_history)
// Get the local predictor's current prediction
local_history_idx = calcLocHistIdx(branch_addr);
local_predictor_hist = localHistoryTable[local_history_idx];
- local_predictor_idx = local_predictor_hist & localHistoryMask;
+ local_predictor_idx = local_predictor_hist & localPredictorMask;
// Update the choice predictor to tell it which one was correct if
// there was a prediction.
@@ -256,6 +259,7 @@ TournamentBP::update(Addr &branch_addr, bool taken, void *bp_history)
}
assert(globalHistory < globalPredictorSize &&
+ local_history_idx < localHistoryTableSize &&
local_predictor_idx < localPredictorSize);
// Update the counters and local history with the proper
diff --git a/src/cpu/o3/tournament_pred.hh b/src/cpu/o3/tournament_pred.hh
index 66b4aaae2..472944910 100644
--- a/src/cpu/o3/tournament_pred.hh
+++ b/src/cpu/o3/tournament_pred.hh
@@ -159,6 +159,9 @@ class TournamentBP
/** Size of the local predictor. */
unsigned localPredictorSize;
+ /** Mask to get the proper index bits into the predictor. */
+ unsigned localPredictorMask;
+
/** Number of bits of the local predictor's counters. */
unsigned localCtrBits;
diff --git a/src/cpu/ozone/checker_builder.cc b/src/cpu/ozone/checker_builder.cc
index c372e51d6..99ba3e308 100644
--- a/src/cpu/ozone/checker_builder.cc
+++ b/src/cpu/ozone/checker_builder.cc
@@ -65,6 +65,8 @@ BEGIN_DECLARE_SIM_OBJECT_PARAMS(OzoneChecker)
Param<Counter> max_insts_all_threads;
Param<Counter> max_loads_any_thread;
Param<Counter> max_loads_all_threads;
+ Param<Counter> stats_reset_inst;
+ Param<Tick> progress_interval;
#if FULL_SYSTEM
SimObjectParam<AlphaITB *> itb;
@@ -79,6 +81,7 @@ BEGIN_DECLARE_SIM_OBJECT_PARAMS(OzoneChecker)
Param<bool> defer_registration;
Param<bool> exitOnError;
+ Param<bool> updateOnError;
Param<bool> warnOnlyOnLoadError;
Param<bool> function_trace;
Param<Tick> function_trace_start;
@@ -95,6 +98,9 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(OzoneChecker)
"terminate when any thread reaches this load count"),
INIT_PARAM(max_loads_all_threads,
"terminate when all threads have reached this load count"),
+ INIT_PARAM(stats_reset_inst,
+ "blah"),
+ INIT_PARAM_DFLT(progress_interval, "CPU Progress Interval", 0),
#if FULL_SYSTEM
INIT_PARAM(itb, "Instruction TLB"),
@@ -110,6 +116,7 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(OzoneChecker)
INIT_PARAM(defer_registration, "defer system registration (for sampling)"),
INIT_PARAM(exitOnError, "exit on error"),
+ INIT_PARAM(updateOnError, "Update the checker with the main CPU's state on error"),
INIT_PARAM_DFLT(warnOnlyOnLoadError, "warn, but don't exit, if a load "
"result errors", false),
INIT_PARAM(function_trace, "Enable function trace"),
@@ -127,7 +134,9 @@ CREATE_SIM_OBJECT(OzoneChecker)
params->max_insts_all_threads = 0;
params->max_loads_any_thread = 0;
params->max_loads_all_threads = 0;
+ params->stats_reset_inst = 0;
params->exitOnError = exitOnError;
+ params->updateOnError = updateOnError;
params->warnOnlyOnLoadError = warnOnlyOnLoadError;
params->deferRegistration = defer_registration;
params->functionTrace = function_trace;
@@ -140,6 +149,10 @@ CREATE_SIM_OBJECT(OzoneChecker)
temp = max_insts_all_threads;
temp = max_loads_any_thread;
temp = max_loads_all_threads;
+ temp = stats_reset_inst;
+ Tick temp2 = progress_interval;
+ temp2++;
+ params->progress_interval = 0;
#if FULL_SYSTEM
params->itb = itb;
diff --git a/src/cpu/ozone/cpu.hh b/src/cpu/ozone/cpu.hh
index e411c12bd..ece68282f 100644
--- a/src/cpu/ozone/cpu.hh
+++ b/src/cpu/ozone/cpu.hh
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2005 The Regents of The University of Michigan
+ * Copyright (c) 2006 The Regents of The University of Michigan
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -81,13 +81,13 @@ template <class>
class Checker;
/**
- * Declaration of Out-of-Order CPU class. Basically it is a SimpleCPU with
- * simple out-of-order capabilities added to it. It is still a 1 CPI machine
- * (?), but is capable of handling cache misses. Basically it models having
- * a ROB/IQ by only allowing a certain amount of instructions to execute while
- * the cache miss is outstanding.
+ * Light weight out of order CPU model that approximates an out of
+ * order CPU. It is separated into a front end and a back end, with
+ * the template parameter Impl describing the classes used for each.
+ * The goal is to be able to specify through the Impl the class to use
+ * for the front end and back end, with different classes used to
+ * model different levels of detail.
*/
-
template <class Impl>
class OzoneCPU : public BaseCPU
{
@@ -273,6 +273,7 @@ class OzoneCPU : public BaseCPU
typedef OzoneThreadState<Impl> ImplState;
private:
+ // Committed thread state for the OzoneCPU.
OzoneThreadState<Impl> thread;
public:
@@ -310,12 +311,6 @@ class OzoneCPU : public BaseCPU
tickEvent.squash();
}
- private:
- Trace::InstRecord *traceData;
-
- template<typename T>
- void trace_data(T data);
-
public:
enum Status {
Running,
@@ -326,8 +321,6 @@ class OzoneCPU : public BaseCPU
Status _status;
public:
- bool checkInterrupts;
-
void post_interrupt(int int_num, int index);
void zero_fill_64(Addr addr) {
@@ -379,6 +372,7 @@ class OzoneCPU : public BaseCPU
FrontEnd *frontEnd;
BackEnd *backEnd;
+
private:
Status status() const { return _status; }
void setStatus(Status new_status) { _status = new_status; }
@@ -410,12 +404,11 @@ class OzoneCPU : public BaseCPU
// number of idle cycles
Stats::Average<> notIdleFraction;
Stats::Formula idleFraction;
- public:
+ public:
virtual void serialize(std::ostream &os);
virtual void unserialize(Checkpoint *cp, const std::string &section);
-
#if FULL_SYSTEM
/** Translates instruction requestion. */
Fault translateInstReq(RequestPtr &req, OzoneThreadState<Impl> *thread)
@@ -582,12 +575,9 @@ class OzoneCPU : public BaseCPU
Fault copy(Addr dest);
- InstSeqNum globalSeqNum;
-
public:
void squashFromTC();
- // @todo: This can be a useful debug function. Implement it.
void dumpInsts() { frontEnd->dumpInsts(); }
#if FULL_SYSTEM
@@ -605,7 +595,6 @@ class OzoneCPU : public BaseCPU
ThreadContext *tcBase() { return tc; }
- bool decoupledFrontEnd;
struct CommStruct {
InstSeqNum doneSeqNum;
InstSeqNum nonSpecSeqNum;
@@ -614,8 +603,13 @@ class OzoneCPU : public BaseCPU
bool stall;
};
+
+ InstSeqNum globalSeqNum;
+
TimeBuffer<CommStruct> comm;
+ bool decoupledFrontEnd;
+
bool lockFlag;
Stats::Scalar<> quiesceCycles;
diff --git a/src/cpu/ozone/cpu_builder.cc b/src/cpu/ozone/cpu_builder.cc
index e239b7a94..e3e4ec433 100644
--- a/src/cpu/ozone/cpu_builder.cc
+++ b/src/cpu/ozone/cpu_builder.cc
@@ -63,6 +63,7 @@ SimObjectParam<System *> system;
Param<int> cpu_id;
SimObjectParam<AlphaITB *> itb;
SimObjectParam<AlphaDTB *> dtb;
+Param<Tick> profile;
#else
SimObjectVectorParam<Process *> workload;
//SimObjectParam<PageTable *> page_table;
@@ -76,16 +77,19 @@ Param<Counter> max_insts_any_thread;
Param<Counter> max_insts_all_threads;
Param<Counter> max_loads_any_thread;
Param<Counter> max_loads_all_threads;
+Param<Counter> stats_reset_inst;
+Param<Tick> progress_interval;
//SimObjectParam<BaseCache *> icache;
//SimObjectParam<BaseCache *> dcache;
Param<unsigned> cachePorts;
Param<unsigned> width;
+Param<unsigned> frontEndLatency;
Param<unsigned> frontEndWidth;
+Param<unsigned> backEndLatency;
Param<unsigned> backEndWidth;
Param<unsigned> backEndSquashLatency;
-Param<unsigned> backEndLatency;
Param<unsigned> maxInstBufferSize;
Param<unsigned> numPhysicalRegs;
Param<unsigned> maxOutstandingMemOps;
@@ -140,6 +144,7 @@ Param<unsigned> RASSize;
Param<unsigned> LQEntries;
Param<unsigned> SQEntries;
+Param<bool> lsqLimits;
Param<unsigned> LFSTSize;
Param<unsigned> SSITSize;
@@ -181,6 +186,7 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivOzoneCPU)
INIT_PARAM(cpu_id, "processor ID"),
INIT_PARAM(itb, "Instruction translation buffer"),
INIT_PARAM(dtb, "Data translation buffer"),
+ INIT_PARAM(profile, ""),
#else
INIT_PARAM(workload, "Processes to run"),
// INIT_PARAM(page_table, "Page table"),
@@ -204,16 +210,21 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivOzoneCPU)
"Terminate when all threads have reached this load"
"count",
0),
+ INIT_PARAM_DFLT(stats_reset_inst,
+ "blah",
+ 0),
+ INIT_PARAM_DFLT(progress_interval, "Progress interval", 0),
// INIT_PARAM_DFLT(icache, "L1 instruction cache", NULL),
// INIT_PARAM_DFLT(dcache, "L1 data cache", NULL),
INIT_PARAM_DFLT(cachePorts, "Cache Ports", 200),
INIT_PARAM_DFLT(width, "Width", 1),
+ INIT_PARAM_DFLT(frontEndLatency, "Front end latency", 1),
INIT_PARAM_DFLT(frontEndWidth, "Front end width", 1),
+ INIT_PARAM_DFLT(backEndLatency, "Back end latency", 1),
INIT_PARAM_DFLT(backEndWidth, "Back end width", 1),
INIT_PARAM_DFLT(backEndSquashLatency, "Back end squash latency", 1),
- INIT_PARAM_DFLT(backEndLatency, "Back end latency", 1),
INIT_PARAM_DFLT(maxInstBufferSize, "Maximum instruction buffer size", 16),
INIT_PARAM(numPhysicalRegs, "Number of physical registers"),
INIT_PARAM_DFLT(maxOutstandingMemOps, "Maximum outstanding memory operations", 4),
@@ -274,6 +285,7 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivOzoneCPU)
INIT_PARAM(LQEntries, "Number of load queue entries"),
INIT_PARAM(SQEntries, "Number of store queue entries"),
+ INIT_PARAM_DFLT(lsqLimits, "LSQ size limits dispatch", true),
INIT_PARAM(LFSTSize, "Last fetched store table size"),
INIT_PARAM(SSITSize, "Store set ID table size"),
@@ -336,6 +348,7 @@ CREATE_SIM_OBJECT(DerivOzoneCPU)
params->cpu_id = cpu_id;
params->itb = itb;
params->dtb = dtb;
+ params->profile = profile;
#else
params->workload = workload;
// params->pTable = page_table;
@@ -347,6 +360,8 @@ CREATE_SIM_OBJECT(DerivOzoneCPU)
params->max_insts_all_threads = max_insts_all_threads;
params->max_loads_any_thread = max_loads_any_thread;
params->max_loads_all_threads = max_loads_all_threads;
+ params->stats_reset_inst = stats_reset_inst;
+ params->progress_interval = progress_interval;
//
// Caches
@@ -357,6 +372,7 @@ CREATE_SIM_OBJECT(DerivOzoneCPU)
params->width = width;
params->frontEndWidth = frontEndWidth;
+ params->frontEndLatency = frontEndLatency;
params->backEndWidth = backEndWidth;
params->backEndSquashLatency = backEndSquashLatency;
params->backEndLatency = backEndLatency;
@@ -414,6 +430,7 @@ CREATE_SIM_OBJECT(DerivOzoneCPU)
params->LQEntries = LQEntries;
params->SQEntries = SQEntries;
+ params->lsqLimits = lsqLimits;
params->SSITSize = SSITSize;
params->LFSTSize = LFSTSize;
diff --git a/src/cpu/ozone/cpu_impl.hh b/src/cpu/ozone/cpu_impl.hh
index 80f18434c..5c8b5001d 100644
--- a/src/cpu/ozone/cpu_impl.hh
+++ b/src/cpu/ozone/cpu_impl.hh
@@ -50,7 +50,6 @@
#include "arch/alpha/types.hh"
#include "arch/vtophys.hh"
#include "base/callback.hh"
-//#include "base/remote_gdb.hh"
#include "cpu/profile.hh"
#include "kern/kernel_stats.hh"
#include "sim/faults.hh"
@@ -68,15 +67,6 @@
using namespace TheISA;
template <class Impl>
-template<typename T>
-void
-OzoneCPU<Impl>::trace_data(T data) {
- if (traceData) {
- traceData->setData(data);
- }
-}
-
-template <class Impl>
OzoneCPU<Impl>::TickEvent::TickEvent(OzoneCPU *c, int w)
: Event(&mainEventQueue, CPU_Tick_Pri), cpu(c), width(w)
{
@@ -112,7 +102,7 @@ OzoneCPU<Impl>::OzoneCPU(Params *p)
_status = Idle;
if (p->checker) {
-#if USE_CHECKER
+
BaseCPU *temp_checker = p->checker;
checker = dynamic_cast<Checker<DynInstPtr> *>(temp_checker);
checker->setMemory(mem);
@@ -126,6 +116,8 @@ OzoneCPU<Impl>::OzoneCPU(Params *p)
panic("Checker enabled but not compiled in!");
#endif
} else {
+ // If checker is not being used, then the xcProxy points
+ // directly to the CPU's ExecContext.
checker = NULL;
thread.tc = &ozoneTC;
tc = &ozoneTC;
@@ -138,7 +130,7 @@ OzoneCPU<Impl>::OzoneCPU(Params *p)
thread.setStatus(ThreadContext::Suspended);
#if FULL_SYSTEM
- /***** All thread state stuff *****/
+ // Setup thread state stuff.
thread.cpu = this;
thread.setTid(0);
@@ -187,12 +179,15 @@ OzoneCPU<Impl>::OzoneCPU(Params *p)
frontEnd->setBackEnd(backEnd);
backEnd->setFrontEnd(frontEnd);
- decoupledFrontEnd = p->decoupledFrontEnd;
-
globalSeqNum = 1;
+#if FULL_SYSTEM
checkInterrupts = false;
+#endif
+
+ lockFlag = 0;
+ // Setup rename table, initializing all values to ready.
for (int i = 0; i < TheISA::TotalNumRegs; ++i) {
thread.renameTable[i] = new DynInst(this);
thread.renameTable[i]->setResultReady();
@@ -233,8 +228,6 @@ OzoneCPU<Impl>::OzoneCPU(Params *p)
thread.setVirtPort(virt_port);
#endif
- lockFlag = 0;
-
DPRINTF(OzoneCPU, "OzoneCPU: Created Ozone cpu object.\n");
}
@@ -247,6 +240,7 @@ template <class Impl>
void
OzoneCPU<Impl>::switchOut()
{
+ BaseCPU::switchOut(_sampler);
switchCount = 0;
// Front end needs state from back end, so switch out the back end first.
backEnd->switchOut();
@@ -257,6 +251,8 @@ template <class Impl>
void
OzoneCPU<Impl>::signalSwitched()
{
+ // Only complete the switchout when both the front end and back
+ // end have signalled they are ready to switch.
if (++switchCount == 2) {
backEnd->doSwitchOut();
frontEnd->doSwitchOut();
@@ -266,6 +262,17 @@ OzoneCPU<Impl>::signalSwitched()
#endif
_status = SwitchedOut;
+#ifndef NDEBUG
+ // Loop through all registers
+ for (int i = 0; i < AlphaISA::TotalNumRegs; ++i) {
+ assert(thread.renameTable[i] == frontEnd->renameTable[i]);
+
+ assert(thread.renameTable[i] == backEnd->renameTable[i]);
+
+ DPRINTF(OzoneCPU, "Checking if register %i matches.\n", i);
+ }
+#endif
+
if (tickEvent.scheduled())
tickEvent.squash();
}
@@ -278,13 +285,25 @@ OzoneCPU<Impl>::takeOverFrom(BaseCPU *oldCPU)
{
BaseCPU::takeOverFrom(oldCPU);
+ thread.trapPending = false;
+ thread.inSyscall = false;
+
backEnd->takeOverFrom();
frontEnd->takeOverFrom();
+ frontEnd->renameTable.copyFrom(thread.renameTable);
+ backEnd->renameTable.copyFrom(thread.renameTable);
assert(!tickEvent.scheduled());
+#ifndef NDEBUG
+ // Check rename table.
+ for (int i = 0; i < TheISA::TotalNumRegs; ++i) {
+ assert(thread.renameTable[i]->isResultReady());
+ }
+#endif
+
// @todo: Fix hardcoded number
// Clear out any old information in time buffer.
- for (int i = 0; i < 6; ++i) {
+ for (int i = 0; i < 15; ++i) {
comm.advance();
}
@@ -316,6 +335,10 @@ OzoneCPU<Impl>::activateContext(int thread_num, int delay)
notIdleFraction++;
scheduleTickEvent(delay);
_status = Running;
+#if FULL_SYSTEM
+ if (thread.quiesceEvent && thread.quiesceEvent->scheduled())
+ thread.quiesceEvent->deschedule();
+#endif
thread.setStatus(ThreadContext::Active);
frontEnd->wakeFromQuiesce();
}
@@ -393,7 +416,7 @@ template <class Impl>
void
OzoneCPU<Impl>::resetStats()
{
- startNumInst = numInst;
+// startNumInst = numInst;
notIdleFraction = (_status != Idle);
}
@@ -441,6 +464,15 @@ OzoneCPU<Impl>::serialize(std::ostream &os)
ozoneTC.serialize(os);
nameOut(os, csprintf("%s.tickEvent", name()));
tickEvent.serialize(os);
+
+ // Use SimpleThread's ability to checkpoint to make it easier to
+ // write out the registers. Also make this static so it doesn't
+ // get instantiated multiple times (causes a panic in statistics).
+ static CPUExecContext temp;
+
+ nameOut(os, csprintf("%s.xc.0", name()));
+ temp.copyXC(thread.getXCProxy());
+ temp.serialize(os);
}
template <class Impl>
@@ -451,6 +483,15 @@ OzoneCPU<Impl>::unserialize(Checkpoint *cp, const std::string &section)
UNSERIALIZE_ENUM(_status);
ozoneTC.unserialize(cp, csprintf("%s.tc", section));
tickEvent.unserialize(cp, csprintf("%s.tickEvent", section));
+
+ // Use SimpleThread's ability to checkpoint to make it easier to
+ // read in the registers. Also make this static so it doesn't
+ // get instantiated multiple times (causes a panic in statistics).
+ static CPUExecContext temp;
+
+ temp.copyXC(thread.getXCProxy());
+ temp.unserialize(cp, csprintf("%s.xc.0", section));
+ thread.getXCProxy()->copyArchRegs(temp.getProxy());
}
template <class Impl>
@@ -810,7 +851,9 @@ OzoneCPU<Impl>::OzoneTC::halt()
template <class Impl>
void
OzoneCPU<Impl>::OzoneTC::dumpFuncProfile()
-{ }
+{
+ thread->dumpFuncProfile();
+}
#endif
template <class Impl>
@@ -829,6 +872,7 @@ OzoneCPU<Impl>::OzoneTC::takeOverFrom(ThreadContext *old_context)
copyArchRegs(old_context);
setCpuId(old_context->readCpuId());
+ thread->inst = old_context->getInst();
#if !FULL_SYSTEM
setFuncExeInst(old_context->readFuncExeInst());
#else
@@ -842,6 +886,7 @@ OzoneCPU<Impl>::OzoneTC::takeOverFrom(ThreadContext *old_context)
thread->quiesceEvent->tc = this;
}
+ // Copy kernel stats pointer from old context.
thread->kernelStats = old_context->getKernelStats();
// storeCondFailures = 0;
cpu->lockFlag = false;
@@ -863,7 +908,11 @@ OzoneCPU<Impl>::OzoneTC::regStats(const std::string &name)
template <class Impl>
void
OzoneCPU<Impl>::OzoneTC::serialize(std::ostream &os)
-{ }
+{
+ // Once serialization is added, serialize the quiesce event and
+ // kernel stats. Will need to make sure there aren't multiple
+ // things that serialize them.
+}
template <class Impl>
void
@@ -896,16 +945,14 @@ template <class Impl>
void
OzoneCPU<Impl>::OzoneTC::profileClear()
{
- if (thread->profile)
- thread->profile->clear();
+ thread->profileClear();
}
template <class Impl>
void
OzoneCPU<Impl>::OzoneTC::profileSample()
{
- if (thread->profile)
- thread->profile->sample(thread->profileNode, thread->profilePC);
+ thread->profileSample();
}
#endif
@@ -916,7 +963,6 @@ OzoneCPU<Impl>::OzoneTC::getThreadNum()
return thread->readTid();
}
-// Also somewhat obnoxious. Really only used for the TLB fault.
template <class Impl>
TheISA::MachInst
OzoneCPU<Impl>::OzoneTC::getInst()
@@ -934,14 +980,20 @@ OzoneCPU<Impl>::OzoneTC::copyArchRegs(ThreadContext *tc)
cpu->frontEnd->setPC(thread->PC);
cpu->frontEnd->setNextPC(thread->nextPC);
- for (int i = 0; i < TheISA::TotalNumRegs; ++i) {
- if (i < TheISA::FP_Base_DepTag) {
- thread->renameTable[i]->setIntResult(tc->readIntReg(i));
- } else if (i < (TheISA::FP_Base_DepTag + TheISA::NumFloatRegs)) {
- int fp_idx = i - TheISA::FP_Base_DepTag;
- thread->renameTable[i]->setDoubleResult(
- tc->readFloatReg(fp_idx, 64));
- }
+ // First loop through the integer registers.
+ for (int i = 0; i < TheISA::NumIntRegs; ++i) {
+/* DPRINTF(OzoneCPU, "Copying over register %i, had data %lli, "
+ "now has data %lli.\n",
+ i, thread->renameTable[i]->readIntResult(),
+ tc->readIntReg(i));
+*/
+ thread->renameTable[i]->setIntResult(tc->readIntReg(i));
+ }
+
+ // Then loop through the floating point registers.
+ for (int i = 0; i < TheISA::NumFloatRegs; ++i) {
+ int fp_idx = i + TheISA::FP_Base_DepTag;
+ thread->renameTable[fp_idx]->setIntResult(tc->readFloatRegBits(i));
}
#if !FULL_SYSTEM
diff --git a/src/cpu/ozone/front_end.hh b/src/cpu/ozone/front_end.hh
index 3ed3c4d18..5ffd3666e 100644
--- a/src/cpu/ozone/front_end.hh
+++ b/src/cpu/ozone/front_end.hh
@@ -34,6 +34,7 @@
#include <deque>
#include "arch/utility.hh"
+#include "base/timebuf.hh"
#include "cpu/inst_seq.hh"
#include "cpu/o3/bpred_unit.hh"
#include "cpu/ozone/rename_table.hh"
@@ -246,15 +247,21 @@ class FrontEnd
void dumpInsts();
private:
+ TimeBuffer<int> numInstsReady;
+
typedef typename std::deque<DynInstPtr> InstBuff;
typedef typename InstBuff::iterator InstBuffIt;
+ InstBuff feBuffer;
+
InstBuff instBuffer;
int instBufferSize;
int maxInstBufferSize;
+ int latency;
+
int width;
int freeRegs;
diff --git a/src/cpu/ozone/front_end_impl.hh b/src/cpu/ozone/front_end_impl.hh
index 1b120460a..d34716de6 100644
--- a/src/cpu/ozone/front_end_impl.hh
+++ b/src/cpu/ozone/front_end_impl.hh
@@ -92,8 +92,10 @@ FrontEnd<Impl>::FrontEnd(Params *params)
: branchPred(params),
icachePort(this),
mem(params->mem),
+ numInstsReady(params->frontEndLatency, 0),
instBufferSize(0),
maxInstBufferSize(params->maxInstBufferSize),
+ latency(params->frontEndLatency),
width(params->frontEndWidth),
freeRegs(params->numPhysicalRegs),
numPhysRegs(params->numPhysicalRegs),
@@ -326,6 +328,18 @@ FrontEnd<Impl>::tick()
if (switchedOut)
return;
+ for (int insts_to_queue = numInstsReady[-latency];
+ !instBuffer.empty() && insts_to_queue;
+ --insts_to_queue)
+ {
+ DPRINTF(FE, "Transferring instruction [sn:%lli] to the feBuffer\n",
+ instBuffer.front()->seqNum);
+ feBuffer.push_back(instBuffer.front());
+ instBuffer.pop_front();
+ }
+
+ numInstsReady.advance();
+
// @todo: Maybe I want to just have direct communication...
if (fromCommit->doneSeqNum) {
branchPred.update(fromCommit->doneSeqNum, 0);
@@ -339,8 +353,8 @@ FrontEnd<Impl>::tick()
cacheBlkValid = true;
status = Running;
- if (barrierInst)
- status = SerializeBlocked;
+// if (barrierInst)
+// status = SerializeBlocked;
if (freeRegs <= 0)
status = RenameBlocked;
checkBE();
@@ -414,11 +428,12 @@ FrontEnd<Impl>::tick()
// latency
instBuffer.push_back(inst);
++instBufferSize;
+ numInstsReady[0]++;
++num_inst;
#if FULL_SYSTEM
if (inst->isQuiesce()) {
- warn("%lli: Quiesce instruction encountered, halting fetch!", curTick);
+// warn("%lli: Quiesce instruction encountered, halting fetch!", curTick);
status = QuiescePending;
break;
}
@@ -572,10 +587,10 @@ FrontEnd<Impl>::processBarriers(DynInstPtr &inst)
// Change status over to SerializeBlocked so that other stages know
// what this is blocked on.
- status = SerializeBlocked;
+// status = SerializeBlocked;
- barrierInst = inst;
- return true;
+// barrierInst = inst;
+// return true;
} else if ((inst->isStoreConditional() || inst->isSerializeAfter())
&& !inst->isSerializeHandled()) {
DPRINTF(FE, "Serialize after instruction encountered.\n");
@@ -620,6 +635,7 @@ FrontEnd<Impl>::handleFault(Fault &fault)
instruction->fault = fault;
instruction->setCanIssue();
instBuffer.push_back(instruction);
+ numInstsReady[0]++;
++instBufferSize;
}
@@ -649,6 +665,21 @@ FrontEnd<Impl>::squash(const InstSeqNum &squash_num, const Addr &next_PC,
freeRegs+= inst->numDestRegs();
}
+ while (!feBuffer.empty() &&
+ feBuffer.back()->seqNum > squash_num) {
+ DynInstPtr inst = feBuffer.back();
+
+ DPRINTF(FE, "Squashing instruction [sn:%lli] PC %#x\n",
+ inst->seqNum, inst->readPC());
+
+ inst->clearDependents();
+
+ feBuffer.pop_back();
+ --instBufferSize;
+
+ freeRegs+= inst->numDestRegs();
+ }
+
// Copy over rename table from the back end.
renameTable.copyFrom(backEnd->renameTable);
@@ -666,12 +697,12 @@ FrontEnd<Impl>::squash(const InstSeqNum &squash_num, const Addr &next_PC,
DPRINTF(FE, "Squashing outstanding Icache access.\n");
memReq = NULL;
}
-
+/*
if (status == SerializeBlocked) {
assert(barrierInst->seqNum > squash_num);
barrierInst = NULL;
}
-
+*/
// Unless this squash originated from the front end, we're probably
// in running mode now.
// Actually might want to make this latency dependent.
@@ -683,13 +714,22 @@ template <class Impl>
typename Impl::DynInstPtr
FrontEnd<Impl>::getInst()
{
- if (instBufferSize == 0) {
+ if (feBuffer.empty()) {
return NULL;
}
- DynInstPtr inst = instBuffer.front();
+ DynInstPtr inst = feBuffer.front();
- instBuffer.pop_front();
+ if (inst->isSerializeBefore() || inst->isIprAccess()) {
+ DPRINTF(FE, "Back end is getting a serialize before inst\n");
+ if (!backEnd->robEmpty()) {
+ DPRINTF(FE, "Rob is not empty yet, not returning inst\n");
+ return NULL;
+ }
+ inst->clearSerializeBefore();
+ }
+
+ feBuffer.pop_front();
--instBufferSize;
@@ -784,11 +824,11 @@ FrontEnd<Impl>::updateStatus()
}
if (status == BEBlocked && !be_block) {
- if (barrierInst) {
- status = SerializeBlocked;
- } else {
+// if (barrierInst) {
+// status = SerializeBlocked;
+// } else {
status = Running;
- }
+// }
ret_val = true;
}
return ret_val;
@@ -810,6 +850,7 @@ template <class Impl>
typename Impl::DynInstPtr
FrontEnd<Impl>::getInstFromCacheline()
{
+/*
if (status == SerializeComplete) {
DynInstPtr inst = barrierInst;
status = Running;
@@ -817,7 +858,7 @@ FrontEnd<Impl>::getInstFromCacheline()
inst->clearSerializeBefore();
return inst;
}
-
+*/
InstSeqNum inst_seq;
MachInst inst;
// @todo: Fix this magic number used here to handle word offset (and
@@ -932,6 +973,7 @@ FrontEnd<Impl>::doSwitchOut()
squash(0, 0);
instBuffer.clear();
instBufferSize = 0;
+ feBuffer.clear();
status = Idle;
}
diff --git a/src/cpu/ozone/inorder_back_end_impl.hh b/src/cpu/ozone/inorder_back_end_impl.hh
index 701fc0ee9..16ebac163 100644
--- a/src/cpu/ozone/inorder_back_end_impl.hh
+++ b/src/cpu/ozone/inorder_back_end_impl.hh
@@ -284,7 +284,7 @@ InorderBackEnd<Impl>::executeInsts()
}
inst->setExecuted();
- inst->setCompleted();
+ inst->setResultReady();
inst->setCanCommit();
instList.pop_front();
diff --git a/src/cpu/ozone/inst_queue_impl.hh b/src/cpu/ozone/inst_queue_impl.hh
index f2d80e621..32a940241 100644
--- a/src/cpu/ozone/inst_queue_impl.hh
+++ b/src/cpu/ozone/inst_queue_impl.hh
@@ -850,13 +850,13 @@ template <class Impl>
void
InstQueue<Impl>::addReadyMemInst(DynInstPtr &ready_inst)
{
- OpClass op_class = ready_inst->opClass();
+// OpClass op_class = ready_inst->opClass();
readyInsts.push(ready_inst);
DPRINTF(IQ, "Instruction is ready to issue, putting it onto "
"the ready list, PC %#x opclass:%i [sn:%lli].\n",
- ready_inst->readPC(), op_class, ready_inst->seqNum);
+ ready_inst->readPC(), ready_inst->opClass(), ready_inst->seqNum);
}
/*
template <class Impl>
@@ -1177,11 +1177,11 @@ InstQueue<Impl>::addIfReady(DynInstPtr &inst)
return;
}
- OpClass op_class = inst->opClass();
+// OpClass op_class = inst->opClass();
DPRINTF(IQ, "Instruction is ready to issue, putting it onto "
"the ready list, PC %#x opclass:%i [sn:%lli].\n",
- inst->readPC(), op_class, inst->seqNum);
+ inst->readPC(), inst->opClass(), inst->seqNum);
readyInsts.push(inst);
}
diff --git a/src/cpu/ozone/lw_back_end.hh b/src/cpu/ozone/lw_back_end.hh
index d836ceebd..49c6a1ae2 100644
--- a/src/cpu/ozone/lw_back_end.hh
+++ b/src/cpu/ozone/lw_back_end.hh
@@ -80,7 +80,7 @@ class LWBackEnd
TimeBuffer<IssueToExec> i2e;
typename TimeBuffer<IssueToExec>::wire instsToExecute;
TimeBuffer<ExecToCommit> e2c;
- TimeBuffer<Writeback> numInstsToWB;
+ TimeBuffer<int> numInstsToWB;
TimeBuffer<CommStruct> *comm;
typename TimeBuffer<CommStruct>::wire toIEW;
@@ -139,7 +139,7 @@ class LWBackEnd
Tick lastCommitCycle;
- bool robEmpty() { return instList.empty(); }
+ bool robEmpty() { return numInsts == 0; }
bool isFull() { return numInsts >= numROBEntries; }
bool isBlocked() { return status == Blocked || dispatchStatus == Blocked; }
@@ -194,6 +194,7 @@ class LWBackEnd
}
void instToCommit(DynInstPtr &inst);
+ void readyInstsForCommit();
void switchOut();
void doSwitchOut();
@@ -255,12 +256,13 @@ class LWBackEnd
RenameTable<Impl> renameTable;
private:
+ int latency;
+
// General back end width. Used if the more specific isn't given.
int width;
// Dispatch width.
int dispatchWidth;
- int numDispatchEntries;
int dispatchSize;
int waitingInsts;
@@ -285,6 +287,7 @@ class LWBackEnd
int numROBEntries;
int numInsts;
+ bool lsqLimits;
std::set<InstSeqNum> waitingMemOps;
typedef std::set<InstSeqNum>::iterator MemIt;
@@ -295,9 +298,6 @@ class LWBackEnd
InstSeqNum squashSeqNum;
Addr squashNextPC;
- Fault faultFromFetch;
- bool fetchHasFault;
-
bool switchedOut;
bool switchPending;
@@ -321,8 +321,6 @@ class LWBackEnd
std::list<DynInstPtr> replayList;
std::list<DynInstPtr> writeback;
- int latency;
-
int squashLatency;
bool exactFullStall;
@@ -331,37 +329,39 @@ class LWBackEnd
/* Stats::Scalar<> dcacheStallCycles;
Counter lastDcacheStall;
*/
- Stats::Vector<> rob_cap_events;
- Stats::Vector<> rob_cap_inst_count;
- Stats::Vector<> iq_cap_events;
- Stats::Vector<> iq_cap_inst_count;
+ Stats::Vector<> robCapEvents;
+ Stats::Vector<> robCapInstCount;
+ Stats::Vector<> iqCapEvents;
+ Stats::Vector<> iqCapInstCount;
// total number of instructions executed
- Stats::Vector<> exe_inst;
- Stats::Vector<> exe_swp;
- Stats::Vector<> exe_nop;
- Stats::Vector<> exe_refs;
- Stats::Vector<> exe_loads;
- Stats::Vector<> exe_branches;
+ Stats::Vector<> exeInst;
+ Stats::Vector<> exeSwp;
+ Stats::Vector<> exeNop;
+ Stats::Vector<> exeRefs;
+ Stats::Vector<> exeLoads;
+ Stats::Vector<> exeBranches;
- Stats::Vector<> issued_ops;
+ Stats::Vector<> issuedOps;
// total number of loads forwaded from LSQ stores
- Stats::Vector<> lsq_forw_loads;
+ Stats::Vector<> lsqForwLoads;
// total number of loads ignored due to invalid addresses
- Stats::Vector<> inv_addr_loads;
+ Stats::Vector<> invAddrLoads;
// total number of software prefetches ignored due to invalid addresses
- Stats::Vector<> inv_addr_swpfs;
+ Stats::Vector<> invAddrSwpfs;
// ready loads blocked due to memory disambiguation
- Stats::Vector<> lsq_blocked_loads;
+ Stats::Vector<> lsqBlockedLoads;
Stats::Scalar<> lsqInversion;
- Stats::Vector<> n_issued_dist;
- Stats::VectorDistribution<> issue_delay_dist;
+ Stats::Vector<> nIssuedDist;
+/*
+ Stats::VectorDistribution<> issueDelayDist;
- Stats::VectorDistribution<> queue_res_dist;
+ Stats::VectorDistribution<> queueResDist;
+*/
/*
Stats::Vector<> stat_fu_busy;
Stats::Vector2d<> stat_fuBusy;
@@ -379,37 +379,37 @@ class LWBackEnd
Stats::Formula commit_ipb;
Stats::Formula lsq_inv_rate;
*/
- Stats::Vector<> writeback_count;
- Stats::Vector<> producer_inst;
- Stats::Vector<> consumer_inst;
- Stats::Vector<> wb_penalized;
+ Stats::Vector<> writebackCount;
+ Stats::Vector<> producerInst;
+ Stats::Vector<> consumerInst;
+ Stats::Vector<> wbPenalized;
- Stats::Formula wb_rate;
- Stats::Formula wb_fanout;
- Stats::Formula wb_penalized_rate;
+ Stats::Formula wbRate;
+ Stats::Formula wbFanout;
+ Stats::Formula wbPenalizedRate;
// total number of instructions committed
- Stats::Vector<> stat_com_inst;
- Stats::Vector<> stat_com_swp;
- Stats::Vector<> stat_com_refs;
- Stats::Vector<> stat_com_loads;
- Stats::Vector<> stat_com_membars;
- Stats::Vector<> stat_com_branches;
+ Stats::Vector<> statComInst;
+ Stats::Vector<> statComSwp;
+ Stats::Vector<> statComRefs;
+ Stats::Vector<> statComLoads;
+ Stats::Vector<> statComMembars;
+ Stats::Vector<> statComBranches;
- Stats::Distribution<> n_committed_dist;
+ Stats::Distribution<> nCommittedDist;
- Stats::Scalar<> commit_eligible_samples;
- Stats::Vector<> commit_eligible;
+ Stats::Scalar<> commitEligibleSamples;
+ Stats::Vector<> commitEligible;
Stats::Vector<> squashedInsts;
Stats::Vector<> ROBSquashedInsts;
- Stats::Scalar<> ROB_fcount;
- Stats::Formula ROB_full_rate;
+ Stats::Scalar<> ROBFcount;
+ Stats::Formula ROBFullRate;
- Stats::Vector<> ROB_count; // cumulative ROB occupancy
- Stats::Formula ROB_occ_rate;
- Stats::VectorDistribution<> ROB_occ_dist;
+ Stats::Vector<> ROBCount; // cumulative ROB occupancy
+ Stats::Formula ROBOccRate;
+// Stats::VectorDistribution<> ROBOccDist;
public:
void dumpInsts();
diff --git a/src/cpu/ozone/lw_back_end_impl.hh b/src/cpu/ozone/lw_back_end_impl.hh
index a4f1d805e..f87a2bc57 100644
--- a/src/cpu/ozone/lw_back_end_impl.hh
+++ b/src/cpu/ozone/lw_back_end_impl.hh
@@ -141,13 +141,14 @@ LWBackEnd<Impl>::replayMemInst(DynInstPtr &inst)
template <class Impl>
LWBackEnd<Impl>::LWBackEnd(Params *params)
- : d2i(5, 5), i2e(5, 5), e2c(5, 5), numInstsToWB(5, 5),
+ : d2i(5, 5), i2e(5, 5), e2c(5, 5), numInstsToWB(params->backEndLatency, 0),
trapSquash(false), tcSquash(false),
- width(params->backEndWidth), exactFullStall(true)
+ latency(params->backEndLatency),
+ width(params->backEndWidth), lsqLimits(params->lsqLimits),
+ exactFullStall(true)
{
numROBEntries = params->numROBEntries;
numInsts = 0;
- numDispatchEntries = 32;
maxOutstandingMemOps = params->maxOutstandingMemOps;
numWaitingMemOps = 0;
waitingInsts = 0;
@@ -184,78 +185,79 @@ void
LWBackEnd<Impl>::regStats()
{
using namespace Stats;
- rob_cap_events
+ LSQ.regStats();
+
+ robCapEvents
.init(cpu->number_of_threads)
.name(name() + ".ROB:cap_events")
.desc("number of cycles where ROB cap was active")
.flags(total)
;
- rob_cap_inst_count
+ robCapInstCount
.init(cpu->number_of_threads)
.name(name() + ".ROB:cap_inst")
.desc("number of instructions held up by ROB cap")
.flags(total)
;
- iq_cap_events
+ iqCapEvents
.init(cpu->number_of_threads)
.name(name() +".IQ:cap_events" )
.desc("number of cycles where IQ cap was active")
.flags(total)
;
- iq_cap_inst_count
+ iqCapInstCount
.init(cpu->number_of_threads)
.name(name() + ".IQ:cap_inst")
.desc("number of instructions held up by IQ cap")
.flags(total)
;
-
- exe_inst
+ exeInst
.init(cpu->number_of_threads)
.name(name() + ".ISSUE:count")
.desc("number of insts issued")
.flags(total)
;
- exe_swp
+ exeSwp
.init(cpu->number_of_threads)
.name(name() + ".ISSUE:swp")
.desc("number of swp insts issued")
.flags(total)
;
- exe_nop
+ exeNop
.init(cpu->number_of_threads)
.name(name() + ".ISSUE:nop")
.desc("number of nop insts issued")
.flags(total)
;
- exe_refs
+ exeRefs
.init(cpu->number_of_threads)
.name(name() + ".ISSUE:refs")
.desc("number of memory reference insts issued")
.flags(total)
;
- exe_loads
+ exeLoads
.init(cpu->number_of_threads)
.name(name() + ".ISSUE:loads")
.desc("number of load insts issued")
.flags(total)
;
- exe_branches
+ exeBranches
.init(cpu->number_of_threads)
.name(name() + ".ISSUE:branches")
.desc("Number of branches issued")
.flags(total)
;
- issued_ops
+ issuedOps
.init(cpu->number_of_threads)
.name(name() + ".ISSUE:op_count")
.desc("number of insts issued")
@@ -272,28 +274,28 @@ LWBackEnd<Impl>::regStats()
//
// Other stats
//
- lsq_forw_loads
+ lsqForwLoads
.init(cpu->number_of_threads)
.name(name() + ".LSQ:forw_loads")
.desc("number of loads forwarded via LSQ")
.flags(total)
;
- inv_addr_loads
+ invAddrLoads
.init(cpu->number_of_threads)
.name(name() + ".ISSUE:addr_loads")
.desc("number of invalid-address loads")
.flags(total)
;
- inv_addr_swpfs
+ invAddrSwpfs
.init(cpu->number_of_threads)
.name(name() + ".ISSUE:addr_swpfs")
.desc("number of invalid-address SW prefetches")
.flags(total)
;
- lsq_blocked_loads
+ lsqBlockedLoads
.init(cpu->number_of_threads)
.name(name() + ".LSQ:blocked_loads")
.desc("number of ready loads not issued due to memory disambiguation")
@@ -305,51 +307,52 @@ LWBackEnd<Impl>::regStats()
.desc("Number of times LSQ instruction issued early")
;
- n_issued_dist
+ nIssuedDist
.init(issueWidth + 1)
.name(name() + ".ISSUE:issued_per_cycle")
.desc("Number of insts issued each cycle")
.flags(total | pdf | dist)
;
- issue_delay_dist
+/*
+ issueDelayDist
.init(Num_OpClasses,0,99,2)
.name(name() + ".ISSUE:")
.desc("cycles from operands ready to issue")
.flags(pdf | cdf)
;
- queue_res_dist
+ queueResDist
.init(Num_OpClasses, 0, 99, 2)
.name(name() + ".IQ:residence:")
.desc("cycles from dispatch to issue")
.flags(total | pdf | cdf )
;
for (int i = 0; i < Num_OpClasses; ++i) {
- queue_res_dist.subname(i, opClassStrings[i]);
+ queueResDist.subname(i, opClassStrings[i]);
}
-
- writeback_count
+*/
+ writebackCount
.init(cpu->number_of_threads)
.name(name() + ".WB:count")
.desc("cumulative count of insts written-back")
.flags(total)
;
- producer_inst
+ producerInst
.init(cpu->number_of_threads)
.name(name() + ".WB:producers")
.desc("num instructions producing a value")
.flags(total)
;
- consumer_inst
+ consumerInst
.init(cpu->number_of_threads)
.name(name() + ".WB:consumers")
.desc("num instructions consuming a value")
.flags(total)
;
- wb_penalized
+ wbPenalized
.init(cpu->number_of_threads)
.name(name() + ".WB:penalized")
.desc("number of instrctions required to write to 'other' IQ")
@@ -357,71 +360,71 @@ LWBackEnd<Impl>::regStats()
;
- wb_penalized_rate
+ wbPenalizedRate
.name(name() + ".WB:penalized_rate")
.desc ("fraction of instructions written-back that wrote to 'other' IQ")
.flags(total)
;
- wb_penalized_rate = wb_penalized / writeback_count;
+ wbPenalizedRate = wbPenalized / writebackCount;
- wb_fanout
+ wbFanout
.name(name() + ".WB:fanout")
.desc("average fanout of values written-back")
.flags(total)
;
- wb_fanout = producer_inst / consumer_inst;
+ wbFanout = producerInst / consumerInst;
- wb_rate
+ wbRate
.name(name() + ".WB:rate")
.desc("insts written-back per cycle")
.flags(total)
;
- wb_rate = writeback_count / cpu->numCycles;
+ wbRate = writebackCount / cpu->numCycles;
- stat_com_inst
+ statComInst
.init(cpu->number_of_threads)
.name(name() + ".COM:count")
.desc("Number of instructions committed")
.flags(total)
;
- stat_com_swp
+ statComSwp
.init(cpu->number_of_threads)
.name(name() + ".COM:swp_count")
.desc("Number of s/w prefetches committed")
.flags(total)
;
- stat_com_refs
+ statComRefs
.init(cpu->number_of_threads)
.name(name() + ".COM:refs")
.desc("Number of memory references committed")
.flags(total)
;
- stat_com_loads
+ statComLoads
.init(cpu->number_of_threads)
.name(name() + ".COM:loads")
.desc("Number of loads committed")
.flags(total)
;
- stat_com_membars
+ statComMembars
.init(cpu->number_of_threads)
.name(name() + ".COM:membars")
.desc("Number of memory barriers committed")
.flags(total)
;
- stat_com_branches
+ statComBranches
.init(cpu->number_of_threads)
.name(name() + ".COM:branches")
.desc("Number of branches committed")
.flags(total)
;
- n_committed_dist
+ nCommittedDist
.init(0,commitWidth,1)
.name(name() + ".COM:committed_per_cycle")
.desc("Number of insts commited each cycle")
@@ -441,14 +444,14 @@ LWBackEnd<Impl>::regStats()
// -> The standard deviation is computed only over cycles where
// we reached the BW limit
//
- commit_eligible
+ commitEligible
.init(cpu->number_of_threads)
.name(name() + ".COM:bw_limited")
.desc("number of insts not committed due to BW limits")
.flags(total)
;
- commit_eligible_samples
+ commitEligibleSamples
.name(name() + ".COM:bw_lim_events")
.desc("number cycles where commit BW limit reached")
;
@@ -465,37 +468,38 @@ LWBackEnd<Impl>::regStats()
.desc("Number of instructions removed from inst list when they reached the head of the ROB")
;
- ROB_fcount
+ ROBFcount
.name(name() + ".ROB:full_count")
.desc("number of cycles where ROB was full")
;
- ROB_count
+ ROBCount
.init(cpu->number_of_threads)
.name(name() + ".ROB:occupancy")
.desc(name() + ".ROB occupancy (cumulative)")
.flags(total)
;
- ROB_full_rate
+ ROBFullRate
.name(name() + ".ROB:full_rate")
.desc("ROB full per cycle")
;
- ROB_full_rate = ROB_fcount / cpu->numCycles;
+ ROBFullRate = ROBFcount / cpu->numCycles;
- ROB_occ_rate
+ ROBOccRate
.name(name() + ".ROB:occ_rate")
.desc("ROB occupancy rate")
.flags(total)
;
- ROB_occ_rate = ROB_count / cpu->numCycles;
-
- ROB_occ_dist
+ ROBOccRate = ROBCount / cpu->numCycles;
+/*
+ ROBOccDist
.init(cpu->number_of_threads,0,numROBEntries,2)
.name(name() + ".ROB:occ_dist")
.desc("ROB Occupancy per cycle")
.flags(total | cdf)
;
+*/
}
template <class Impl>
@@ -588,17 +592,21 @@ LWBackEnd<Impl>::tick()
{
DPRINTF(BE, "Ticking back end\n");
+ // Read in any done instruction information and update the IQ or LSQ.
+ updateStructures();
+
if (switchPending && robEmpty() && !LSQ.hasStoresToWB()) {
cpu->signalSwitched();
return;
}
- ROB_count[0]+= numInsts;
+ readyInstsForCommit();
- wbCycle = 0;
+ numInstsToWB.advance();
- // Read in any done instruction information and update the IQ or LSQ.
- updateStructures();
+ ROBCount[0]+= numInsts;
+
+ wbCycle = 0;
#if FULL_SYSTEM
checkInterrupts();
@@ -674,6 +682,10 @@ LWBackEnd<Impl>::dispatchInsts()
while (numInsts < numROBEntries &&
numWaitingMemOps < maxOutstandingMemOps) {
// Get instruction from front of time buffer
+ if (lsqLimits && LSQ.isFull()) {
+ break;
+ }
+
DynInstPtr inst = frontEnd->getInst();
if (!inst) {
break;
@@ -732,6 +744,7 @@ LWBackEnd<Impl>::dispatchInsts()
inst->setIssued();
inst->setExecuted();
inst->setCanCommit();
+ numInstsToWB[0]++;
} else {
DPRINTF(BE, "Instruction [sn:%lli] ready, addding to "
"exeList.\n",
@@ -866,8 +879,17 @@ LWBackEnd<Impl>::executeInsts()
if (inst->isLoad()) {
LSQ.executeLoad(inst);
} else if (inst->isStore()) {
- LSQ.executeStore(inst);
- if (inst->req && !(inst->req->getFlags() & LOCKED)) {
+ Fault fault = LSQ.executeStore(inst);
+
+ if (!inst->isStoreConditional() && fault == NoFault) {
+ inst->setExecuted();
+
+ instToCommit(inst);
+ } else if (fault != NoFault) {
+ // If the instruction faulted, then we need to send it along to commit
+ // without the instruction completing.
+ // Send this instruction to commit, also make sure iew stage
+ // realizes there is activity.
inst->setExecuted();
instToCommit(inst);
@@ -908,36 +930,54 @@ LWBackEnd<Impl>::executeInsts()
}
}
- issued_ops[0]+= num_executed;
- n_issued_dist[num_executed]++;
+ issuedOps[0]+= num_executed;
+ nIssuedDist[num_executed]++;
}
template<class Impl>
void
LWBackEnd<Impl>::instToCommit(DynInstPtr &inst)
{
-
DPRINTF(BE, "Sending instructions to commit [sn:%lli] PC %#x.\n",
inst->seqNum, inst->readPC());
if (!inst->isSquashed()) {
- DPRINTF(BE, "Writing back instruction [sn:%lli] PC %#x.\n",
- inst->seqNum, inst->readPC());
-
- inst->setCanCommit();
-
if (inst->isExecuted()) {
inst->setResultReady();
int dependents = wakeDependents(inst);
if (dependents) {
- producer_inst[0]++;
- consumer_inst[0]+= dependents;
+ producerInst[0]++;
+ consumerInst[0]+= dependents;
}
}
}
- writeback_count[0]++;
+ writeback.push_back(inst);
+
+ numInstsToWB[0]++;
+
+ writebackCount[0]++;
+}
+
+template <class Impl>
+void
+LWBackEnd<Impl>::readyInstsForCommit()
+{
+ for (int i = numInstsToWB[-latency];
+ !writeback.empty() && i;
+ --i)
+ {
+ DynInstPtr inst = writeback.front();
+ writeback.pop_front();
+ if (!inst->isSquashed()) {
+ DPRINTF(BE, "Writing back instruction [sn:%lli] PC %#x.\n",
+ inst->seqNum, inst->readPC());
+
+ inst->setCanCommit();
+ }
+ }
}
+
#if 0
template <class Impl>
void
@@ -1010,7 +1050,7 @@ LWBackEnd<Impl>::commitInst(int inst_num)
// or store inst. Signal backwards that it should be executed.
if (!inst->isExecuted()) {
if (inst->isNonSpeculative() ||
- inst->isStoreConditional() ||
+ (inst->isStoreConditional() && inst->getFault() == NoFault) ||
inst->isMemBarrier() ||
inst->isWriteBarrier()) {
#if !FULL_SYSTEM
@@ -1151,6 +1191,20 @@ LWBackEnd<Impl>::commitInst(int inst_num)
++freed_regs;
}
+#if FULL_SYSTEM
+ if (thread->profile) {
+// bool usermode =
+// (xc->readMiscReg(AlphaISA::IPR_DTB_CM) & 0x18) != 0;
+// thread->profilePC = usermode ? 1 : inst->readPC();
+ thread->profilePC = inst->readPC();
+ ProfileNode *node = thread->profile->consume(thread->getXCProxy(),
+ inst->staticInst);
+
+ if (node)
+ thread->profileNode = node;
+ }
+#endif
+
if (inst->traceData) {
inst->traceData->setFetchSeq(inst->seqNum);
inst->traceData->setCPSeq(thread->numInst);
@@ -1158,6 +1212,9 @@ LWBackEnd<Impl>::commitInst(int inst_num)
inst->traceData = NULL;
}
+ if (inst->isCopy())
+ panic("Should not commit any copy instructions!");
+
inst->clearDependents();
frontEnd->addFreeRegs(freed_regs);
@@ -1207,9 +1264,9 @@ LWBackEnd<Impl>::commitInsts()
while (!instList.empty() && inst_num < commitWidth) {
if (instList.back()->isSquashed()) {
instList.back()->clearDependents();
+ ROBSquashedInsts[instList.back()->threadNumber]++;
instList.pop_back();
--numInsts;
- ROBSquashedInsts[instList.back()->threadNumber]++;
continue;
}
@@ -1221,7 +1278,7 @@ LWBackEnd<Impl>::commitInsts()
break;
}
}
- n_committed_dist.sample(inst_num);
+ nCommittedDist.sample(inst_num);
}
template <class Impl>
@@ -1231,10 +1288,10 @@ LWBackEnd<Impl>::squash(const InstSeqNum &sn)
LSQ.squash(sn);
int freed_regs = 0;
- InstListIt waiting_list_end = waitingList.end();
+ InstListIt insts_end_it = waitingList.end();
InstListIt insts_it = waitingList.begin();
- while (insts_it != waiting_list_end && (*insts_it)->seqNum > sn)
+ while (insts_it != insts_end_it && (*insts_it)->seqNum > sn)
{
if ((*insts_it)->isSquashed()) {
++insts_it;
@@ -1260,6 +1317,7 @@ LWBackEnd<Impl>::squash(const InstSeqNum &sn)
while (!instList.empty() && (*insts_it)->seqNum > sn)
{
if ((*insts_it)->isSquashed()) {
+ panic("Instruction should not be already squashed and on list!");
++insts_it;
continue;
}
@@ -1291,18 +1349,6 @@ LWBackEnd<Impl>::squash(const InstSeqNum &sn)
--numInsts;
}
- insts_it = waitingList.begin();
- while (!waitingList.empty() && insts_it != waitingList.end()) {
- if ((*insts_it)->seqNum < sn) {
- ++insts_it;
- continue;
- }
- assert((*insts_it)->isSquashed());
-
- waitingList.erase(insts_it++);
- waitingInsts--;
- }
-
while (memBarrier && memBarrier->seqNum > sn) {
DPRINTF(BE, "[sn:%lli] Memory barrier squashed (or previously "
"squashed)\n", memBarrier->seqNum);
@@ -1320,6 +1366,18 @@ LWBackEnd<Impl>::squash(const InstSeqNum &sn)
}
}
+ insts_it = replayList.begin();
+ insts_end_it = replayList.end();
+ while (!replayList.empty() && insts_it != insts_end_it) {
+ if ((*insts_it)->seqNum < sn) {
+ ++insts_it;
+ continue;
+ }
+ assert((*insts_it)->isSquashed());
+
+ replayList.erase(insts_it++);
+ }
+
frontEnd->addFreeRegs(freed_regs);
}
@@ -1392,14 +1450,6 @@ LWBackEnd<Impl>::squashDueToMemBlocked(DynInstPtr &inst)
template <class Impl>
void
-LWBackEnd<Impl>::fetchFault(Fault &fault)
-{
- faultFromFetch = fault;
- fetchHasFault = true;
-}
-
-template <class Impl>
-void
LWBackEnd<Impl>::switchOut()
{
switchPending = true;
@@ -1416,17 +1466,25 @@ LWBackEnd<Impl>::doSwitchOut()
// yet written back.
assert(robEmpty());
assert(!LSQ.hasStoresToWB());
-
+ writeback.clear();
+ for (int i = 0; i < numInstsToWB.getSize() + 1; ++i)
+ numInstsToWB.advance();
+
+// squash(0);
+ assert(waitingList.empty());
+ assert(instList.empty());
+ assert(replayList.empty());
+ assert(writeback.empty());
LSQ.switchOut();
-
- squash(0);
}
template <class Impl>
void
LWBackEnd<Impl>::takeOverFrom(ThreadContext *old_tc)
{
- switchedOut = false;
+ assert(!squashPending);
+ squashSeqNum = 0;
+ squashNextPC = 0;
tcSquash = false;
trapSquash = false;
@@ -1451,27 +1509,27 @@ LWBackEnd<Impl>::updateExeInstStats(DynInstPtr &inst)
//
#ifdef TARGET_ALPHA
if (inst->isDataPrefetch())
- exe_swp[thread_number]++;
+ exeSwp[thread_number]++;
else
- exe_inst[thread_number]++;
+ exeInst[thread_number]++;
#else
- exe_inst[thread_number]++;
+ exeInst[thread_number]++;
#endif
//
// Control operations
//
if (inst->isControl())
- exe_branches[thread_number]++;
+ exeBranches[thread_number]++;
//
// Memory operations
//
if (inst->isMemRef()) {
- exe_refs[thread_number]++;
+ exeRefs[thread_number]++;
if (inst->isLoad())
- exe_loads[thread_number]++;
+ exeLoads[thread_number]++;
}
}
@@ -1491,33 +1549,33 @@ LWBackEnd<Impl>::updateComInstStats(DynInstPtr &inst)
//
#ifdef TARGET_ALPHA
if (inst->isDataPrefetch()) {
- stat_com_swp[tid]++;
+ statComSwp[tid]++;
} else {
- stat_com_inst[tid]++;
+ statComInst[tid]++;
}
#else
- stat_com_inst[tid]++;
+ statComInst[tid]++;
#endif
//
// Control Instructions
//
if (inst->isControl())
- stat_com_branches[tid]++;
+ statComBranches[tid]++;
//
// Memory references
//
if (inst->isMemRef()) {
- stat_com_refs[tid]++;
+ statComRefs[tid]++;
if (inst->isLoad()) {
- stat_com_loads[tid]++;
+ statComLoads[tid]++;
}
}
if (inst->isMemBarrier()) {
- stat_com_membars[tid]++;
+ statComMembars[tid]++;
}
}
@@ -1569,6 +1627,45 @@ LWBackEnd<Impl>::dumpInsts()
++num;
}
+ inst_list_it = --(writeback.end());
+
+ cprintf("Writeback list size: %i\n", writeback.size());
+
+ while (inst_list_it != writeback.end())
+ {
+ cprintf("Instruction:%i\n",
+ num);
+ if (!(*inst_list_it)->isSquashed()) {
+ if (!(*inst_list_it)->isIssued()) {
+ ++valid_num;
+ cprintf("Count:%i\n", valid_num);
+ } else if ((*inst_list_it)->isMemRef() &&
+ !(*inst_list_it)->memOpDone) {
+ // Loads that have not been marked as executed still count
+ // towards the total instructions.
+ ++valid_num;
+ cprintf("Count:%i\n", valid_num);
+ }
+ }
+
+ cprintf("PC:%#x\n[sn:%lli]\n[tid:%i]\n"
+ "Issued:%i\nSquashed:%i\n",
+ (*inst_list_it)->readPC(),
+ (*inst_list_it)->seqNum,
+ (*inst_list_it)->threadNumber,
+ (*inst_list_it)->isIssued(),
+ (*inst_list_it)->isSquashed());
+
+ if ((*inst_list_it)->isMemRef()) {
+ cprintf("MemOpDone:%i\n", (*inst_list_it)->memOpDone);
+ }
+
+ cprintf("\n");
+
+ inst_list_it--;
+ ++num;
+ }
+
cprintf("Waiting list size: %i\n", waitingList.size());
inst_list_it = --(waitingList.end());
diff --git a/src/cpu/ozone/lw_lsq.hh b/src/cpu/ozone/lw_lsq.hh
index 9a21a9d01..6640a9f34 100644
--- a/src/cpu/ozone/lw_lsq.hh
+++ b/src/cpu/ozone/lw_lsq.hh
@@ -84,6 +84,8 @@ class OzoneLWLSQ {
/** Returns the name of the LSQ unit. */
std::string name() const;
+ void regStats();
+
/** Sets the CPU pointer. */
void setCPU(OzoneCPU *cpu_ptr);
@@ -179,7 +181,7 @@ class OzoneLWLSQ {
int numLoads() { return loads; }
/** Returns the number of stores in the SQ. */
- int numStores() { return stores; }
+ int numStores() { return stores + storesInFlight; }
/** Returns if either the LQ or SQ is full. */
bool isFull() { return lqFull() || sqFull(); }
@@ -188,7 +190,7 @@ class OzoneLWLSQ {
bool lqFull() { return loads >= (LQEntries - 1); }
/** Returns if the SQ is full. */
- bool sqFull() { return stores >= (SQEntries - 1); }
+ bool sqFull() { return (stores + storesInFlight) >= (SQEntries - 1); }
/** Debugging function to dump instructions in the LSQ. */
void dumpInsts();
@@ -223,7 +225,9 @@ class OzoneLWLSQ {
void storePostSend(Packet *pkt, DynInstPtr &inst);
/** Completes the store at the specified index. */
- void completeStore(int store_idx);
+ void completeStore(DynInstPtr &inst);
+
+ void removeStore(int store_idx);
/** Handles doing the retry. */
void recvRetry();
@@ -394,6 +398,10 @@ class OzoneLWLSQ {
int storesToWB;
+ public:
+ int storesInFlight;
+
+ private:
/// @todo Consider moving to a more advanced model with write vs read ports
/** The number of cache ports available each cycle. */
int cachePorts;
@@ -403,6 +411,9 @@ class OzoneLWLSQ {
//list<InstSeqNum> mshrSeqNums;
+    /** Total number of memory ordering violations. */
+ Stats::Scalar<> lsqMemOrderViolation;
+
//Stats::Scalar<> dcacheStallCycles;
Counter lastDcacheStall;
@@ -525,7 +536,7 @@ OzoneLWLSQ<Impl>::read(RequestPtr req, T &data, int load_idx)
store_size = (*sq_it).size;
- if (store_size == 0) {
+ if (store_size == 0 || (*sq_it).committed) {
sq_it++;
continue;
}
diff --git a/src/cpu/ozone/lw_lsq_impl.hh b/src/cpu/ozone/lw_lsq_impl.hh
index 7eef4b11f..31ffa9d67 100644
--- a/src/cpu/ozone/lw_lsq_impl.hh
+++ b/src/cpu/ozone/lw_lsq_impl.hh
@@ -132,7 +132,7 @@ OzoneLWLSQ<Impl>::completeDataAccess(PacketPtr pkt)
template <class Impl>
OzoneLWLSQ<Impl>::OzoneLWLSQ()
: switchedOut(false), dcachePort(this), loads(0), stores(0),
- storesToWB(0), stalled(false), isStoreBlocked(false),
+ storesToWB(0), storesInFlight(0), stalled(false), isStoreBlocked(false),
isLoadBlocked(false), loadBlockedHandled(false)
{
}
@@ -173,6 +173,11 @@ OzoneLWLSQ<Impl>::name() const
template<class Impl>
void
+OzoneLWLSQ<Impl>::regStats()
+{
+ lsqMemOrderViolation
+ .name(name() + ".memOrderViolation")
+ .desc("Number of memory ordering violations");
OzoneLWLSQ<Impl>::setCPU(OzoneCPU *cpu_ptr)
{
cpu = cpu_ptr;
@@ -321,7 +326,7 @@ unsigned
OzoneLWLSQ<Impl>::numFreeEntries()
{
unsigned free_lq_entries = LQEntries - loads;
- unsigned free_sq_entries = SQEntries - stores;
+ unsigned free_sq_entries = SQEntries - (stores + storesInFlight);
// Both the LQ and SQ entries have an extra dummy entry to differentiate
// empty/full conditions. Subtract 1 from the free entries.
@@ -385,6 +390,9 @@ OzoneLWLSQ<Impl>::executeLoad(DynInstPtr &inst)
// Actually probably want the oldest faulting load
if (load_fault != NoFault) {
DPRINTF(OzoneLSQ, "Load [sn:%lli] has a fault\n", inst->seqNum);
+ if (!(inst->req->flags & UNCACHEABLE && !inst->isAtCommit())) {
+ inst->setExecuted();
+ }
// Maybe just set it as can commit here, although that might cause
// some other problems with sending traps to the ROB too quickly.
be->instToCommit(inst);
@@ -461,6 +469,7 @@ OzoneLWLSQ<Impl>::executeStore(DynInstPtr &store_inst)
// A load incorrectly passed this store. Squash and refetch.
// For now return a fault to show that it was unsuccessful.
memDepViolator = (*lq_it);
+ ++lsqMemOrderViolation;
return TheISA::genMachineCheckFault();
}
@@ -553,8 +562,8 @@ OzoneLWLSQ<Impl>::writebackStores()
if ((*sq_it).size == 0 && !(*sq_it).completed) {
sq_it--;
- completeStore(inst->sqIdx);
-
+ removeStore(inst->sqIdx);
+ completeStore(inst);
continue;
}
@@ -626,6 +635,8 @@ OzoneLWLSQ<Impl>::writebackStores()
inst->sqIdx,inst->readPC(),
req->paddr, *(req->data),
inst->seqNum);
+ DPRINTF(OzoneLSQ, "StoresInFlight: %i\n",
+ storesInFlight + 1);
if (dcacheInterface) {
assert(!req->completionEvent);
@@ -687,6 +698,8 @@ OzoneLWLSQ<Impl>::writebackStores()
}
sq_it--;
}
+ ++storesInFlight;
+// removeStore(inst->sqIdx);
} else {
panic("Must HAVE DCACHE!!!!!\n");
}
@@ -704,7 +717,7 @@ void
OzoneLWLSQ<Impl>::squash(const InstSeqNum &squashed_num)
{
DPRINTF(OzoneLSQ, "Squashing until [sn:%lli]!"
- "(Loads:%i Stores:%i)\n",squashed_num,loads,stores);
+ "(Loads:%i Stores:%i)\n",squashed_num,loads,stores+storesInFlight);
LQIt lq_it = loadQueue.begin();
@@ -881,7 +894,7 @@ OzoneLWLSQ<Impl>::writeback(DynInstPtr &inst, PacketPtr pkt)
template <class Impl>
void
-OzoneLWLSQ<Impl>::completeStore(int store_idx)
+OzoneLWLSQ<Impl>::removeStore(int store_idx)
{
SQHashIt sq_hash_it = SQItHash.find(store_idx);
assert(sq_hash_it != SQItHash.end());
@@ -891,8 +904,6 @@ OzoneLWLSQ<Impl>::completeStore(int store_idx)
(*sq_it).completed = true;
DynInstPtr inst = (*sq_it).inst;
- --storesToWB;
-
if (isStalled() &&
inst->seqNum == stallingStoreIsn) {
DPRINTF(OzoneLSQ, "Unstalling, stalling store [sn:%lli] "
@@ -910,6 +921,13 @@ OzoneLWLSQ<Impl>::completeStore(int store_idx)
SQItHash.erase(sq_hash_it);
SQIndices.push(inst->sqIdx);
storeQueue.erase(sq_it);
+}
+
+template <class Impl>
+void
+OzoneLWLSQ<Impl>::completeStore(DynInstPtr &inst)
+{
+ --storesToWB;
--stores;
inst->setCompleted();
@@ -935,9 +953,14 @@ OzoneLWLSQ<Impl>::switchOut()
switchedOut = true;
// Clear the queue to free up resources
+ assert(stores == 0);
+ assert(storeQueue.empty());
+ assert(loads == 0);
+ assert(loadQueue.empty());
+ assert(storesInFlight == 0);
storeQueue.clear();
loadQueue.clear();
- loads = stores = storesToWB = 0;
+ loads = stores = storesToWB = storesInFlight = 0;
}
template <class Impl>
diff --git a/src/cpu/ozone/simple_params.hh b/src/cpu/ozone/simple_params.hh
index 11cee716f..3f63d2e1d 100644
--- a/src/cpu/ozone/simple_params.hh
+++ b/src/cpu/ozone/simple_params.hh
@@ -71,10 +71,11 @@ class SimpleParams : public BaseCPU::Params
unsigned cachePorts;
unsigned width;
+ unsigned frontEndLatency;
unsigned frontEndWidth;
+ unsigned backEndLatency;
unsigned backEndWidth;
unsigned backEndSquashLatency;
- unsigned backEndLatency;
unsigned maxInstBufferSize;
unsigned numPhysicalRegs;
unsigned maxOutstandingMemOps;
@@ -150,6 +151,7 @@ class SimpleParams : public BaseCPU::Params
//
unsigned LQEntries;
unsigned SQEntries;
+ bool lsqLimits;
//
// Memory dependence
diff --git a/src/cpu/ozone/thread_state.hh b/src/cpu/ozone/thread_state.hh
index 8234cf938..adaa8e71b 100644
--- a/src/cpu/ozone/thread_state.hh
+++ b/src/cpu/ozone/thread_state.hh
@@ -34,9 +34,12 @@
#include "arch/faults.hh"
#include "arch/types.hh"
#include "arch/regfile.hh"
+#include "base/callback.hh"
+#include "base/output.hh"
#include "cpu/thread_context.hh"
#include "cpu/thread_state.hh"
#include "sim/process.hh"
+#include "sim/sim_exit.hh"
class Event;
//class Process;
@@ -65,8 +68,21 @@ struct OzoneThreadState : public ThreadState {
#if FULL_SYSTEM
OzoneThreadState(CPUType *_cpu, int _thread_num)
: ThreadState(-1, _thread_num),
- intrflag(0), inSyscall(0), trapPending(0)
+ cpu(_cpu), intrflag(0), inSyscall(0), trapPending(0)
{
+ if (cpu->params->profile) {
+ profile = new FunctionProfile(cpu->params->system->kernelSymtab);
+ Callback *cb =
+ new MakeCallback<OzoneThreadState,
+ &OzoneThreadState::dumpFuncProfile>(this);
+ registerExitCallback(cb);
+ }
+
+ // let's fill with a dummy node for now so we don't get a segfault
+ // on the first cycle when there's no node available.
+ static ProfileNode dummyNode;
+ profileNode = &dummyNode;
+ profilePC = 3;
miscRegFile.clear();
}
#else
@@ -130,6 +146,14 @@ struct OzoneThreadState : public ThreadState {
void setNextPC(uint64_t val)
{ nextPC = val; }
+
+#if FULL_SYSTEM
+ void dumpFuncProfile()
+ {
+ std::ostream *os = simout.create(csprintf("profile.%s.dat", cpu->name()));
+ profile->dump(xcProxy, *os);
+ }
+#endif
};
#endif // __CPU_OZONE_THREAD_STATE_HH__
diff --git a/src/cpu/simple/base.cc b/src/cpu/simple/base.cc
index f801b93fa..522fe79aa 100644
--- a/src/cpu/simple/base.cc
+++ b/src/cpu/simple/base.cc
@@ -170,7 +170,7 @@ BaseSimpleCPU::regStats()
void
BaseSimpleCPU::resetStats()
{
- startNumInst = numInst;
+// startNumInst = numInst;
// notIdleFraction = (_status != Idle);
}
diff --git a/src/cpu/simple_thread.cc b/src/cpu/simple_thread.cc
index 5f86cf2b7..4fc47c982 100644
--- a/src/cpu/simple_thread.cc
+++ b/src/cpu/simple_thread.cc
@@ -162,6 +162,11 @@ SimpleThread::takeOverFrom(ThreadContext *oldContext)
if (quiesceEvent) {
quiesceEvent->tc = tc;
}
+
+ Kernel::Statistics *stats = oldContext->getKernelStats();
+ if (stats) {
+ kernelStats = stats;
+ }
#endif
storeCondFailures = 0;
diff --git a/src/cpu/thread_state.hh b/src/cpu/thread_state.hh
index 6e985054f..5479f8478 100644
--- a/src/cpu/thread_state.hh
+++ b/src/cpu/thread_state.hh
@@ -32,6 +32,7 @@
#define __CPU_THREAD_STATE_HH__
#include "arch/types.hh"
+#include "cpu/profile.hh"
#include "cpu/thread_context.hh"
#if !FULL_SYSTEM
@@ -191,6 +192,21 @@ struct ThreadState {
// simulation only; all functional memory accesses should use
// one of the FunctionalMemory pointers above.
short asid;
+
+#endif
+
+#if FULL_SYSTEM
+ void profileClear()
+ {
+ if (profile)
+ profile->clear();
+ }
+
+ void profileSample()
+ {
+ if (profile)
+ profile->sample(profileNode, profilePC);
+ }
#endif
/** Current instruction the thread is committing. Only set and