Diffstat (limited to 'src/cpu')
-rw-r--r--  src/cpu/base.cc | 68
-rw-r--r--  src/cpu/base.hh | 17
-rw-r--r--  src/cpu/base_dyn_inst.hh | 9
-rw-r--r--  src/cpu/checker/cpu.hh | 21
-rw-r--r--  src/cpu/checker/cpu_impl.hh | 26
-rw-r--r--  src/cpu/o3/alpha/cpu_builder.cc | 6
-rw-r--r--  src/cpu/o3/checker_builder.cc | 8
-rw-r--r--  src/cpu/o3/commit_impl.hh | 19
-rw-r--r--  src/cpu/o3/cpu.cc | 7
-rw-r--r--  src/cpu/o3/fetch_impl.hh | 9
-rw-r--r--  src/cpu/o3/iew.hh | 8
-rw-r--r--  src/cpu/o3/iew_impl.hh | 29
-rw-r--r--  src/cpu/o3/inst_queue.hh | 4
-rw-r--r--  src/cpu/o3/inst_queue_impl.hh | 28
-rw-r--r--  src/cpu/o3/lsq_unit.hh | 4
-rw-r--r--  src/cpu/o3/lsq_unit_impl.hh | 15
-rw-r--r--  src/cpu/o3/mem_dep_unit_impl.hh | 3
-rw-r--r--  src/cpu/o3/rename.hh | 2
-rw-r--r--  src/cpu/o3/rename_impl.hh | 13
-rwxr-xr-x  src/cpu/o3/thread_context_impl.hh | 10
-rw-r--r--  src/cpu/o3/thread_state.hh | 29
-rw-r--r--  src/cpu/o3/tournament_pred.cc | 10
-rw-r--r--  src/cpu/o3/tournament_pred.hh | 3
-rw-r--r--  src/cpu/ozone/checker_builder.cc | 8
-rw-r--r--  src/cpu/ozone/cpu.hh | 41
-rw-r--r--  src/cpu/ozone/cpu_builder.cc | 16
-rw-r--r--  src/cpu/ozone/cpu_impl.hh | 120
-rw-r--r--  src/cpu/ozone/dyn_inst_impl.hh | 4
-rw-r--r--  src/cpu/ozone/front_end.hh | 7
-rw-r--r--  src/cpu/ozone/front_end_impl.hh | 74
-rw-r--r--  src/cpu/ozone/inorder_back_end_impl.hh | 2
-rw-r--r--  src/cpu/ozone/inst_queue_impl.hh | 8
-rw-r--r--  src/cpu/ozone/lw_back_end.hh | 94
-rw-r--r--  src/cpu/ozone/lw_back_end_impl.hh | 317
-rw-r--r--  src/cpu/ozone/lw_lsq.hh | 19
-rw-r--r--  src/cpu/ozone/lw_lsq_impl.hh | 47
-rw-r--r--  src/cpu/ozone/simple_params.hh | 4
-rw-r--r--  src/cpu/ozone/thread_state.hh | 26
-rw-r--r--  src/cpu/simple/atomic.cc | 7
-rw-r--r--  src/cpu/simple/base.cc | 2
-rw-r--r--  src/cpu/simple/timing.cc | 3
-rw-r--r--  src/cpu/simple_thread.cc | 5
-rw-r--r--  src/cpu/thread_state.hh | 2
43 files changed, 843 insertions, 311 deletions
diff --git a/src/cpu/base.cc b/src/cpu/base.cc
index ce440aeff..ea4b03bf2 100644
--- a/src/cpu/base.cc
+++ b/src/cpu/base.cc
@@ -41,6 +41,7 @@
#include "cpu/cpuevent.hh"
#include "cpu/thread_context.hh"
#include "cpu/profile.hh"
+#include "sim/sim_exit.hh"
#include "sim/param.hh"
#include "sim/process.hh"
#include "sim/sim_events.hh"
@@ -48,6 +49,9 @@
#include "base/trace.hh"
+// Hack
+#include "sim/stat_control.hh"
+
using namespace std;
vector<BaseCPU *> BaseCPU::cpuList;
@@ -57,6 +61,39 @@ vector<BaseCPU *> BaseCPU::cpuList;
// been initialized
int maxThreadsPerCPU = 1;
+CPUProgressEvent::CPUProgressEvent(EventQueue *q, Tick ival,
+ BaseCPU *_cpu)
+ : Event(q, Event::Stat_Event_Pri), interval(ival),
+ lastNumInst(0), cpu(_cpu)
+{
+ if (interval)
+ schedule(curTick + interval);
+}
+
+void
+CPUProgressEvent::process()
+{
+ Counter temp = cpu->totalInstructions();
+#ifndef NDEBUG
+ double ipc = double(temp - lastNumInst) / (interval / cpu->cycles(1));
+
+ DPRINTFN("%s progress event, instructions committed: %lli, IPC: %0.8d\n",
+ cpu->name(), temp - lastNumInst, ipc);
+ ipc = 0.0;
+#else
+ cprintf("%lli: %s progress event, instructions committed: %lli\n",
+ curTick, cpu->name(), temp - lastNumInst);
+#endif
+ lastNumInst = temp;
+ schedule(curTick + interval);
+}
+
+const char *
+CPUProgressEvent::description()
+{
+ return "CPU Progress event";
+}
+
#if FULL_SYSTEM
BaseCPU::BaseCPU(Params *p)
: MemObject(p->name), clock(p->clock), checkInterrupts(true),
@@ -67,6 +104,7 @@ BaseCPU::BaseCPU(Params *p)
number_of_threads(p->numberOfThreads), system(p->system)
#endif
{
+// currentTick = curTick;
DPRINTF(FullCPU, "BaseCPU: Creating object, mem address %#x.\n", this);
// add self to global list of CPUs
@@ -88,8 +126,9 @@ BaseCPU::BaseCPU(Params *p)
//
if (p->max_insts_any_thread != 0)
for (int i = 0; i < number_of_threads; ++i)
- new SimLoopExitEvent(comInstEventQueue[i], p->max_insts_any_thread,
- "a thread reached the max instruction count");
+ schedExitSimLoop("a thread reached the max instruction count",
+ p->max_insts_any_thread, 0,
+ comInstEventQueue[i]);
if (p->max_insts_all_threads != 0) {
// allocate & initialize shared downcounter: each event will
@@ -113,8 +152,9 @@ BaseCPU::BaseCPU(Params *p)
//
if (p->max_loads_any_thread != 0)
for (int i = 0; i < number_of_threads; ++i)
- new SimLoopExitEvent(comLoadEventQueue[i], p->max_loads_any_thread,
- "a thread reached the max load count");
+ schedExitSimLoop("a thread reached the max load count",
+ p->max_loads_any_thread, 0,
+ comLoadEventQueue[i]);
if (p->max_loads_all_threads != 0) {
// allocate & initialize shared downcounter: each event will
@@ -153,7 +193,6 @@ BaseCPU::BaseCPU(Params *p)
if (params->profile)
profileEvent = new ProfileEvent(this, params->profile);
#endif
-
}
BaseCPU::Params::Params()
@@ -188,6 +227,11 @@ BaseCPU::startup()
if (!params->deferRegistration && profileEvent)
profileEvent->schedule(curTick);
#endif
+
+ if (params->progress_interval) {
+ new CPUProgressEvent(&mainEventQueue, params->progress_interval,
+ this);
+ }
}
@@ -238,7 +282,11 @@ BaseCPU::registerThreadContexts()
void
BaseCPU::switchOut()
{
- panic("This CPU doesn't support sampling!");
+// panic("This CPU doesn't support sampling!");
+#if FULL_SYSTEM
+ if (profileEvent && profileEvent->scheduled())
+ profileEvent->deschedule();
+#endif
}
void
@@ -261,18 +309,22 @@ BaseCPU::takeOverFrom(BaseCPU *oldCPU)
assert(newTC->getProcessPtr() == oldTC->getProcessPtr());
newTC->getProcessPtr()->replaceThreadContext(newTC, newTC->readCpuId());
#endif
+
+// TheISA::compareXCs(oldXC, newXC);
}
#if FULL_SYSTEM
for (int i = 0; i < TheISA::NumInterruptLevels; ++i)
interrupts[i] = oldCPU->interrupts[i];
intstatus = oldCPU->intstatus;
+ checkInterrupts = oldCPU->checkInterrupts;
for (int i = 0; i < threadContexts.size(); ++i)
threadContexts[i]->profileClear();
- if (profileEvent)
- profileEvent->schedule(curTick);
+ // The Sampler must take care of this!
+// if (profileEvent)
+// profileEvent->schedule(curTick);
#endif
}
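The CPUProgressEvent added above fires every interval ticks, reports how many instructions were committed since the previous report (plus an IPC estimate in debug builds), and then reschedules itself. Below is a minimal standalone sketch of that reporting pattern; CounterSource, ProgressReporter, and the numbers in main() are illustrative stand-ins rather than gem5 classes, and rescheduling is left to whatever drives process().

    #include <cstdint>
    #include <cstdio>

    // Illustrative stand-in for something exposing a committed-instruction count.
    struct CounterSource {
        uint64_t committed = 0;
        uint64_t totalInstructions() const { return committed; }
    };

    // Periodic progress reporter: remembers the count at the last report and
    // prints the delta plus an IPC estimate each time it fires.
    class ProgressReporter {
      public:
        ProgressReporter(uint64_t interval_ticks, uint64_t ticks_per_cycle,
                         CounterSource *src)
            : interval(interval_ticks), ticksPerCycle(ticks_per_cycle),
              lastNumInst(0), cpu(src) {}

        // Would be called by the event loop once every 'interval' ticks.
        void process(uint64_t cur_tick) {
            uint64_t temp = cpu->totalInstructions();
            double cycles = double(interval) / double(ticksPerCycle);
            double ipc = double(temp - lastNumInst) / cycles;
            std::printf("%llu: progress event, committed: %llu, IPC: %0.6f\n",
                        (unsigned long long)cur_tick,
                        (unsigned long long)(temp - lastNumInst), ipc);
            lastNumInst = temp;
        }

      private:
        uint64_t interval;      // ticks between reports
        uint64_t ticksPerCycle; // clock period in ticks
        uint64_t lastNumInst;   // committed count at the previous report
        CounterSource *cpu;
    };

    int main() {
        CounterSource cpu;
        ProgressReporter reporter(10000, 500, &cpu); // hypothetical interval and clock
        cpu.committed = 12345;
        reporter.process(10000);
        return 0;
    }

In the real event the reschedule happens inside process() via schedule(curTick + interval); the sketch leaves that to the caller.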
diff --git a/src/cpu/base.hh b/src/cpu/base.hh
index 2be6e4e81..e02527371 100644
--- a/src/cpu/base.hh
+++ b/src/cpu/base.hh
@@ -46,6 +46,21 @@ class ThreadContext;
class System;
class Port;
+class CPUProgressEvent : public Event
+{
+ protected:
+ Tick interval;
+ Counter lastNumInst;
+ BaseCPU *cpu;
+
+ public:
+ CPUProgressEvent(EventQueue *q, Tick ival, BaseCPU *_cpu);
+
+ void process();
+
+ virtual const char *description();
+};
+
class BaseCPU : public MemObject
{
protected:
@@ -53,6 +68,7 @@ class BaseCPU : public MemObject
Tick clock;
public:
+// Tick currentTick;
inline Tick frequency() const { return Clock::Frequency / clock; }
inline Tick cycles(int numCycles) const { return clock * numCycles; }
inline Tick curCycle() const { return curTick / clock; }
@@ -128,6 +144,7 @@ class BaseCPU : public MemObject
int cpu_id;
Tick profile;
#endif
+ Tick progress_interval;
BaseCPU *checker;
Params();
diff --git a/src/cpu/base_dyn_inst.hh b/src/cpu/base_dyn_inst.hh
index 3158aa9cf..c68810954 100644
--- a/src/cpu/base_dyn_inst.hh
+++ b/src/cpu/base_dyn_inst.hh
@@ -197,7 +197,7 @@ class BaseDynInst : public FastAlloc, public RefCounted
union Result {
uint64_t integer;
- float fp;
+// float fp;
double dbl;
};
@@ -394,7 +394,7 @@ class BaseDynInst : public FastAlloc, public RefCounted
uint64_t readIntResult() { return instResult.integer; }
/** Returns the result of a floating point instruction. */
- float readFloatResult() { return instResult.fp; }
+ float readFloatResult() { return (float)instResult.dbl; }
/** Returns the result of a floating point (double) instruction. */
double readDoubleResult() { return instResult.dbl; }
@@ -409,7 +409,7 @@ class BaseDynInst : public FastAlloc, public RefCounted
void setFloatReg(const StaticInst *si, int idx, FloatReg val, int width)
{
if (width == 32)
- instResult.fp = val;
+ instResult.dbl = (double)val;
else if (width == 64)
instResult.dbl = val;
else
@@ -419,7 +419,8 @@ class BaseDynInst : public FastAlloc, public RefCounted
/** Records an fp register being set to a value. */
void setFloatReg(const StaticInst *si, int idx, FloatReg val)
{
- instResult.fp = val;
+// instResult.fp = val;
+ instResult.dbl = (double)val;
}
/** Records an fp register being set to an integer value. */
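The Result union changes above drop the separate float member: 32-bit FP results are widened to double when stored and narrowed again when read, so both widths share one well-defined slot. A small self-contained sketch of that convention; InstResult and its accessors are illustrative names, not the BaseDynInst interface.

    #include <cassert>
    #include <cstdint>

    // The float member is gone: 32-bit FP results are widened to double on
    // write and narrowed on read, so both widths use the same storage.
    union Result {
        uint64_t integer;
        double dbl;
    };

    struct InstResult {
        Result res{};

        void setFloatResult(float val)  { res.dbl = double(val); } // width == 32
        void setDoubleResult(double v)  { res.dbl = v; }           // width == 64

        float  readFloatResult()  const { return float(res.dbl); }
        double readDoubleResult() const { return res.dbl; }
    };

    int main() {
        InstResult r;
        r.setFloatResult(1.5f);
        assert(r.readFloatResult() == 1.5f);   // exactly representable, round-trips
        r.setDoubleResult(2.25);
        assert(r.readDoubleResult() == 2.25);
        return 0;
    }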
diff --git a/src/cpu/checker/cpu.hh b/src/cpu/checker/cpu.hh
index 6d6ae1e0a..00b01171f 100644
--- a/src/cpu/checker/cpu.hh
+++ b/src/cpu/checker/cpu.hh
@@ -102,6 +102,7 @@ class CheckerCPU : public BaseCPU
Process *process;
#endif
bool exitOnError;
+ bool updateOnError;
bool warnOnlyOnLoadError;
};
@@ -148,7 +149,7 @@ class CheckerCPU : public BaseCPU
union Result {
uint64_t integer;
- float fp;
+// float fp;
double dbl;
};
@@ -257,7 +258,7 @@ class CheckerCPU : public BaseCPU
thread->setFloatReg(reg_idx, val, width);
switch(width) {
case 32:
- result.fp = val;
+ result.dbl = (double)val;
break;
case 64:
result.dbl = val;
@@ -269,7 +270,7 @@ class CheckerCPU : public BaseCPU
{
int reg_idx = si->destRegIdx(idx) - TheISA::FP_Base_DepTag;
thread->setFloatReg(reg_idx, val);
- result.fp = val;
+ result.dbl = (double)val;
}
void setFloatRegBits(const StaticInst *si, int idx, FloatRegBits val,
@@ -318,7 +319,7 @@ class CheckerCPU : public BaseCPU
return thread->setMiscRegWithEffect(misc_reg, val);
}
- void recordPCChange(uint64_t val) { changedPC = true; }
+ void recordPCChange(uint64_t val) { changedPC = true; newPC = val; }
void recordNextPCChange(uint64_t val) { changedNextPC = true; }
bool translateInstReq(Request *req);
@@ -360,6 +361,7 @@ class CheckerCPU : public BaseCPU
uint64_t newPC;
bool changedNextPC;
bool exitOnError;
+ bool updateOnError;
bool warnOnlyOnLoadError;
InstSeqNum youngestSN;
@@ -376,7 +378,7 @@ class Checker : public CheckerCPU
{
public:
Checker(Params *p)
- : CheckerCPU(p)
+ : CheckerCPU(p), updateThisCycle(false), unverifiedInst(NULL)
{ }
void switchOut();
@@ -393,12 +395,19 @@ class Checker : public CheckerCPU
private:
void handleError(DynInstPtr &inst)
{
- if (exitOnError)
+ if (exitOnError) {
dumpAndExit(inst);
+ } else if (updateOnError) {
+ updateThisCycle = true;
+ }
}
void dumpAndExit(DynInstPtr &inst);
+ bool updateThisCycle;
+
+ DynInstPtr unverifiedInst;
+
std::list<DynInstPtr> instList;
typedef typename std::list<DynInstPtr>::iterator InstListIt;
void dumpInsts();
diff --git a/src/cpu/checker/cpu_impl.hh b/src/cpu/checker/cpu_impl.hh
index 81f97726c..8aec79754 100644
--- a/src/cpu/checker/cpu_impl.hh
+++ b/src/cpu/checker/cpu_impl.hh
@@ -94,6 +94,8 @@ Checker<DynInstPtr>::verify(DynInstPtr &completed_inst)
}
}
+ unverifiedInst = inst;
+
// Try to check all instructions that are completed, ending if we
// run out of instructions to check or if an instruction is not
// yet completed.
@@ -171,7 +173,7 @@ Checker<DynInstPtr>::verify(DynInstPtr &completed_inst)
thread->setPC(thread->readNextPC());
thread->setNextPC(thread->readNextPC() + sizeof(MachInst));
- return;
+ break;
} else {
// The instruction is carrying an ITB fault. Handle
// the fault and see if our results match the CPU on
@@ -220,7 +222,8 @@ Checker<DynInstPtr>::verify(DynInstPtr &completed_inst)
thread->funcExeInst++;
- fault = curStaticInst->execute(this, NULL);
+ if (!inst->isUnverifiable())
+ fault = curStaticInst->execute(this, NULL);
// Checks to make sure instruction results are correct.
validateExecution(inst);
@@ -289,6 +292,7 @@ Checker<DynInstPtr>::verify(DynInstPtr &completed_inst)
break;
}
}
+ unverifiedInst = NULL;
}
template <class DynInstPtr>
@@ -395,6 +399,24 @@ template <class DynInstPtr>
void
Checker<DynInstPtr>::validateState()
{
+ if (updateThisCycle) {
+ warn("%lli: Instruction PC %#x results didn't match up, copying all "
+ "registers from main CPU", curTick, unverifiedInst->readPC());
+ // Heavy-weight copying of all registers
+ thread->copyArchRegs(unverifiedInst->tcBase());
+ // Also advance the PC. Hopefully no PC-based events happened.
+#if THE_ISA != MIPS_ISA
+ // go to the next instruction
+ thread->setPC(thread->readNextPC());
+ thread->setNextPC(thread->readNextPC() + sizeof(MachInst));
+#else
+ // go to the next instruction
+ thread->setPC(thread->readNextPC());
+ thread->setNextPC(thread->readNextNPC());
+ thread->setNextNPC(thread->readNextNPC() + sizeof(MachInst));
+#endif
+ updateThisCycle = false;
+ }
}
template <class DynInstPtr>
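With the new updateOnError path, a mismatch no longer has to end the run: validateState() copies every architectural register from the main CPU into the checker and advances the checker's PC past the unverified instruction. A simplified standalone version of that resynchronization step is sketched below; ArchState and resyncOnMismatch are illustrative names, and a fixed 4-byte instruction size is assumed.

    #include <array>
    #include <cstdint>
    #include <cstdio>

    // Toy architectural state standing in for a thread context.
    struct ArchState {
        std::array<uint64_t, 32> intRegs{};
        uint64_t pc = 0;
        uint64_t nextPC = 0;
    };

    // On a verification mismatch, a checker configured to "update on error"
    // abandons its own result, copies the reference state wholesale, and
    // steps past the offending instruction instead of aborting.
    void resyncOnMismatch(ArchState &checker, const ArchState &mainCpu,
                          uint64_t inst_size) {
        std::printf("results didn't match at PC %#llx, copying registers\n",
                    (unsigned long long)checker.pc);
        checker.intRegs = mainCpu.intRegs;   // heavy-weight register copy
        checker.pc = checker.nextPC;         // advance past the bad instruction
        checker.nextPC = checker.pc + inst_size;
    }

    int main() {
        ArchState checker, cpu;
        cpu.intRegs[1] = 42;                 // reference CPU disagrees with checker
        checker.pc = 0x1000;
        checker.nextPC = 0x1004;
        resyncOnMismatch(checker, cpu, 4);   // 4-byte instructions assumed
        return checker.intRegs[1] == 42 ? 0 : 1;
    }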
diff --git a/src/cpu/o3/alpha/cpu_builder.cc b/src/cpu/o3/alpha/cpu_builder.cc
index 5e767655d..ff123a6f7 100644
--- a/src/cpu/o3/alpha/cpu_builder.cc
+++ b/src/cpu/o3/alpha/cpu_builder.cc
@@ -56,6 +56,7 @@ SimObjectParam<System *> system;
Param<int> cpu_id;
SimObjectParam<AlphaITB *> itb;
SimObjectParam<AlphaDTB *> dtb;
+Param<Tick> profile;
#else
SimObjectVectorParam<Process *> workload;
#endif // FULL_SYSTEM
@@ -68,6 +69,7 @@ Param<Counter> max_insts_any_thread;
Param<Counter> max_insts_all_threads;
Param<Counter> max_loads_any_thread;
Param<Counter> max_loads_all_threads;
+Param<Tick> progress_interval;
Param<unsigned> cachePorts;
@@ -162,6 +164,7 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivO3CPU)
INIT_PARAM(cpu_id, "processor ID"),
INIT_PARAM(itb, "Instruction translation buffer"),
INIT_PARAM(dtb, "Data translation buffer"),
+ INIT_PARAM(profile, ""),
#else
INIT_PARAM(workload, "Processes to run"),
#endif // FULL_SYSTEM
@@ -184,6 +187,7 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivO3CPU)
"Terminate when all threads have reached this load"
"count",
0),
+ INIT_PARAM_DFLT(progress_interval, "Progress interval", 0),
INIT_PARAM_DFLT(cachePorts, "Cache Ports", 200),
@@ -305,6 +309,7 @@ CREATE_SIM_OBJECT(DerivO3CPU)
params->cpu_id = cpu_id;
params->itb = itb;
params->dtb = dtb;
+ params->profile = profile;
#else
params->workload = workload;
#endif // FULL_SYSTEM
@@ -317,6 +322,7 @@ CREATE_SIM_OBJECT(DerivO3CPU)
params->max_insts_all_threads = max_insts_all_threads;
params->max_loads_any_thread = max_loads_any_thread;
params->max_loads_all_threads = max_loads_all_threads;
+ params->progress_interval = progress_interval;
//
// Caches
diff --git a/src/cpu/o3/checker_builder.cc b/src/cpu/o3/checker_builder.cc
index 782d963b0..02c817499 100644
--- a/src/cpu/o3/checker_builder.cc
+++ b/src/cpu/o3/checker_builder.cc
@@ -64,6 +64,7 @@ BEGIN_DECLARE_SIM_OBJECT_PARAMS(O3Checker)
Param<Counter> max_insts_all_threads;
Param<Counter> max_loads_any_thread;
Param<Counter> max_loads_all_threads;
+ Param<Tick> progress_interval;
#if FULL_SYSTEM
SimObjectParam<AlphaITB *> itb;
@@ -78,6 +79,7 @@ BEGIN_DECLARE_SIM_OBJECT_PARAMS(O3Checker)
Param<bool> defer_registration;
Param<bool> exitOnError;
+ Param<bool> updateOnError;
Param<bool> warnOnlyOnLoadError;
Param<bool> function_trace;
Param<Tick> function_trace_start;
@@ -94,6 +96,7 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(O3Checker)
"terminate when any thread reaches this load count"),
INIT_PARAM(max_loads_all_threads,
"terminate when all threads have reached this load count"),
+ INIT_PARAM_DFLT(progress_interval, "CPU Progress Interval", 0),
#if FULL_SYSTEM
INIT_PARAM(itb, "Instruction TLB"),
@@ -109,6 +112,7 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(O3Checker)
INIT_PARAM(defer_registration, "defer system registration (for sampling)"),
INIT_PARAM(exitOnError, "exit on error"),
+ INIT_PARAM(updateOnError, "Update the checker with the main CPU's state on error"),
INIT_PARAM_DFLT(warnOnlyOnLoadError, "warn, but don't exit, if a load "
"result errors", false),
INIT_PARAM(function_trace, "Enable function trace"),
@@ -127,6 +131,7 @@ CREATE_SIM_OBJECT(O3Checker)
params->max_loads_any_thread = 0;
params->max_loads_all_threads = 0;
params->exitOnError = exitOnError;
+ params->updateOnError = updateOnError;
params->warnOnlyOnLoadError = warnOnlyOnLoadError;
params->deferRegistration = defer_registration;
params->functionTrace = function_trace;
@@ -139,6 +144,9 @@ CREATE_SIM_OBJECT(O3Checker)
temp = max_insts_all_threads;
temp = max_loads_any_thread;
temp = max_loads_all_threads;
+ Tick temp2 = progress_interval;
+ params->progress_interval = 0;
+ temp2++;
#if FULL_SYSTEM
params->itb = itb;
diff --git a/src/cpu/o3/commit_impl.hh b/src/cpu/o3/commit_impl.hh
index 34f487e2c..c80e4d8c1 100644
--- a/src/cpu/o3/commit_impl.hh
+++ b/src/cpu/o3/commit_impl.hh
@@ -1083,12 +1083,26 @@ DefaultCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num)
// Generate trap squash event.
generateTrapEvent(tid);
-
+// warn("%lli fault (%d) handled @ PC %08p", curTick, inst_fault->name(), head_inst->readPC());
return false;
}
updateComInstStats(head_inst);
+#if FULL_SYSTEM
+ if (thread[tid]->profile) {
+// bool usermode =
+// (cpu->readMiscReg(AlphaISA::IPR_DTB_CM, tid) & 0x18) != 0;
+// thread[tid]->profilePC = usermode ? 1 : head_inst->readPC();
+ thread[tid]->profilePC = head_inst->readPC();
+ ProfileNode *node = thread[tid]->profile->consume(thread[tid]->getTC(),
+ head_inst->staticInst);
+
+ if (node)
+ thread[tid]->profileNode = node;
+ }
+#endif
+
if (head_inst->traceData) {
head_inst->traceData->setFetchSeq(head_inst->seqNum);
head_inst->traceData->setCPSeq(thread[tid]->numInst);
@@ -1102,6 +1116,9 @@ DefaultCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num)
head_inst->renamedDestRegIdx(i));
}
+ if (head_inst->isCopy())
+ panic("Should not commit any copy instructions!");
+
// Finally clear the head ROB entry.
rob->retireHead(tid);
diff --git a/src/cpu/o3/cpu.cc b/src/cpu/o3/cpu.cc
index 19ab7f4c5..7386dfadd 100644
--- a/src/cpu/o3/cpu.cc
+++ b/src/cpu/o3/cpu.cc
@@ -33,6 +33,7 @@
#include "config/use_checker.hh"
#if FULL_SYSTEM
+#include "cpu/quiesce_event.hh"
#include "sim/system.hh"
#else
#include "sim/process.hh"
@@ -793,6 +794,7 @@ template <class Impl>
unsigned int
FullO3CPU<Impl>::drain(Event *drain_event)
{
+ DPRINTF(O3CPU, "Switching out\n");
drainCount = 0;
fetch.drain();
decode.drain();
@@ -849,6 +851,8 @@ FullO3CPU<Impl>::signalDrained()
changeState(SimObject::Drained);
+ BaseCPU::switchOut();
+
if (drainEvent) {
drainEvent->process();
drainEvent = NULL;
@@ -863,6 +867,7 @@ FullO3CPU<Impl>::switchOut()
{
fetch.switchOut();
rename.switchOut();
+ iew.switchOut();
commit.switchOut();
instList.clear();
while (!removeList.empty()) {
@@ -874,6 +879,8 @@ FullO3CPU<Impl>::switchOut()
if (checker)
checker->switchOut();
#endif
+ if (tickEvent.scheduled())
+ tickEvent.squash();
}
template <class Impl>
diff --git a/src/cpu/o3/fetch_impl.hh b/src/cpu/o3/fetch_impl.hh
index 1e080181c..2d447bfe5 100644
--- a/src/cpu/o3/fetch_impl.hh
+++ b/src/cpu/o3/fetch_impl.hh
@@ -442,6 +442,7 @@ DefaultFetch<Impl>::takeOverFrom()
wroteToTimeBuffer = false;
_status = Inactive;
switchedOut = false;
+ interruptPending = false;
branchPred.takeOverFrom();
}
@@ -563,7 +564,7 @@ DefaultFetch<Impl>::fetchCacheLine(Addr fetch_PC, Fault &ret_fault, unsigned tid
unsigned flags = 0;
#endif // FULL_SYSTEM
- if (cacheBlocked || (interruptPending && flags == 0)) {
+ if (cacheBlocked || isSwitchedOut() || (interruptPending && flags == 0)) {
// Hold off fetch from getting new instructions when:
// Cache is blocked, or
// while an interrupt is pending and we're not in PAL mode, or
@@ -1152,8 +1153,8 @@ DefaultFetch<Impl>::fetch(bool &status_change)
fetch_PC = next_PC;
if (instruction->isQuiesce()) {
- warn("cycle %lli: Quiesce instruction encountered, halting fetch!",
- curTick);
+// warn("%lli: Quiesce instruction encountered, halting fetch!",
+// curTick);
fetchStatus[tid] = QuiescePending;
++numInst;
status_change = true;
@@ -1268,7 +1269,7 @@ DefaultFetch<Impl>::fetch(bool &status_change)
fetchStatus[tid] = TrapPending;
status_change = true;
- warn("cycle %lli: fault (%s) detected @ PC %08p", curTick, fault->name(), PC[tid]);
+// warn("%lli fault (%d) detected @ PC %08p", curTick, fault, PC[tid]);
#else // !FULL_SYSTEM
warn("cycle %lli: fault (%s) detected @ PC %08p", curTick, fault->name(), PC[tid]);
#endif // FULL_SYSTEM
diff --git a/src/cpu/o3/iew.hh b/src/cpu/o3/iew.hh
index 76fa008ee..a400c9fa8 100644
--- a/src/cpu/o3/iew.hh
+++ b/src/cpu/o3/iew.hh
@@ -216,6 +216,7 @@ class DefaultIEW
if (++wbOutstanding == wbMax)
ableToIssue = false;
DPRINTF(IEW, "wbOutstanding: %i\n", wbOutstanding);
+ assert(wbOutstanding <= wbMax);
#ifdef DEBUG
wbList.insert(sn);
#endif
@@ -226,6 +227,7 @@ class DefaultIEW
if (wbOutstanding-- == wbMax)
ableToIssue = true;
DPRINTF(IEW, "wbOutstanding: %i\n", wbOutstanding);
+ assert(wbOutstanding >= 0);
#ifdef DEBUG
assert(wbList.find(sn) != wbList.end());
wbList.erase(sn);
@@ -450,7 +452,9 @@ class DefaultIEW
unsigned wbCycle;
/** Number of instructions in flight that will writeback. */
- unsigned wbOutstanding;
+
+ /** Number of instructions in flight that will writeback. */
+ int wbOutstanding;
/** Writeback width. */
unsigned wbWidth;
@@ -507,6 +511,8 @@ class DefaultIEW
Stats::Scalar<> iewExecutedInsts;
/** Stat for total number of executed load instructions. */
Stats::Vector<> iewExecLoadInsts;
+ /** Stat for total number of executed store instructions. */
+// Stats::Scalar<> iewExecStoreInsts;
/** Stat for total number of squashed instructions skipped at execute. */
Stats::Scalar<> iewExecSquashedInsts;
/** Number of executed software prefetches. */
diff --git a/src/cpu/o3/iew_impl.hh b/src/cpu/o3/iew_impl.hh
index e9b24a6d4..b2baae296 100644
--- a/src/cpu/o3/iew_impl.hh
+++ b/src/cpu/o3/iew_impl.hh
@@ -162,17 +162,17 @@ DefaultIEW<Impl>::regStats()
branchMispredicts = predictedTakenIncorrect + predictedNotTakenIncorrect;
iewExecutedInsts
- .name(name() + ".EXEC:insts")
+ .name(name() + ".iewExecutedInsts")
.desc("Number of executed instructions");
iewExecLoadInsts
.init(cpu->number_of_threads)
- .name(name() + ".EXEC:loads")
+ .name(name() + ".iewExecLoadInsts")
.desc("Number of load instructions executed")
.flags(total);
iewExecSquashedInsts
- .name(name() + ".EXEC:squashedInsts")
+ .name(name() + ".iewExecSquashedInsts")
.desc("Number of squashed instructions skipped in execute");
iewExecutedSwp
@@ -372,6 +372,8 @@ DefaultIEW<Impl>::switchOut()
{
// Clear any state.
switchedOut = true;
+ assert(insts[0].empty());
+ assert(skidBuffer[0].empty());
instQueue.switchOut();
ldstQueue.switchOut();
@@ -410,7 +412,6 @@ DefaultIEW<Impl>::takeOverFrom()
updateLSQNextCycle = false;
- // @todo: Fix hardcoded number
for (int i = 0; i < issueToExecQueue.getSize(); ++i) {
issueToExecQueue.advance();
}
@@ -611,9 +612,11 @@ DefaultIEW<Impl>::instToCommit(DynInstPtr &inst)
wbNumInst = 0;
}
- assert((wbCycle * wbWidth + wbNumInst) < wbMax);
+ assert((wbCycle * wbWidth + wbNumInst) <= wbMax);
}
+ DPRINTF(IEW, "Current wb cycle: %i, width: %i, numInst: %i\nwbActual:%i\n",
+ wbCycle, wbWidth, wbNumInst, wbCycle * wbWidth + wbNumInst);
// Add finished instruction to queue to commit.
(*iewQueue)[wbCycle].insts[wbNumInst] = inst;
(*iewQueue)[wbCycle].size++;
@@ -1273,13 +1276,23 @@ DefaultIEW<Impl>::executeInsts()
// event adds the instruction to the queue to commit
fault = ldstQueue.executeLoad(inst);
} else if (inst->isStore()) {
- ldstQueue.executeStore(inst);
+ fault = ldstQueue.executeStore(inst);
// If the store had a fault then it may not have a mem req
- if (inst->req && !(inst->req->getFlags() & LOCKED)) {
+ if (!inst->isStoreConditional() && fault == NoFault) {
inst->setExecuted();
instToCommit(inst);
+ } else if (fault != NoFault) {
+ // If the instruction faulted, then we need to send it along to commit
+ // without the instruction completing.
+
+ // Send this instruction to commit, also make sure iew stage
+ // realizes there is activity.
+ inst->setExecuted();
+
+ instToCommit(inst);
+ activityThisCycle();
}
// Store conditionals will mark themselves as
@@ -1404,7 +1417,7 @@ DefaultIEW<Impl>::writebackInsts()
// E.g. Uncached loads have not actually executed when they
// are first sent to commit. Instead commit must tell the LSQ
// when it's ready to execute the uncached load.
- if (!inst->isSquashed() && inst->isExecuted()) {
+ if (!inst->isSquashed() && inst->isExecuted() && inst->getFault() == NoFault) {
int dependents = instQueue.wakeDependents(inst);
for (int i = 0; i < inst->numDestRegs(); i++) {
diff --git a/src/cpu/o3/inst_queue.hh b/src/cpu/o3/inst_queue.hh
index d745faf7b..3dd4dc658 100644
--- a/src/cpu/o3/inst_queue.hh
+++ b/src/cpu/o3/inst_queue.hh
@@ -479,13 +479,13 @@ class InstructionQueue
/** Distribution of number of instructions in the queue.
* @todo: Need to create struct to track the entry time for each
* instruction. */
- Stats::VectorDistribution<> queueResDist;
+// Stats::VectorDistribution<> queueResDist;
/** Distribution of the number of instructions issued. */
Stats::Distribution<> numIssuedDist;
/** Distribution of the cycles it takes to issue an instruction.
* @todo: Need to create struct to track the ready time for each
* instruction. */
- Stats::VectorDistribution<> issueDelayDist;
+// Stats::VectorDistribution<> issueDelayDist;
/** Number of times an instruction could not be issued because a
* FU was busy.
diff --git a/src/cpu/o3/inst_queue_impl.hh b/src/cpu/o3/inst_queue_impl.hh
index 47634f645..6edb528a9 100644
--- a/src/cpu/o3/inst_queue_impl.hh
+++ b/src/cpu/o3/inst_queue_impl.hh
@@ -230,7 +230,7 @@ InstructionQueue<Impl>::regStats()
.name(name() + ".iqSquashedNonSpecRemoved")
.desc("Number of squashed non-spec instructions that were removed")
.prereq(iqSquashedNonSpecRemoved);
-
+/*
queueResDist
.init(Num_OpClasses, 0, 99, 2)
.name(name() + ".IQ:residence:")
@@ -240,6 +240,7 @@ InstructionQueue<Impl>::regStats()
for (int i = 0; i < Num_OpClasses; ++i) {
queueResDist.subname(i, opClassStrings[i]);
}
+*/
numIssuedDist
.init(0,totalWidth,1)
.name(name() + ".ISSUE:issued_per_cycle")
@@ -268,7 +269,7 @@ InstructionQueue<Impl>::regStats()
//
// How long did instructions for a particular FU type wait prior to issue
//
-
+/*
issueDelayDist
.init(Num_OpClasses,0,99,2)
.name(name() + ".ISSUE:")
@@ -281,7 +282,7 @@ InstructionQueue<Impl>::regStats()
subname << opClassStrings[i] << "_delay";
issueDelayDist.subname(i, subname.str());
}
-
+*/
issueRate
.name(name() + ".ISSUE:rate")
.desc("Inst issue rate")
@@ -385,8 +386,16 @@ template <class Impl>
void
InstructionQueue<Impl>::switchOut()
{
+/*
+ if (!instList[0].empty() || (numEntries != freeEntries) ||
+ !readyInsts[0].empty() || !nonSpecInsts.empty() || !listOrder.empty()) {
+ dumpInsts();
+// assert(0);
+ }
+*/
resetState();
dependGraph.reset();
+ instsToExecute.clear();
switchedOut = true;
for (int i = 0; i < numThreads; ++i) {
memDepUnit[i].switchOut();
@@ -642,9 +651,12 @@ template <class Impl>
void
InstructionQueue<Impl>::processFUCompletion(DynInstPtr &inst, int fu_idx)
{
+ DPRINTF(IQ, "Processing FU completion [sn:%lli]\n", inst->seqNum);
// The CPU could have been sleeping until this op completed (*extremely*
// long latency op). Wake it if it was. This may be overkill.
if (isSwitchedOut()) {
+ DPRINTF(IQ, "FU completion not processed, IQ is switched out [sn:%lli]\n",
+ inst->seqNum);
return;
}
@@ -1036,6 +1048,10 @@ InstructionQueue<Impl>::doSquash(unsigned tid)
(squashed_inst->isMemRef() &&
!squashed_inst->memOpDone)) {
+ DPRINTF(IQ, "[tid:%i]: Instruction [sn:%lli] PC %#x "
+ "squashed.\n",
+ tid, squashed_inst->seqNum, squashed_inst->readPC());
+
// Remove the instruction from the dependency list.
if (!squashed_inst->isNonSpeculative() &&
!squashed_inst->isStoreConditional() &&
@@ -1066,7 +1082,7 @@ InstructionQueue<Impl>::doSquash(unsigned tid)
++iqSquashedOperandsExamined;
}
- } else {
+ } else if (!squashed_inst->isStoreConditional() || !squashed_inst->isCompleted()) {
NonSpecMapIt ns_inst_it =
nonSpecInsts.find(squashed_inst->seqNum);
assert(ns_inst_it != nonSpecInsts.end());
@@ -1093,10 +1109,6 @@ InstructionQueue<Impl>::doSquash(unsigned tid)
count[squashed_inst->threadNumber]--;
++freeEntries;
-
- DPRINTF(IQ, "[tid:%i]: Instruction [sn:%lli] PC %#x "
- "squashed.\n",
- tid, squashed_inst->seqNum, squashed_inst->readPC());
}
instList[tid].erase(squash_it--);
diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh
index 1358a3699..90d1a3d53 100644
--- a/src/cpu/o3/lsq_unit.hh
+++ b/src/cpu/o3/lsq_unit.hh
@@ -407,7 +407,6 @@ class LSQUnit {
// Will also need how many read/write ports the Dcache has. Or keep track
// of that in stage that is one level up, and only call executeLoad/Store
// the appropriate number of times.
-
/** Total number of loads forwarded from LSQ stores. */
Stats::Scalar<> lsqForwLoads;
@@ -421,6 +420,9 @@ class LSQUnit {
* ignored due to the instruction already being squashed. */
Stats::Scalar<> lsqIgnoredResponses;
+ /** Total number of memory ordering violations. */
+ Stats::Scalar<> lsqMemOrderViolation;
+
/** Total number of squashed stores. */
Stats::Scalar<> lsqSquashedStores;
diff --git a/src/cpu/o3/lsq_unit_impl.hh b/src/cpu/o3/lsq_unit_impl.hh
index fa716c712..98bea74fb 100644
--- a/src/cpu/o3/lsq_unit_impl.hh
+++ b/src/cpu/o3/lsq_unit_impl.hh
@@ -180,6 +180,10 @@ LSQUnit<Impl>::regStats()
.name(name() + ".ignoredResponses")
.desc("Number of memory responses ignored because the instruction is squashed");
+ lsqMemOrderViolation
+ .name(name() + ".memOrderViolation")
+ .desc("Number of memory ordering violations");
+
lsqSquashedStores
.name(name() + ".squashedStores")
.desc("Number of stores squashed");
@@ -220,8 +224,10 @@ void
LSQUnit<Impl>::switchOut()
{
switchedOut = true;
- for (int i = 0; i < loadQueue.size(); ++i)
+ for (int i = 0; i < loadQueue.size(); ++i) {
+ assert(!loadQueue[i]);
loadQueue[i] = NULL;
+ }
assert(storesToWB == 0);
}
@@ -408,6 +414,11 @@ LSQUnit<Impl>::executeLoad(DynInstPtr &inst)
if (load_fault != NoFault) {
// Send this instruction to commit, also make sure iew stage
// realizes there is activity.
+ // Mark it as executed unless it is an uncached load that
+ // needs to hit the head of commit.
+ if (!(inst->req->getFlags() & UNCACHEABLE) || inst->isAtCommit()) {
+ inst->setExecuted();
+ }
iewStage->instToCommit(inst);
iewStage->activityThisCycle();
}
@@ -467,6 +478,7 @@ LSQUnit<Impl>::executeStore(DynInstPtr &store_inst)
// A load incorrectly passed this store. Squash and refetch.
// For now return a fault to show that it was unsuccessful.
memDepViolator = loadQueue[load_idx];
+ ++lsqMemOrderViolation;
return genMachineCheckFault();
}
@@ -820,6 +832,7 @@ LSQUnit<Impl>::completeStore(int store_idx)
// A bit conservative because a store completion may not free up entries,
// but hopefully avoids two store completions in one cycle from making
// the CPU tick twice.
+ cpu->wakeCPU();
cpu->activityThisCycle();
if (store_idx == storeHead) {
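The new lsqMemOrderViolation counter is bumped in executeStore() when a younger load has already executed against a location the store is now writing, which forces a squash and refetch from that load. The standalone sketch below shows the kind of overlap check involved; LoadEntry and detectOrderViolation are illustrative, and the address comparison in the real LSQ is more involved than the simple byte-range test used here.

    #include <cstdint>
    #include <vector>

    // Minimal record of an executed load sitting in a load queue.
    struct LoadEntry {
        uint64_t seqNum;   // program order
        uint64_t addr;     // effective address
        unsigned size;     // access size in bytes
    };

    // When a store resolves its address, any *younger* load that already
    // executed against an overlapping address has violated memory ordering.
    bool detectOrderViolation(const std::vector<LoadEntry> &loadQueue,
                              uint64_t store_seq, uint64_t store_addr,
                              unsigned store_size, uint64_t &violator_seq)
    {
        for (const LoadEntry &ld : loadQueue) {
            bool younger = ld.seqNum > store_seq;
            bool overlap = ld.addr < store_addr + store_size &&
                           store_addr < ld.addr + ld.size;
            if (younger && overlap) {
                violator_seq = ld.seqNum;  // load that incorrectly passed the store
                return true;
            }
        }
        return false;
    }

    int main() {
        std::vector<LoadEntry> lq = {{10, 0x100, 8}, {12, 0x200, 4}};
        uint64_t violator = 0;
        // Store seq 11 to 0x200 conflicts with the younger load (seq 12).
        return detectOrderViolation(lq, 11, 0x200, 4, violator) ? 0 : 1;
    }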
diff --git a/src/cpu/o3/mem_dep_unit_impl.hh b/src/cpu/o3/mem_dep_unit_impl.hh
index 16f67a4e0..c649ca385 100644
--- a/src/cpu/o3/mem_dep_unit_impl.hh
+++ b/src/cpu/o3/mem_dep_unit_impl.hh
@@ -109,6 +109,9 @@ template <class MemDepPred, class Impl>
void
MemDepUnit<MemDepPred, Impl>::switchOut()
{
+ assert(instList[0].empty());
+ assert(instsToReplay.empty());
+ assert(memDepHash.empty());
// Clear any state.
for (int i = 0; i < Impl::MaxThreads; ++i) {
instList[i].clear();
diff --git a/src/cpu/o3/rename.hh b/src/cpu/o3/rename.hh
index ba26a01dd..177b9cb87 100644
--- a/src/cpu/o3/rename.hh
+++ b/src/cpu/o3/rename.hh
@@ -417,6 +417,8 @@ class DefaultRename
/** The maximum skid buffer size. */
unsigned skidBufferMax;
+ PhysRegIndex maxPhysicalRegs;
+
/** Enum to record the source of a structure full stall. Can come from
* either ROB, IQ, LSQ, and it is priortized in that order.
*/
diff --git a/src/cpu/o3/rename_impl.hh b/src/cpu/o3/rename_impl.hh
index 782c0fe5f..248d7deb6 100644
--- a/src/cpu/o3/rename_impl.hh
+++ b/src/cpu/o3/rename_impl.hh
@@ -41,7 +41,8 @@ DefaultRename<Impl>::DefaultRename(Params *params)
commitToRenameDelay(params->commitToRenameDelay),
renameWidth(params->renameWidth),
commitWidth(params->commitWidth),
- numThreads(params->numberOfThreads)
+ numThreads(params->numberOfThreads),
+ maxPhysicalRegs(params->numPhysIntRegs + params->numPhysFloatRegs)
{
_status = Inactive;
@@ -286,6 +287,11 @@ DefaultRename<Impl>::switchOut()
// Put the renamed physical register back on the free list.
freeList->addReg(hb_it->newPhysReg);
+ // Be sure to mark its register as ready if it's a misc register.
+ if (hb_it->newPhysReg >= maxPhysicalRegs) {
+ scoreboard->setReg(hb_it->newPhysReg);
+ }
+
historyBuffer[i].erase(hb_it++);
}
insts[i].clear();
@@ -889,6 +895,11 @@ DefaultRename<Impl>::doSquash(const InstSeqNum &squashed_seq_num, unsigned tid)
// Put the renamed physical register back on the free list.
freeList->addReg(hb_it->newPhysReg);
+ // Be sure to mark its register as ready if it's a misc register.
+ if (hb_it->newPhysReg >= maxPhysicalRegs) {
+ scoreboard->setReg(hb_it->newPhysReg);
+ }
+
historyBuffer[tid].erase(hb_it++);
++renameUndoneMaps;
diff --git a/src/cpu/o3/thread_context_impl.hh b/src/cpu/o3/thread_context_impl.hh
index a4546e669..25e1db21c 100755
--- a/src/cpu/o3/thread_context_impl.hh
+++ b/src/cpu/o3/thread_context_impl.hh
@@ -54,7 +54,7 @@ template <class Impl>
void
O3ThreadContext<Impl>::dumpFuncProfile()
{
- // Currently not supported
+ thread->dumpFuncProfile();
}
#endif
@@ -239,12 +239,16 @@ O3ThreadContext<Impl>::readLastSuspend()
template <class Impl>
void
O3ThreadContext<Impl>::profileClear()
-{}
+{
+ thread->profileClear();
+}
template <class Impl>
void
O3ThreadContext<Impl>::profileSample()
-{}
+{
+ thread->profileSample();
+}
#endif
template <class Impl>
diff --git a/src/cpu/o3/thread_state.hh b/src/cpu/o3/thread_state.hh
index b6f2e14c0..5fe7bb94d 100644
--- a/src/cpu/o3/thread_state.hh
+++ b/src/cpu/o3/thread_state.hh
@@ -31,8 +31,11 @@
#ifndef __CPU_O3_THREAD_STATE_HH__
#define __CPU_O3_THREAD_STATE_HH__
+#include "base/callback.hh"
+#include "base/output.hh"
#include "cpu/thread_context.hh"
#include "cpu/thread_state.hh"
+#include "sim/sim_exit.hh"
class Event;
class Process;
@@ -75,8 +78,22 @@ struct O3ThreadState : public ThreadState {
#if FULL_SYSTEM
O3ThreadState(O3CPU *_cpu, int _thread_num)
: ThreadState(-1, _thread_num),
- inSyscall(0), trapPending(0)
- { }
+ cpu(_cpu), inSyscall(0), trapPending(0)
+ {
+ if (cpu->params->profile) {
+ profile = new FunctionProfile(cpu->params->system->kernelSymtab);
+ Callback *cb =
+ new MakeCallback<O3ThreadState,
+ &O3ThreadState::dumpFuncProfile>(this);
+ registerExitCallback(cb);
+ }
+
+ // let's fill with a dummy node for now so we don't get a segfault
+ // on the first cycle when there's no node available.
+ static ProfileNode dummyNode;
+ profileNode = &dummyNode;
+ profilePC = 3;
+ }
#else
O3ThreadState(O3CPU *_cpu, int _thread_num, Process *_process, int _asid,
MemObject *mem)
@@ -95,6 +112,14 @@ struct O3ThreadState : public ThreadState {
/** Handles the syscall. */
void syscall(int64_t callnum) { process->syscall(callnum, tc); }
#endif
+
+#if FULL_SYSTEM
+ void dumpFuncProfile()
+ {
+ std::ostream *os = simout.create(csprintf("profile.%s.dat", cpu->name()));
+ profile->dump(tc, *os);
+ }
+#endif
};
#endif // __CPU_O3_THREAD_STATE_HH__
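In the FULL_SYSTEM constructor above, each thread now builds a FunctionProfile and registers a callback so the profile is written to profile.<cpu name>.dat when the simulator exits. A rough sketch of that register-a-dump-callback idea, using std::atexit and a plain file in place of gem5's Callback and simout machinery; all names and the file name here are illustrative.

    #include <cstdio>
    #include <cstdlib>
    #include <string>

    // Stand-in for a per-thread function profile.
    struct FunctionProfile {
        unsigned long samples = 0;
        void dump(std::FILE *os) const {
            std::fprintf(os, "samples %lu\n", samples);
        }
    };

    static FunctionProfile profile;
    static std::string outName;

    // Dump callback, registered once at construction time and run at exit,
    // mirroring the registerExitCallback()/dumpFuncProfile() pairing above.
    static void dumpFuncProfile() {
        if (std::FILE *os = std::fopen(outName.c_str(), "w")) {
            profile.dump(os);
            std::fclose(os);
        }
    }

    int main() {
        outName = "profile.cpu0.dat";   // hypothetical per-CPU file name
        std::atexit(dumpFuncProfile);   // analogous to registerExitCallback()
        profile.samples = 123;          // pretend some samples were taken
        return 0;                       // dump happens on exit
    }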
diff --git a/src/cpu/o3/tournament_pred.cc b/src/cpu/o3/tournament_pred.cc
index 7cf78dcb1..ffb941c77 100644
--- a/src/cpu/o3/tournament_pred.cc
+++ b/src/cpu/o3/tournament_pred.cc
@@ -62,6 +62,8 @@ TournamentBP::TournamentBP(unsigned _localPredictorSize,
for (int i = 0; i < localPredictorSize; ++i)
localCtrs[i].setBits(localCtrBits);
+ localPredictorMask = floorPow2(localPredictorSize) - 1;
+
if (!isPowerOf2(localHistoryTableSize)) {
fatal("Invalid local history table size!\n");
}
@@ -158,7 +160,7 @@ TournamentBP::lookup(Addr &branch_addr, void * &bp_history)
//Lookup in the local predictor to get its branch prediction
local_history_idx = calcLocHistIdx(branch_addr);
local_predictor_idx = localHistoryTable[local_history_idx]
- & localHistoryMask;
+ & localPredictorMask;
local_prediction = localCtrs[local_predictor_idx].read() > threshold;
//Lookup in the global predictor to get its branch prediction
@@ -176,7 +178,8 @@ TournamentBP::lookup(Addr &branch_addr, void * &bp_history)
bp_history = (void *)history;
assert(globalHistory < globalPredictorSize &&
- local_history_idx < localPredictorSize);
+ local_history_idx < localHistoryTableSize &&
+ local_predictor_idx < localPredictorSize);
// Commented code is for doing speculative update of counters and
// all histories.
@@ -234,7 +237,7 @@ TournamentBP::update(Addr &branch_addr, bool taken, void *bp_history)
// Get the local predictor's current prediction
local_history_idx = calcLocHistIdx(branch_addr);
local_predictor_hist = localHistoryTable[local_history_idx];
- local_predictor_idx = local_predictor_hist & localHistoryMask;
+ local_predictor_idx = local_predictor_hist & localPredictorMask;
// Update the choice predictor to tell it which one was correct if
// there was a prediction.
@@ -256,6 +259,7 @@ TournamentBP::update(Addr &branch_addr, bool taken, void *bp_history)
}
assert(globalHistory < globalPredictorSize &&
+ local_history_idx < localHistoryTableSize &&
local_predictor_idx < localPredictorSize);
// Update the counters and local history with the proper
diff --git a/src/cpu/o3/tournament_pred.hh b/src/cpu/o3/tournament_pred.hh
index 66b4aaae2..472944910 100644
--- a/src/cpu/o3/tournament_pred.hh
+++ b/src/cpu/o3/tournament_pred.hh
@@ -159,6 +159,9 @@ class TournamentBP
/** Size of the local predictor. */
unsigned localPredictorSize;
+ /** Mask to get the proper index bits into the predictor. */
+ unsigned localPredictorMask;
+
/** Number of bits of the local predictor's counters. */
unsigned localCtrBits;
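The tournament predictor fix above introduces localPredictorMask, derived from the size of the local counter table, and uses it (instead of the history mask) when turning a local history value into a counter index, with assertions that indices stay within both tables. A compact sketch of that masking step; the sizes chosen are hypothetical and floorPow2() is re-implemented locally to keep the example self-contained.

    #include <cassert>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Round down to a power of two; stands in for the floorPow2() call above.
    static unsigned floorPow2(unsigned x) {
        unsigned p = 1;
        while (p * 2 <= x)
            p *= 2;
        return p;
    }

    int main() {
        // Hypothetical sizes: the local history register can hold values wider
        // than the index into the table of saturating counters.
        const unsigned localPredictorSize = 2048;
        const unsigned localHistoryBits = 13;
        const unsigned localPredictorMask = floorPow2(localPredictorSize) - 1;

        std::vector<uint8_t> localCtrs(localPredictorSize, 1); // weakly not-taken

        // A raw local history value may exceed the counter table size; masking
        // with localPredictorMask (not the history mask) keeps it in bounds.
        unsigned local_history = (1u << localHistoryBits) - 5;
        unsigned local_predictor_idx = local_history & localPredictorMask;

        assert(local_predictor_idx < localPredictorSize);
        bool local_prediction = localCtrs[local_predictor_idx] > 1; // threshold
        std::printf("index %u -> predict %s\n", local_predictor_idx,
                    local_prediction ? "taken" : "not taken");
        return 0;
    }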
diff --git a/src/cpu/ozone/checker_builder.cc b/src/cpu/ozone/checker_builder.cc
index c372e51d6..b4c4686b7 100644
--- a/src/cpu/ozone/checker_builder.cc
+++ b/src/cpu/ozone/checker_builder.cc
@@ -65,6 +65,7 @@ BEGIN_DECLARE_SIM_OBJECT_PARAMS(OzoneChecker)
Param<Counter> max_insts_all_threads;
Param<Counter> max_loads_any_thread;
Param<Counter> max_loads_all_threads;
+ Param<Tick> progress_interval;
#if FULL_SYSTEM
SimObjectParam<AlphaITB *> itb;
@@ -79,6 +80,7 @@ BEGIN_DECLARE_SIM_OBJECT_PARAMS(OzoneChecker)
Param<bool> defer_registration;
Param<bool> exitOnError;
+ Param<bool> updateOnError;
Param<bool> warnOnlyOnLoadError;
Param<bool> function_trace;
Param<Tick> function_trace_start;
@@ -95,6 +97,7 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(OzoneChecker)
"terminate when any thread reaches this load count"),
INIT_PARAM(max_loads_all_threads,
"terminate when all threads have reached this load count"),
+ INIT_PARAM_DFLT(progress_interval, "CPU Progress Interval", 0),
#if FULL_SYSTEM
INIT_PARAM(itb, "Instruction TLB"),
@@ -110,6 +113,7 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(OzoneChecker)
INIT_PARAM(defer_registration, "defer system registration (for sampling)"),
INIT_PARAM(exitOnError, "exit on error"),
+ INIT_PARAM(updateOnError, "Update the checker with the main CPU's state on error"),
INIT_PARAM_DFLT(warnOnlyOnLoadError, "warn, but don't exit, if a load "
"result errors", false),
INIT_PARAM(function_trace, "Enable function trace"),
@@ -128,6 +132,7 @@ CREATE_SIM_OBJECT(OzoneChecker)
params->max_loads_any_thread = 0;
params->max_loads_all_threads = 0;
params->exitOnError = exitOnError;
+ params->updateOnError = updateOnError;
params->warnOnlyOnLoadError = warnOnlyOnLoadError;
params->deferRegistration = defer_registration;
params->functionTrace = function_trace;
@@ -140,6 +145,9 @@ CREATE_SIM_OBJECT(OzoneChecker)
temp = max_insts_all_threads;
temp = max_loads_any_thread;
temp = max_loads_all_threads;
+ Tick temp2 = progress_interval;
+ temp2++;
+ params->progress_interval = 0;
#if FULL_SYSTEM
params->itb = itb;
diff --git a/src/cpu/ozone/cpu.hh b/src/cpu/ozone/cpu.hh
index e411c12bd..8c5be9424 100644
--- a/src/cpu/ozone/cpu.hh
+++ b/src/cpu/ozone/cpu.hh
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2005 The Regents of The University of Michigan
+ * Copyright (c) 2006 The Regents of The University of Michigan
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -33,6 +33,7 @@
#include <set>
+#include "arch/regfile.hh"
#include "base/statistics.hh"
#include "base/timebuf.hh"
#include "config/full_system.hh"
@@ -81,13 +82,13 @@ template <class>
class Checker;
/**
- * Declaration of Out-of-Order CPU class. Basically it is a SimpleCPU with
- * simple out-of-order capabilities added to it. It is still a 1 CPI machine
- * (?), but is capable of handling cache misses. Basically it models having
- * a ROB/IQ by only allowing a certain amount of instructions to execute while
- * the cache miss is outstanding.
+ * Light weight out of order CPU model that approximates an out of
+ * order CPU. It is separated into a front end and a back end, with
+ * the template parameter Impl describing the classes used for each.
+ * The goal is to be able to specify through the Impl the class to use
+ * for the front end and back end, with different classes used to
+ * model different levels of detail.
*/
-
template <class Impl>
class OzoneCPU : public BaseCPU
{
@@ -257,8 +258,8 @@ class OzoneCPU : public BaseCPU
void setFuncExeInst(Counter new_val)
{ thread->funcExeInst = new_val; }
#endif
- void changeRegFileContext(TheISA::RegFile::ContextParam param,
- TheISA::RegFile::ContextVal val)
+ void changeRegFileContext(TheISA::RegContextParam param,
+ TheISA::RegContextVal val)
{ panic("Not supported on Alpha!"); }
};
@@ -273,6 +274,7 @@ class OzoneCPU : public BaseCPU
typedef OzoneThreadState<Impl> ImplState;
private:
+ // Committed thread state for the OzoneCPU.
OzoneThreadState<Impl> thread;
public:
@@ -310,12 +312,6 @@ class OzoneCPU : public BaseCPU
tickEvent.squash();
}
- private:
- Trace::InstRecord *traceData;
-
- template<typename T>
- void trace_data(T data);
-
public:
enum Status {
Running,
@@ -326,8 +322,6 @@ class OzoneCPU : public BaseCPU
Status _status;
public:
- bool checkInterrupts;
-
void post_interrupt(int int_num, int index);
void zero_fill_64(Addr addr) {
@@ -379,6 +373,7 @@ class OzoneCPU : public BaseCPU
FrontEnd *frontEnd;
BackEnd *backEnd;
+
private:
Status status() const { return _status; }
void setStatus(Status new_status) { _status = new_status; }
@@ -410,12 +405,11 @@ class OzoneCPU : public BaseCPU
// number of idle cycles
Stats::Average<> notIdleFraction;
Stats::Formula idleFraction;
- public:
+ public:
virtual void serialize(std::ostream &os);
virtual void unserialize(Checkpoint *cp, const std::string &section);
-
#if FULL_SYSTEM
/** Translates instruction request. */
Fault translateInstReq(RequestPtr &req, OzoneThreadState<Impl> *thread)
@@ -582,12 +576,9 @@ class OzoneCPU : public BaseCPU
Fault copy(Addr dest);
- InstSeqNum globalSeqNum;
-
public:
void squashFromTC();
- // @todo: This can be a useful debug function. Implement it.
void dumpInsts() { frontEnd->dumpInsts(); }
#if FULL_SYSTEM
@@ -605,7 +596,6 @@ class OzoneCPU : public BaseCPU
ThreadContext *tcBase() { return tc; }
- bool decoupledFrontEnd;
struct CommStruct {
InstSeqNum doneSeqNum;
InstSeqNum nonSpecSeqNum;
@@ -614,8 +604,13 @@ class OzoneCPU : public BaseCPU
bool stall;
};
+
+ InstSeqNum globalSeqNum;
+
TimeBuffer<CommStruct> comm;
+ bool decoupledFrontEnd;
+
bool lockFlag;
Stats::Scalar<> quiesceCycles;
diff --git a/src/cpu/ozone/cpu_builder.cc b/src/cpu/ozone/cpu_builder.cc
index e239b7a94..730158258 100644
--- a/src/cpu/ozone/cpu_builder.cc
+++ b/src/cpu/ozone/cpu_builder.cc
@@ -63,6 +63,7 @@ SimObjectParam<System *> system;
Param<int> cpu_id;
SimObjectParam<AlphaITB *> itb;
SimObjectParam<AlphaDTB *> dtb;
+Param<Tick> profile;
#else
SimObjectVectorParam<Process *> workload;
//SimObjectParam<PageTable *> page_table;
@@ -76,16 +77,18 @@ Param<Counter> max_insts_any_thread;
Param<Counter> max_insts_all_threads;
Param<Counter> max_loads_any_thread;
Param<Counter> max_loads_all_threads;
+Param<Tick> progress_interval;
//SimObjectParam<BaseCache *> icache;
//SimObjectParam<BaseCache *> dcache;
Param<unsigned> cachePorts;
Param<unsigned> width;
+Param<unsigned> frontEndLatency;
Param<unsigned> frontEndWidth;
+Param<unsigned> backEndLatency;
Param<unsigned> backEndWidth;
Param<unsigned> backEndSquashLatency;
-Param<unsigned> backEndLatency;
Param<unsigned> maxInstBufferSize;
Param<unsigned> numPhysicalRegs;
Param<unsigned> maxOutstandingMemOps;
@@ -140,6 +143,7 @@ Param<unsigned> RASSize;
Param<unsigned> LQEntries;
Param<unsigned> SQEntries;
+Param<bool> lsqLimits;
Param<unsigned> LFSTSize;
Param<unsigned> SSITSize;
@@ -181,6 +185,7 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivOzoneCPU)
INIT_PARAM(cpu_id, "processor ID"),
INIT_PARAM(itb, "Instruction translation buffer"),
INIT_PARAM(dtb, "Data translation buffer"),
+ INIT_PARAM(profile, ""),
#else
INIT_PARAM(workload, "Processes to run"),
// INIT_PARAM(page_table, "Page table"),
@@ -204,16 +209,18 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivOzoneCPU)
"Terminate when all threads have reached this load"
"count",
0),
+ INIT_PARAM_DFLT(progress_interval, "Progress interval", 0),
// INIT_PARAM_DFLT(icache, "L1 instruction cache", NULL),
// INIT_PARAM_DFLT(dcache, "L1 data cache", NULL),
INIT_PARAM_DFLT(cachePorts, "Cache Ports", 200),
INIT_PARAM_DFLT(width, "Width", 1),
+ INIT_PARAM_DFLT(frontEndLatency, "Front end latency", 1),
INIT_PARAM_DFLT(frontEndWidth, "Front end width", 1),
+ INIT_PARAM_DFLT(backEndLatency, "Back end latency", 1),
INIT_PARAM_DFLT(backEndWidth, "Back end width", 1),
INIT_PARAM_DFLT(backEndSquashLatency, "Back end squash latency", 1),
- INIT_PARAM_DFLT(backEndLatency, "Back end latency", 1),
INIT_PARAM_DFLT(maxInstBufferSize, "Maximum instruction buffer size", 16),
INIT_PARAM(numPhysicalRegs, "Number of physical registers"),
INIT_PARAM_DFLT(maxOutstandingMemOps, "Maximum outstanding memory operations", 4),
@@ -274,6 +281,7 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivOzoneCPU)
INIT_PARAM(LQEntries, "Number of load queue entries"),
INIT_PARAM(SQEntries, "Number of store queue entries"),
+ INIT_PARAM_DFLT(lsqLimits, "LSQ size limits dispatch", true),
INIT_PARAM(LFSTSize, "Last fetched store table size"),
INIT_PARAM(SSITSize, "Store set ID table size"),
@@ -336,6 +344,7 @@ CREATE_SIM_OBJECT(DerivOzoneCPU)
params->cpu_id = cpu_id;
params->itb = itb;
params->dtb = dtb;
+ params->profile = profile;
#else
params->workload = workload;
// params->pTable = page_table;
@@ -347,6 +356,7 @@ CREATE_SIM_OBJECT(DerivOzoneCPU)
params->max_insts_all_threads = max_insts_all_threads;
params->max_loads_any_thread = max_loads_any_thread;
params->max_loads_all_threads = max_loads_all_threads;
+ params->progress_interval = progress_interval;
//
// Caches
@@ -357,6 +367,7 @@ CREATE_SIM_OBJECT(DerivOzoneCPU)
params->width = width;
params->frontEndWidth = frontEndWidth;
+ params->frontEndLatency = frontEndLatency;
params->backEndWidth = backEndWidth;
params->backEndSquashLatency = backEndSquashLatency;
params->backEndLatency = backEndLatency;
@@ -414,6 +425,7 @@ CREATE_SIM_OBJECT(DerivOzoneCPU)
params->LQEntries = LQEntries;
params->SQEntries = SQEntries;
+ params->lsqLimits = lsqLimits;
params->SSITSize = SSITSize;
params->LFSTSize = LFSTSize;
diff --git a/src/cpu/ozone/cpu_impl.hh b/src/cpu/ozone/cpu_impl.hh
index 80f18434c..bf547bf94 100644
--- a/src/cpu/ozone/cpu_impl.hh
+++ b/src/cpu/ozone/cpu_impl.hh
@@ -35,6 +35,7 @@
#include "arch/isa_traits.hh" // For MachInst
#include "base/trace.hh"
#include "cpu/base.hh"
+#include "cpu/simple_thread.hh"
#include "cpu/thread_context.hh"
#include "cpu/exetrace.hh"
#include "cpu/ozone/cpu.hh"
@@ -50,9 +51,9 @@
#include "arch/alpha/types.hh"
#include "arch/vtophys.hh"
#include "base/callback.hh"
-//#include "base/remote_gdb.hh"
#include "cpu/profile.hh"
#include "kern/kernel_stats.hh"
+#include "mem/physical.hh"
#include "sim/faults.hh"
#include "sim/sim_events.hh"
#include "sim/sim_exit.hh"
@@ -68,15 +69,6 @@
using namespace TheISA;
template <class Impl>
-template<typename T>
-void
-OzoneCPU<Impl>::trace_data(T data) {
- if (traceData) {
- traceData->setData(data);
- }
-}
-
-template <class Impl>
OzoneCPU<Impl>::TickEvent::TickEvent(OzoneCPU *c, int w)
: Event(&mainEventQueue, CPU_Tick_Pri), cpu(c), width(w)
{
@@ -126,6 +118,8 @@ OzoneCPU<Impl>::OzoneCPU(Params *p)
panic("Checker enabled but not compiled in!");
#endif
} else {
+ // If checker is not being used, then the xcProxy points
+ // directly to the CPU's ExecContext.
checker = NULL;
thread.tc = &ozoneTC;
tc = &ozoneTC;
@@ -138,7 +132,7 @@ OzoneCPU<Impl>::OzoneCPU(Params *p)
thread.setStatus(ThreadContext::Suspended);
#if FULL_SYSTEM
- /***** All thread state stuff *****/
+ // Setup thread state stuff.
thread.cpu = this;
thread.setTid(0);
@@ -187,12 +181,15 @@ OzoneCPU<Impl>::OzoneCPU(Params *p)
frontEnd->setBackEnd(backEnd);
backEnd->setFrontEnd(frontEnd);
- decoupledFrontEnd = p->decoupledFrontEnd;
-
globalSeqNum = 1;
+#if FULL_SYSTEM
checkInterrupts = false;
+#endif
+ lockFlag = 0;
+
+ // Setup rename table, initializing all values to ready.
for (int i = 0; i < TheISA::TotalNumRegs; ++i) {
thread.renameTable[i] = new DynInst(this);
thread.renameTable[i]->setResultReady();
@@ -233,8 +230,6 @@ OzoneCPU<Impl>::OzoneCPU(Params *p)
thread.setVirtPort(virt_port);
#endif
- lockFlag = 0;
-
DPRINTF(OzoneCPU, "OzoneCPU: Created Ozone cpu object.\n");
}
@@ -247,6 +242,7 @@ template <class Impl>
void
OzoneCPU<Impl>::switchOut()
{
+ BaseCPU::switchOut();
switchCount = 0;
// Front end needs state from back end, so switch out the back end first.
backEnd->switchOut();
@@ -257,6 +253,8 @@ template <class Impl>
void
OzoneCPU<Impl>::signalSwitched()
{
+ // Only complete the switchout when both the front end and back
+ // end have signalled they are ready to switch.
if (++switchCount == 2) {
backEnd->doSwitchOut();
frontEnd->doSwitchOut();
@@ -266,6 +264,17 @@ OzoneCPU<Impl>::signalSwitched()
#endif
_status = SwitchedOut;
+#ifndef NDEBUG
+ // Loop through all registers
+ for (int i = 0; i < AlphaISA::TotalNumRegs; ++i) {
+ assert(thread.renameTable[i] == frontEnd->renameTable[i]);
+
+ assert(thread.renameTable[i] == backEnd->renameTable[i]);
+
+ DPRINTF(OzoneCPU, "Checking if register %i matches.\n", i);
+ }
+#endif
+
if (tickEvent.scheduled())
tickEvent.squash();
}
@@ -278,13 +287,25 @@ OzoneCPU<Impl>::takeOverFrom(BaseCPU *oldCPU)
{
BaseCPU::takeOverFrom(oldCPU);
+ thread.trapPending = false;
+ thread.inSyscall = false;
+
backEnd->takeOverFrom();
frontEnd->takeOverFrom();
+ frontEnd->renameTable.copyFrom(thread.renameTable);
+ backEnd->renameTable.copyFrom(thread.renameTable);
assert(!tickEvent.scheduled());
+#ifndef NDEBUG
+ // Check rename table.
+ for (int i = 0; i < TheISA::TotalNumRegs; ++i) {
+ assert(thread.renameTable[i]->isResultReady());
+ }
+#endif
+
// @todo: Fix hardcoded number
// Clear out any old information in time buffer.
- for (int i = 0; i < 6; ++i) {
+ for (int i = 0; i < 15; ++i) {
comm.advance();
}
@@ -316,6 +337,10 @@ OzoneCPU<Impl>::activateContext(int thread_num, int delay)
notIdleFraction++;
scheduleTickEvent(delay);
_status = Running;
+#if FULL_SYSTEM
+ if (thread.quiesceEvent && thread.quiesceEvent->scheduled())
+ thread.quiesceEvent->deschedule();
+#endif
thread.setStatus(ThreadContext::Active);
frontEnd->wakeFromQuiesce();
}
@@ -393,7 +418,7 @@ template <class Impl>
void
OzoneCPU<Impl>::resetStats()
{
- startNumInst = numInst;
+// startNumInst = numInst;
notIdleFraction = (_status != Idle);
}
@@ -441,6 +466,15 @@ OzoneCPU<Impl>::serialize(std::ostream &os)
ozoneTC.serialize(os);
nameOut(os, csprintf("%s.tickEvent", name()));
tickEvent.serialize(os);
+
+ // Use SimpleThread's ability to checkpoint to make it easier to
+ // write out the registers. Also make this static so it doesn't
+ // get instantiated multiple times (causes a panic in statistics).
+ static SimpleThread temp;
+
+ nameOut(os, csprintf("%s.xc.0", name()));
+ temp.copyTC(thread.getTC());
+ temp.serialize(os);
}
template <class Impl>
@@ -451,6 +485,15 @@ OzoneCPU<Impl>::unserialize(Checkpoint *cp, const std::string &section)
UNSERIALIZE_ENUM(_status);
ozoneTC.unserialize(cp, csprintf("%s.tc", section));
tickEvent.unserialize(cp, csprintf("%s.tickEvent", section));
+
+ // Use SimpleThread's ability to checkpoint to make it easier to
+ // read in the registers. Also make this static so it doesn't
+ // get instantiated multiple times (causes a panic in statistics).
+ static SimpleThread temp;
+
+ temp.copyTC(thread.getTC());
+ temp.unserialize(cp, csprintf("%s.xc.0", section));
+ thread.getTC()->copyArchRegs(temp.getTC());
}
template <class Impl>
@@ -705,11 +748,13 @@ OzoneCPU<Impl>::processInterrupts()
if (ipl && ipl > thread.readMiscReg(IPR_IPLR)) {
thread.setMiscReg(IPR_ISR, summary);
thread.setMiscReg(IPR_INTID, ipl);
+#if USE_CHECKER
// @todo: Make this more transparent
if (checker) {
checker->threadBase()->setMiscReg(IPR_ISR, summary);
checker->threadBase()->setMiscReg(IPR_INTID, ipl);
}
+#endif
Fault fault = new InterruptFault;
fault->invoke(thread.getTC());
DPRINTF(Flow, "Interrupt! IPLR=%d ipl=%d summary=%x\n",
@@ -810,7 +855,9 @@ OzoneCPU<Impl>::OzoneTC::halt()
template <class Impl>
void
OzoneCPU<Impl>::OzoneTC::dumpFuncProfile()
-{ }
+{
+ thread->dumpFuncProfile();
+}
#endif
template <class Impl>
@@ -829,6 +876,7 @@ OzoneCPU<Impl>::OzoneTC::takeOverFrom(ThreadContext *old_context)
copyArchRegs(old_context);
setCpuId(old_context->readCpuId());
+ thread->setInst(old_context->getInst());
#if !FULL_SYSTEM
setFuncExeInst(old_context->readFuncExeInst());
#else
@@ -842,6 +890,7 @@ OzoneCPU<Impl>::OzoneTC::takeOverFrom(ThreadContext *old_context)
thread->quiesceEvent->tc = this;
}
+ // Copy kernel stats pointer from old context.
thread->kernelStats = old_context->getKernelStats();
// storeCondFailures = 0;
cpu->lockFlag = false;
@@ -863,7 +912,11 @@ OzoneCPU<Impl>::OzoneTC::regStats(const std::string &name)
template <class Impl>
void
OzoneCPU<Impl>::OzoneTC::serialize(std::ostream &os)
-{ }
+{
+ // Once serialization is added, serialize the quiesce event and
+ // kernel stats. Will need to make sure there aren't multiple
+ // things that serialize them.
+}
template <class Impl>
void
@@ -896,16 +949,14 @@ template <class Impl>
void
OzoneCPU<Impl>::OzoneTC::profileClear()
{
- if (thread->profile)
- thread->profile->clear();
+ thread->profileClear();
}
template <class Impl>
void
OzoneCPU<Impl>::OzoneTC::profileSample()
{
- if (thread->profile)
- thread->profile->sample(thread->profileNode, thread->profilePC);
+ thread->profileSample();
}
#endif
@@ -916,7 +967,6 @@ OzoneCPU<Impl>::OzoneTC::getThreadNum()
return thread->readTid();
}
-// Also somewhat obnoxious. Really only used for the TLB fault.
template <class Impl>
TheISA::MachInst
OzoneCPU<Impl>::OzoneTC::getInst()
@@ -934,14 +984,20 @@ OzoneCPU<Impl>::OzoneTC::copyArchRegs(ThreadContext *tc)
cpu->frontEnd->setPC(thread->PC);
cpu->frontEnd->setNextPC(thread->nextPC);
- for (int i = 0; i < TheISA::TotalNumRegs; ++i) {
- if (i < TheISA::FP_Base_DepTag) {
- thread->renameTable[i]->setIntResult(tc->readIntReg(i));
- } else if (i < (TheISA::FP_Base_DepTag + TheISA::NumFloatRegs)) {
- int fp_idx = i - TheISA::FP_Base_DepTag;
- thread->renameTable[i]->setDoubleResult(
- tc->readFloatReg(fp_idx, 64));
- }
+ // First loop through the integer registers.
+ for (int i = 0; i < TheISA::NumIntRegs; ++i) {
+/* DPRINTF(OzoneCPU, "Copying over register %i, had data %lli, "
+ "now has data %lli.\n",
+ i, thread->renameTable[i]->readIntResult(),
+ tc->readIntReg(i));
+*/
+ thread->renameTable[i]->setIntResult(tc->readIntReg(i));
+ }
+
+ // Then loop through the floating point registers.
+ for (int i = 0; i < TheISA::NumFloatRegs; ++i) {
+ int fp_idx = i + TheISA::FP_Base_DepTag;
+ thread->renameTable[fp_idx]->setIntResult(tc->readFloatRegBits(i));
}
#if !FULL_SYSTEM
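The rewritten copyArchRegs() above now walks the integer registers and then the floating-point registers separately, reading the FP values as raw bits via readFloatRegBits(). The standalone sketch below shows a bit-exact register copy in the same spirit; RegFile and the 32-entry register counts are assumptions for illustration, not the Ozone interfaces.

    #include <array>
    #include <cstdint>
    #include <cstring>

    // Toy register files standing in for two thread contexts.
    struct RegFile {
        std::array<uint64_t, 32> intRegs{};
        std::array<double, 32>   floatRegs{};
    };

    // Read an FP register as raw bits so the copy is bit-exact, with no
    // rounding or conversion on the way through.
    static uint64_t floatRegBits(const RegFile &rf, int idx) {
        uint64_t bits;
        std::memcpy(&bits, &rf.floatRegs[idx], sizeof(bits));
        return bits;
    }

    static void setFloatRegBits(RegFile &rf, int idx, uint64_t bits) {
        std::memcpy(&rf.floatRegs[idx], &bits, sizeof(bits));
    }

    // Copy architectural state: integer registers first, then floating-point
    // registers as raw bits, mirroring the two loops in the hunk above.
    void copyArchRegs(RegFile &dest, const RegFile &src) {
        for (int i = 0; i < 32; ++i)
            dest.intRegs[i] = src.intRegs[i];
        for (int i = 0; i < 32; ++i)
            setFloatRegBits(dest, i, floatRegBits(src, i));
    }

    int main() {
        RegFile oldCtx, newCtx;
        oldCtx.intRegs[3] = 7;
        oldCtx.floatRegs[5] = 2.5;
        copyArchRegs(newCtx, oldCtx);
        return (newCtx.intRegs[3] == 7 && newCtx.floatRegs[5] == 2.5) ? 0 : 1;
    }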
diff --git a/src/cpu/ozone/dyn_inst_impl.hh b/src/cpu/ozone/dyn_inst_impl.hh
index ba0d70417..db1460eba 100644
--- a/src/cpu/ozone/dyn_inst_impl.hh
+++ b/src/cpu/ozone/dyn_inst_impl.hh
@@ -215,14 +215,14 @@ OzoneDynInst<Impl>::clearMemDependents()
}
template <class Impl>
-MiscReg
+TheISA::MiscReg
OzoneDynInst<Impl>::readMiscReg(int misc_reg)
{
return this->thread->readMiscReg(misc_reg);
}
template <class Impl>
-MiscReg
+TheISA::MiscReg
OzoneDynInst<Impl>::readMiscRegWithEffect(int misc_reg, Fault &fault)
{
return this->thread->readMiscRegWithEffect(misc_reg, fault);
diff --git a/src/cpu/ozone/front_end.hh b/src/cpu/ozone/front_end.hh
index 3ed3c4d18..5ffd3666e 100644
--- a/src/cpu/ozone/front_end.hh
+++ b/src/cpu/ozone/front_end.hh
@@ -34,6 +34,7 @@
#include <deque>
#include "arch/utility.hh"
+#include "base/timebuf.hh"
#include "cpu/inst_seq.hh"
#include "cpu/o3/bpred_unit.hh"
#include "cpu/ozone/rename_table.hh"
@@ -246,15 +247,21 @@ class FrontEnd
void dumpInsts();
private:
+ TimeBuffer<int> numInstsReady;
+
typedef typename std::deque<DynInstPtr> InstBuff;
typedef typename InstBuff::iterator InstBuffIt;
+ InstBuff feBuffer;
+
InstBuff instBuffer;
int instBufferSize;
int maxInstBufferSize;
+ int latency;
+
int width;
int freeRegs;
diff --git a/src/cpu/ozone/front_end_impl.hh b/src/cpu/ozone/front_end_impl.hh
index 1b120460a..d34716de6 100644
--- a/src/cpu/ozone/front_end_impl.hh
+++ b/src/cpu/ozone/front_end_impl.hh
@@ -92,8 +92,10 @@ FrontEnd<Impl>::FrontEnd(Params *params)
: branchPred(params),
icachePort(this),
mem(params->mem),
+ numInstsReady(params->frontEndLatency, 0),
instBufferSize(0),
maxInstBufferSize(params->maxInstBufferSize),
+ latency(params->frontEndLatency),
width(params->frontEndWidth),
freeRegs(params->numPhysicalRegs),
numPhysRegs(params->numPhysicalRegs),
@@ -326,6 +328,18 @@ FrontEnd<Impl>::tick()
if (switchedOut)
return;
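+    // Instructions queued 'latency' cycles ago are now visible to the
+    // back end; move them from the instBuffer into the feBuffer.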
+ for (int insts_to_queue = numInstsReady[-latency];
+ !instBuffer.empty() && insts_to_queue;
+ --insts_to_queue)
+ {
+ DPRINTF(FE, "Transferring instruction [sn:%lli] to the feBuffer\n",
+ instBuffer.front()->seqNum);
+ feBuffer.push_back(instBuffer.front());
+ instBuffer.pop_front();
+ }
+
+ numInstsReady.advance();
+
// @todo: Maybe I want to just have direct communication...
if (fromCommit->doneSeqNum) {
branchPred.update(fromCommit->doneSeqNum, 0);
@@ -339,8 +353,8 @@ FrontEnd<Impl>::tick()
cacheBlkValid = true;
status = Running;
- if (barrierInst)
- status = SerializeBlocked;
+// if (barrierInst)
+// status = SerializeBlocked;
if (freeRegs <= 0)
status = RenameBlocked;
checkBE();
@@ -414,11 +428,12 @@ FrontEnd<Impl>::tick()
// latency
instBuffer.push_back(inst);
++instBufferSize;
+ numInstsReady[0]++;
++num_inst;
#if FULL_SYSTEM
if (inst->isQuiesce()) {
- warn("%lli: Quiesce instruction encountered, halting fetch!", curTick);
+// warn("%lli: Quiesce instruction encountered, halting fetch!", curTick);
status = QuiescePending;
break;
}
@@ -572,10 +587,10 @@ FrontEnd<Impl>::processBarriers(DynInstPtr &inst)
// Change status over to SerializeBlocked so that other stages know
// what this is blocked on.
- status = SerializeBlocked;
+// status = SerializeBlocked;
- barrierInst = inst;
- return true;
+// barrierInst = inst;
+// return true;
} else if ((inst->isStoreConditional() || inst->isSerializeAfter())
&& !inst->isSerializeHandled()) {
DPRINTF(FE, "Serialize after instruction encountered.\n");
@@ -620,6 +635,7 @@ FrontEnd<Impl>::handleFault(Fault &fault)
instruction->fault = fault;
instruction->setCanIssue();
instBuffer.push_back(instruction);
+ numInstsReady[0]++;
++instBufferSize;
}
@@ -649,6 +665,21 @@ FrontEnd<Impl>::squash(const InstSeqNum &squash_num, const Addr &next_PC,
freeRegs+= inst->numDestRegs();
}
+ while (!feBuffer.empty() &&
+ feBuffer.back()->seqNum > squash_num) {
+ DynInstPtr inst = feBuffer.back();
+
+ DPRINTF(FE, "Squashing instruction [sn:%lli] PC %#x\n",
+ inst->seqNum, inst->readPC());
+
+ inst->clearDependents();
+
+ feBuffer.pop_back();
+ --instBufferSize;
+
+ freeRegs+= inst->numDestRegs();
+ }
+
// Copy over rename table from the back end.
renameTable.copyFrom(backEnd->renameTable);
@@ -666,12 +697,12 @@ FrontEnd<Impl>::squash(const InstSeqNum &squash_num, const Addr &next_PC,
DPRINTF(FE, "Squashing outstanding Icache access.\n");
memReq = NULL;
}
-
+/*
if (status == SerializeBlocked) {
assert(barrierInst->seqNum > squash_num);
barrierInst = NULL;
}
-
+*/
// Unless this squash originated from the front end, we're probably
// in running mode now.
// Actually might want to make this latency dependent.
@@ -683,13 +714,22 @@ template <class Impl>
typename Impl::DynInstPtr
FrontEnd<Impl>::getInst()
{
- if (instBufferSize == 0) {
+ if (feBuffer.empty()) {
return NULL;
}
- DynInstPtr inst = instBuffer.front();
+ DynInstPtr inst = feBuffer.front();
- instBuffer.pop_front();
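+    // Hold serializing (and IPR access) instructions back until the ROB
+    // has drained, so they are handed to the back end in isolation.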
+ if (inst->isSerializeBefore() || inst->isIprAccess()) {
+ DPRINTF(FE, "Back end is getting a serialize before inst\n");
+ if (!backEnd->robEmpty()) {
+ DPRINTF(FE, "Rob is not empty yet, not returning inst\n");
+ return NULL;
+ }
+ inst->clearSerializeBefore();
+ }
+
+ feBuffer.pop_front();
--instBufferSize;
@@ -784,11 +824,11 @@ FrontEnd<Impl>::updateStatus()
}
if (status == BEBlocked && !be_block) {
- if (barrierInst) {
- status = SerializeBlocked;
- } else {
+// if (barrierInst) {
+// status = SerializeBlocked;
+// } else {
status = Running;
- }
+// }
ret_val = true;
}
return ret_val;
@@ -810,6 +850,7 @@ template <class Impl>
typename Impl::DynInstPtr
FrontEnd<Impl>::getInstFromCacheline()
{
+/*
if (status == SerializeComplete) {
DynInstPtr inst = barrierInst;
status = Running;
@@ -817,7 +858,7 @@ FrontEnd<Impl>::getInstFromCacheline()
inst->clearSerializeBefore();
return inst;
}
-
+*/
InstSeqNum inst_seq;
MachInst inst;
// @todo: Fix this magic number used here to handle word offset (and
@@ -932,6 +973,7 @@ FrontEnd<Impl>::doSwitchOut()
squash(0, 0);
instBuffer.clear();
instBufferSize = 0;
+ feBuffer.clear();
status = Idle;
}
diff --git a/src/cpu/ozone/inorder_back_end_impl.hh b/src/cpu/ozone/inorder_back_end_impl.hh
index 701fc0ee9..16ebac163 100644
--- a/src/cpu/ozone/inorder_back_end_impl.hh
+++ b/src/cpu/ozone/inorder_back_end_impl.hh
@@ -284,7 +284,7 @@ InorderBackEnd<Impl>::executeInsts()
}
inst->setExecuted();
- inst->setCompleted();
+ inst->setResultReady();
inst->setCanCommit();
instList.pop_front();
diff --git a/src/cpu/ozone/inst_queue_impl.hh b/src/cpu/ozone/inst_queue_impl.hh
index f2d80e621..32a940241 100644
--- a/src/cpu/ozone/inst_queue_impl.hh
+++ b/src/cpu/ozone/inst_queue_impl.hh
@@ -850,13 +850,13 @@ template <class Impl>
void
InstQueue<Impl>::addReadyMemInst(DynInstPtr &ready_inst)
{
- OpClass op_class = ready_inst->opClass();
+// OpClass op_class = ready_inst->opClass();
readyInsts.push(ready_inst);
DPRINTF(IQ, "Instruction is ready to issue, putting it onto "
"the ready list, PC %#x opclass:%i [sn:%lli].\n",
- ready_inst->readPC(), op_class, ready_inst->seqNum);
+ ready_inst->readPC(), ready_inst->opClass(), ready_inst->seqNum);
}
/*
template <class Impl>
@@ -1177,11 +1177,11 @@ InstQueue<Impl>::addIfReady(DynInstPtr &inst)
return;
}
- OpClass op_class = inst->opClass();
+// OpClass op_class = inst->opClass();
DPRINTF(IQ, "Instruction is ready to issue, putting it onto "
"the ready list, PC %#x opclass:%i [sn:%lli].\n",
- inst->readPC(), op_class, inst->seqNum);
+ inst->readPC(), inst->opClass(), inst->seqNum);
readyInsts.push(inst);
}
diff --git a/src/cpu/ozone/lw_back_end.hh b/src/cpu/ozone/lw_back_end.hh
index d836ceebd..49c6a1ae2 100644
--- a/src/cpu/ozone/lw_back_end.hh
+++ b/src/cpu/ozone/lw_back_end.hh
@@ -80,7 +80,7 @@ class LWBackEnd
TimeBuffer<IssueToExec> i2e;
typename TimeBuffer<IssueToExec>::wire instsToExecute;
TimeBuffer<ExecToCommit> e2c;
- TimeBuffer<Writeback> numInstsToWB;
+ TimeBuffer<int> numInstsToWB;
TimeBuffer<CommStruct> *comm;
typename TimeBuffer<CommStruct>::wire toIEW;
@@ -139,7 +139,7 @@ class LWBackEnd
Tick lastCommitCycle;
- bool robEmpty() { return instList.empty(); }
+ bool robEmpty() { return numInsts == 0; }
bool isFull() { return numInsts >= numROBEntries; }
bool isBlocked() { return status == Blocked || dispatchStatus == Blocked; }
@@ -194,6 +194,7 @@ class LWBackEnd
}
void instToCommit(DynInstPtr &inst);
+ void readyInstsForCommit();
void switchOut();
void doSwitchOut();
@@ -255,12 +256,13 @@ class LWBackEnd
RenameTable<Impl> renameTable;
private:
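+    /** Back-end latency, in cycles, between writeback and an
+     *  instruction becoming eligible to commit. */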
+ int latency;
+
// General back end width. Used if the more specific isn't given.
int width;
// Dispatch width.
int dispatchWidth;
- int numDispatchEntries;
int dispatchSize;
int waitingInsts;
@@ -285,6 +287,7 @@ class LWBackEnd
int numROBEntries;
int numInsts;
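+    /** Whether LSQ occupancy limits dispatch. */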
+ bool lsqLimits;
std::set<InstSeqNum> waitingMemOps;
typedef std::set<InstSeqNum>::iterator MemIt;
@@ -295,9 +298,6 @@ class LWBackEnd
InstSeqNum squashSeqNum;
Addr squashNextPC;
- Fault faultFromFetch;
- bool fetchHasFault;
-
bool switchedOut;
bool switchPending;
@@ -321,8 +321,6 @@ class LWBackEnd
std::list<DynInstPtr> replayList;
std::list<DynInstPtr> writeback;
- int latency;
-
int squashLatency;
bool exactFullStall;
@@ -331,37 +329,39 @@ class LWBackEnd
/* Stats::Scalar<> dcacheStallCycles;
Counter lastDcacheStall;
*/
- Stats::Vector<> rob_cap_events;
- Stats::Vector<> rob_cap_inst_count;
- Stats::Vector<> iq_cap_events;
- Stats::Vector<> iq_cap_inst_count;
+ Stats::Vector<> robCapEvents;
+ Stats::Vector<> robCapInstCount;
+ Stats::Vector<> iqCapEvents;
+ Stats::Vector<> iqCapInstCount;
// total number of instructions executed
- Stats::Vector<> exe_inst;
- Stats::Vector<> exe_swp;
- Stats::Vector<> exe_nop;
- Stats::Vector<> exe_refs;
- Stats::Vector<> exe_loads;
- Stats::Vector<> exe_branches;
+ Stats::Vector<> exeInst;
+ Stats::Vector<> exeSwp;
+ Stats::Vector<> exeNop;
+ Stats::Vector<> exeRefs;
+ Stats::Vector<> exeLoads;
+ Stats::Vector<> exeBranches;
- Stats::Vector<> issued_ops;
+ Stats::Vector<> issuedOps;
// total number of loads forwaded from LSQ stores
- Stats::Vector<> lsq_forw_loads;
+ Stats::Vector<> lsqForwLoads;
// total number of loads ignored due to invalid addresses
- Stats::Vector<> inv_addr_loads;
+ Stats::Vector<> invAddrLoads;
// total number of software prefetches ignored due to invalid addresses
- Stats::Vector<> inv_addr_swpfs;
+ Stats::Vector<> invAddrSwpfs;
// ready loads blocked due to memory disambiguation
- Stats::Vector<> lsq_blocked_loads;
+ Stats::Vector<> lsqBlockedLoads;
Stats::Scalar<> lsqInversion;
- Stats::Vector<> n_issued_dist;
- Stats::VectorDistribution<> issue_delay_dist;
+ Stats::Vector<> nIssuedDist;
+/*
+ Stats::VectorDistribution<> issueDelayDist;
- Stats::VectorDistribution<> queue_res_dist;
+ Stats::VectorDistribution<> queueResDist;
+*/
/*
Stats::Vector<> stat_fu_busy;
Stats::Vector2d<> stat_fuBusy;
@@ -379,37 +379,37 @@ class LWBackEnd
Stats::Formula commit_ipb;
Stats::Formula lsq_inv_rate;
*/
- Stats::Vector<> writeback_count;
- Stats::Vector<> producer_inst;
- Stats::Vector<> consumer_inst;
- Stats::Vector<> wb_penalized;
+ Stats::Vector<> writebackCount;
+ Stats::Vector<> producerInst;
+ Stats::Vector<> consumerInst;
+ Stats::Vector<> wbPenalized;
- Stats::Formula wb_rate;
- Stats::Formula wb_fanout;
- Stats::Formula wb_penalized_rate;
+ Stats::Formula wbRate;
+ Stats::Formula wbFanout;
+ Stats::Formula wbPenalizedRate;
// total number of instructions committed
- Stats::Vector<> stat_com_inst;
- Stats::Vector<> stat_com_swp;
- Stats::Vector<> stat_com_refs;
- Stats::Vector<> stat_com_loads;
- Stats::Vector<> stat_com_membars;
- Stats::Vector<> stat_com_branches;
+ Stats::Vector<> statComInst;
+ Stats::Vector<> statComSwp;
+ Stats::Vector<> statComRefs;
+ Stats::Vector<> statComLoads;
+ Stats::Vector<> statComMembars;
+ Stats::Vector<> statComBranches;
- Stats::Distribution<> n_committed_dist;
+ Stats::Distribution<> nCommittedDist;
- Stats::Scalar<> commit_eligible_samples;
- Stats::Vector<> commit_eligible;
+ Stats::Scalar<> commitEligibleSamples;
+ Stats::Vector<> commitEligible;
Stats::Vector<> squashedInsts;
Stats::Vector<> ROBSquashedInsts;
- Stats::Scalar<> ROB_fcount;
- Stats::Formula ROB_full_rate;
+ Stats::Scalar<> ROBFcount;
+ Stats::Formula ROBFullRate;
- Stats::Vector<> ROB_count; // cumulative ROB occupancy
- Stats::Formula ROB_occ_rate;
- Stats::VectorDistribution<> ROB_occ_dist;
+ Stats::Vector<> ROBCount; // cumulative ROB occupancy
+ Stats::Formula ROBOccRate;
+// Stats::VectorDistribution<> ROBOccDist;
public:
void dumpInsts();
diff --git a/src/cpu/ozone/lw_back_end_impl.hh b/src/cpu/ozone/lw_back_end_impl.hh
index a4f1d805e..c39b9e08b 100644
--- a/src/cpu/ozone/lw_back_end_impl.hh
+++ b/src/cpu/ozone/lw_back_end_impl.hh
@@ -141,13 +141,14 @@ LWBackEnd<Impl>::replayMemInst(DynInstPtr &inst)
template <class Impl>
LWBackEnd<Impl>::LWBackEnd(Params *params)
- : d2i(5, 5), i2e(5, 5), e2c(5, 5), numInstsToWB(5, 5),
+ : d2i(5, 5), i2e(5, 5), e2c(5, 5), numInstsToWB(params->backEndLatency, 0),
trapSquash(false), tcSquash(false),
- width(params->backEndWidth), exactFullStall(true)
+ latency(params->backEndLatency),
+ width(params->backEndWidth), lsqLimits(params->lsqLimits),
+ exactFullStall(true)
{
numROBEntries = params->numROBEntries;
numInsts = 0;
- numDispatchEntries = 32;
maxOutstandingMemOps = params->maxOutstandingMemOps;
numWaitingMemOps = 0;
waitingInsts = 0;
@@ -184,78 +185,79 @@ void
LWBackEnd<Impl>::regStats()
{
using namespace Stats;
- rob_cap_events
+ LSQ.regStats();
+
+ robCapEvents
.init(cpu->number_of_threads)
.name(name() + ".ROB:cap_events")
.desc("number of cycles where ROB cap was active")
.flags(total)
;
- rob_cap_inst_count
+ robCapInstCount
.init(cpu->number_of_threads)
.name(name() + ".ROB:cap_inst")
.desc("number of instructions held up by ROB cap")
.flags(total)
;
- iq_cap_events
+ iqCapEvents
.init(cpu->number_of_threads)
.name(name() +".IQ:cap_events" )
.desc("number of cycles where IQ cap was active")
.flags(total)
;
- iq_cap_inst_count
+ iqCapInstCount
.init(cpu->number_of_threads)
.name(name() + ".IQ:cap_inst")
.desc("number of instructions held up by IQ cap")
.flags(total)
;
-
- exe_inst
+ exeInst
.init(cpu->number_of_threads)
.name(name() + ".ISSUE:count")
.desc("number of insts issued")
.flags(total)
;
- exe_swp
+ exeSwp
.init(cpu->number_of_threads)
.name(name() + ".ISSUE:swp")
.desc("number of swp insts issued")
.flags(total)
;
- exe_nop
+ exeNop
.init(cpu->number_of_threads)
.name(name() + ".ISSUE:nop")
.desc("number of nop insts issued")
.flags(total)
;
- exe_refs
+ exeRefs
.init(cpu->number_of_threads)
.name(name() + ".ISSUE:refs")
.desc("number of memory reference insts issued")
.flags(total)
;
- exe_loads
+ exeLoads
.init(cpu->number_of_threads)
.name(name() + ".ISSUE:loads")
.desc("number of load insts issued")
.flags(total)
;
- exe_branches
+ exeBranches
.init(cpu->number_of_threads)
.name(name() + ".ISSUE:branches")
.desc("Number of branches issued")
.flags(total)
;
- issued_ops
+ issuedOps
.init(cpu->number_of_threads)
.name(name() + ".ISSUE:op_count")
.desc("number of insts issued")
@@ -272,28 +274,28 @@ LWBackEnd<Impl>::regStats()
//
// Other stats
//
- lsq_forw_loads
+ lsqForwLoads
.init(cpu->number_of_threads)
.name(name() + ".LSQ:forw_loads")
.desc("number of loads forwarded via LSQ")
.flags(total)
;
- inv_addr_loads
+ invAddrLoads
.init(cpu->number_of_threads)
.name(name() + ".ISSUE:addr_loads")
.desc("number of invalid-address loads")
.flags(total)
;
- inv_addr_swpfs
+ invAddrSwpfs
.init(cpu->number_of_threads)
.name(name() + ".ISSUE:addr_swpfs")
.desc("number of invalid-address SW prefetches")
.flags(total)
;
- lsq_blocked_loads
+ lsqBlockedLoads
.init(cpu->number_of_threads)
.name(name() + ".LSQ:blocked_loads")
.desc("number of ready loads not issued due to memory disambiguation")
@@ -305,51 +307,52 @@ LWBackEnd<Impl>::regStats()
.desc("Number of times LSQ instruction issued early")
;
- n_issued_dist
+ nIssuedDist
.init(issueWidth + 1)
.name(name() + ".ISSUE:issued_per_cycle")
.desc("Number of insts issued each cycle")
.flags(total | pdf | dist)
;
- issue_delay_dist
+/*
+ issueDelayDist
.init(Num_OpClasses,0,99,2)
.name(name() + ".ISSUE:")
.desc("cycles from operands ready to issue")
.flags(pdf | cdf)
;
- queue_res_dist
+ queueResDist
.init(Num_OpClasses, 0, 99, 2)
.name(name() + ".IQ:residence:")
.desc("cycles from dispatch to issue")
.flags(total | pdf | cdf )
;
for (int i = 0; i < Num_OpClasses; ++i) {
- queue_res_dist.subname(i, opClassStrings[i]);
+ queueResDist.subname(i, opClassStrings[i]);
}
-
- writeback_count
+*/
+ writebackCount
.init(cpu->number_of_threads)
.name(name() + ".WB:count")
.desc("cumulative count of insts written-back")
.flags(total)
;
- producer_inst
+ producerInst
.init(cpu->number_of_threads)
.name(name() + ".WB:producers")
.desc("num instructions producing a value")
.flags(total)
;
- consumer_inst
+ consumerInst
.init(cpu->number_of_threads)
.name(name() + ".WB:consumers")
.desc("num instructions consuming a value")
.flags(total)
;
- wb_penalized
+ wbPenalized
.init(cpu->number_of_threads)
.name(name() + ".WB:penalized")
.desc("number of instrctions required to write to 'other' IQ")
@@ -357,71 +360,71 @@ LWBackEnd<Impl>::regStats()
;
- wb_penalized_rate
+ wbPenalizedRate
.name(name() + ".WB:penalized_rate")
.desc ("fraction of instructions written-back that wrote to 'other' IQ")
.flags(total)
;
- wb_penalized_rate = wb_penalized / writeback_count;
+ wbPenalizedRate = wbPenalized / writebackCount;
- wb_fanout
+ wbFanout
.name(name() + ".WB:fanout")
.desc("average fanout of values written-back")
.flags(total)
;
- wb_fanout = producer_inst / consumer_inst;
+ wbFanout = producerInst / consumerInst;
- wb_rate
+ wbRate
.name(name() + ".WB:rate")
.desc("insts written-back per cycle")
.flags(total)
;
- wb_rate = writeback_count / cpu->numCycles;
+ wbRate = writebackCount / cpu->numCycles;
- stat_com_inst
+ statComInst
.init(cpu->number_of_threads)
.name(name() + ".COM:count")
.desc("Number of instructions committed")
.flags(total)
;
- stat_com_swp
+ statComSwp
.init(cpu->number_of_threads)
.name(name() + ".COM:swp_count")
.desc("Number of s/w prefetches committed")
.flags(total)
;
- stat_com_refs
+ statComRefs
.init(cpu->number_of_threads)
.name(name() + ".COM:refs")
.desc("Number of memory references committed")
.flags(total)
;
- stat_com_loads
+ statComLoads
.init(cpu->number_of_threads)
.name(name() + ".COM:loads")
.desc("Number of loads committed")
.flags(total)
;
- stat_com_membars
+ statComMembars
.init(cpu->number_of_threads)
.name(name() + ".COM:membars")
.desc("Number of memory barriers committed")
.flags(total)
;
- stat_com_branches
+ statComBranches
.init(cpu->number_of_threads)
.name(name() + ".COM:branches")
.desc("Number of branches committed")
.flags(total)
;
- n_committed_dist
+ nCommittedDist
.init(0,commitWidth,1)
.name(name() + ".COM:committed_per_cycle")
.desc("Number of insts commited each cycle")
@@ -441,14 +444,14 @@ LWBackEnd<Impl>::regStats()
// -> The standard deviation is computed only over cycles where
// we reached the BW limit
//
- commit_eligible
+ commitEligible
.init(cpu->number_of_threads)
.name(name() + ".COM:bw_limited")
.desc("number of insts not committed due to BW limits")
.flags(total)
;
- commit_eligible_samples
+ commitEligibleSamples
.name(name() + ".COM:bw_lim_events")
.desc("number cycles where commit BW limit reached")
;
@@ -465,37 +468,38 @@ LWBackEnd<Impl>::regStats()
.desc("Number of instructions removed from inst list when they reached the head of the ROB")
;
- ROB_fcount
+ ROBFcount
.name(name() + ".ROB:full_count")
.desc("number of cycles where ROB was full")
;
- ROB_count
+ ROBCount
.init(cpu->number_of_threads)
.name(name() + ".ROB:occupancy")
.desc(name() + ".ROB occupancy (cumulative)")
.flags(total)
;
- ROB_full_rate
+ ROBFullRate
.name(name() + ".ROB:full_rate")
.desc("ROB full per cycle")
;
- ROB_full_rate = ROB_fcount / cpu->numCycles;
+ ROBFullRate = ROBFcount / cpu->numCycles;
- ROB_occ_rate
+ ROBOccRate
.name(name() + ".ROB:occ_rate")
.desc("ROB occupancy rate")
.flags(total)
;
- ROB_occ_rate = ROB_count / cpu->numCycles;
-
- ROB_occ_dist
+ ROBOccRate = ROBCount / cpu->numCycles;
+/*
+ ROBOccDist
.init(cpu->number_of_threads,0,numROBEntries,2)
.name(name() + ".ROB:occ_dist")
.desc("ROB Occupancy per cycle")
.flags(total | cdf)
;
+*/
}
template <class Impl>
@@ -588,17 +592,21 @@ LWBackEnd<Impl>::tick()
{
DPRINTF(BE, "Ticking back end\n");
+ // Read in any done instruction information and update the IQ or LSQ.
+ updateStructures();
+
if (switchPending && robEmpty() && !LSQ.hasStoresToWB()) {
cpu->signalSwitched();
return;
}
- ROB_count[0]+= numInsts;
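+    // Instructions whose modeled writeback delay has elapsed become
+    // eligible to commit this cycle.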
+ readyInstsForCommit();
- wbCycle = 0;
+ numInstsToWB.advance();
- // Read in any done instruction information and update the IQ or LSQ.
- updateStructures();
+ ROBCount[0]+= numInsts;
+
+ wbCycle = 0;
#if FULL_SYSTEM
checkInterrupts();
@@ -674,6 +682,10 @@ LWBackEnd<Impl>::dispatchInsts()
while (numInsts < numROBEntries &&
numWaitingMemOps < maxOutstandingMemOps) {
// Get instruction from front of time buffer
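+        // Stall dispatch when the LSQ is full, if lsqLimits is enabled.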
+ if (lsqLimits && LSQ.isFull()) {
+ break;
+ }
+
DynInstPtr inst = frontEnd->getInst();
if (!inst) {
break;
@@ -732,6 +744,7 @@ LWBackEnd<Impl>::dispatchInsts()
inst->setIssued();
inst->setExecuted();
inst->setCanCommit();
+ numInstsToWB[0]++;
} else {
DPRINTF(BE, "Instruction [sn:%lli] ready, addding to "
"exeList.\n",
@@ -866,8 +879,17 @@ LWBackEnd<Impl>::executeInsts()
if (inst->isLoad()) {
LSQ.executeLoad(inst);
} else if (inst->isStore()) {
- LSQ.executeStore(inst);
- if (inst->req && !(inst->req->getFlags() & LOCKED)) {
+ Fault fault = LSQ.executeStore(inst);
+
+ if (!inst->isStoreConditional() && fault == NoFault) {
+ inst->setExecuted();
+
+ instToCommit(inst);
+ } else if (fault != NoFault) {
+ // If the instruction faulted, it still needs to be sent along to
+ // commit, without being marked as completed, so that the fault
+ // can be handled there.
inst->setExecuted();
instToCommit(inst);
@@ -908,36 +930,54 @@ LWBackEnd<Impl>::executeInsts()
}
}
- issued_ops[0]+= num_executed;
- n_issued_dist[num_executed]++;
+ issuedOps[0]+= num_executed;
+ nIssuedDist[num_executed]++;
}
template<class Impl>
void
LWBackEnd<Impl>::instToCommit(DynInstPtr &inst)
{
-
DPRINTF(BE, "Sending instructions to commit [sn:%lli] PC %#x.\n",
inst->seqNum, inst->readPC());
if (!inst->isSquashed()) {
- DPRINTF(BE, "Writing back instruction [sn:%lli] PC %#x.\n",
- inst->seqNum, inst->readPC());
-
- inst->setCanCommit();
-
if (inst->isExecuted()) {
inst->setResultReady();
int dependents = wakeDependents(inst);
if (dependents) {
- producer_inst[0]++;
- consumer_inst[0]+= dependents;
+ producerInst[0]++;
+ consumerInst[0]+= dependents;
}
}
}
- writeback_count[0]++;
+ writeback.push_back(inst);
+
+ numInstsToWB[0]++;
+
+ writebackCount[0]++;
+}
+
+template <class Impl>
+void
+LWBackEnd<Impl>::readyInstsForCommit()
+{
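+    // Drain up to numInstsToWB[-latency] instructions from the writeback
+    // list, marking the non-squashed ones as able to commit.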
+ for (int i = numInstsToWB[-latency];
+ !writeback.empty() && i;
+ --i)
+ {
+ DynInstPtr inst = writeback.front();
+ writeback.pop_front();
+ if (!inst->isSquashed()) {
+ DPRINTF(BE, "Writing back instruction [sn:%lli] PC %#x.\n",
+ inst->seqNum, inst->readPC());
+
+ inst->setCanCommit();
+ }
+ }
}
+
#if 0
template <class Impl>
void
@@ -1010,7 +1050,7 @@ LWBackEnd<Impl>::commitInst(int inst_num)
// or store inst. Signal backwards that it should be executed.
if (!inst->isExecuted()) {
if (inst->isNonSpeculative() ||
- inst->isStoreConditional() ||
+ (inst->isStoreConditional() && inst->getFault() == NoFault) ||
inst->isMemBarrier() ||
inst->isWriteBarrier()) {
#if !FULL_SYSTEM
@@ -1151,6 +1191,20 @@ LWBackEnd<Impl>::commitInst(int inst_num)
++freed_regs;
}
+#if FULL_SYSTEM
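+    // Sample the kernel function profile at commit time.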
+ if (thread->profile) {
+// bool usermode =
+// (xc->readMiscReg(AlphaISA::IPR_DTB_CM) & 0x18) != 0;
+// thread->profilePC = usermode ? 1 : inst->readPC();
+ thread->profilePC = inst->readPC();
+ ProfileNode *node = thread->profile->consume(thread->getTC(),
+ inst->staticInst);
+
+ if (node)
+ thread->profileNode = node;
+ }
+#endif
+
if (inst->traceData) {
inst->traceData->setFetchSeq(inst->seqNum);
inst->traceData->setCPSeq(thread->numInst);
@@ -1158,6 +1212,9 @@ LWBackEnd<Impl>::commitInst(int inst_num)
inst->traceData = NULL;
}
+ if (inst->isCopy())
+ panic("Should not commit any copy instructions!");
+
inst->clearDependents();
frontEnd->addFreeRegs(freed_regs);
@@ -1207,9 +1264,9 @@ LWBackEnd<Impl>::commitInsts()
while (!instList.empty() && inst_num < commitWidth) {
if (instList.back()->isSquashed()) {
instList.back()->clearDependents();
+ ROBSquashedInsts[instList.back()->threadNumber]++;
instList.pop_back();
--numInsts;
- ROBSquashedInsts[instList.back()->threadNumber]++;
continue;
}
@@ -1221,7 +1278,7 @@ LWBackEnd<Impl>::commitInsts()
break;
}
}
- n_committed_dist.sample(inst_num);
+ nCommittedDist.sample(inst_num);
}
template <class Impl>
@@ -1231,10 +1288,10 @@ LWBackEnd<Impl>::squash(const InstSeqNum &sn)
LSQ.squash(sn);
int freed_regs = 0;
- InstListIt waiting_list_end = waitingList.end();
+ InstListIt insts_end_it = waitingList.end();
InstListIt insts_it = waitingList.begin();
- while (insts_it != waiting_list_end && (*insts_it)->seqNum > sn)
+ while (insts_it != insts_end_it && (*insts_it)->seqNum > sn)
{
if ((*insts_it)->isSquashed()) {
++insts_it;
@@ -1260,6 +1317,7 @@ LWBackEnd<Impl>::squash(const InstSeqNum &sn)
while (!instList.empty() && (*insts_it)->seqNum > sn)
{
if ((*insts_it)->isSquashed()) {
+ panic("Instruction should not be already squashed and on list!");
++insts_it;
continue;
}
@@ -1291,18 +1349,6 @@ LWBackEnd<Impl>::squash(const InstSeqNum &sn)
--numInsts;
}
- insts_it = waitingList.begin();
- while (!waitingList.empty() && insts_it != waitingList.end()) {
- if ((*insts_it)->seqNum < sn) {
- ++insts_it;
- continue;
- }
- assert((*insts_it)->isSquashed());
-
- waitingList.erase(insts_it++);
- waitingInsts--;
- }
-
while (memBarrier && memBarrier->seqNum > sn) {
DPRINTF(BE, "[sn:%lli] Memory barrier squashed (or previously "
"squashed)\n", memBarrier->seqNum);
@@ -1320,6 +1366,18 @@ LWBackEnd<Impl>::squash(const InstSeqNum &sn)
}
}
+ insts_it = replayList.begin();
+ insts_end_it = replayList.end();
+ while (!replayList.empty() && insts_it != insts_end_it) {
+ if ((*insts_it)->seqNum < sn) {
+ ++insts_it;
+ continue;
+ }
+ assert((*insts_it)->isSquashed());
+
+ replayList.erase(insts_it++);
+ }
+
frontEnd->addFreeRegs(freed_regs);
}
@@ -1392,14 +1450,6 @@ LWBackEnd<Impl>::squashDueToMemBlocked(DynInstPtr &inst)
template <class Impl>
void
-LWBackEnd<Impl>::fetchFault(Fault &fault)
-{
- faultFromFetch = fault;
- fetchHasFault = true;
-}
-
-template <class Impl>
-void
LWBackEnd<Impl>::switchOut()
{
switchPending = true;
@@ -1416,17 +1466,25 @@ LWBackEnd<Impl>::doSwitchOut()
// yet written back.
assert(robEmpty());
assert(!LSQ.hasStoresToWB());
-
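+    // Clear any pending writebacks and advance the time buffer past
+    // stale entries before switching out.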
+ writeback.clear();
+ for (int i = 0; i < numInstsToWB.getSize() + 1; ++i)
+ numInstsToWB.advance();
+
+// squash(0);
+ assert(waitingList.empty());
+ assert(instList.empty());
+ assert(replayList.empty());
+ assert(writeback.empty());
LSQ.switchOut();
-
- squash(0);
}
template <class Impl>
void
LWBackEnd<Impl>::takeOverFrom(ThreadContext *old_tc)
{
- switchedOut = false;
+ assert(!squashPending);
+ squashSeqNum = 0;
+ squashNextPC = 0;
tcSquash = false;
trapSquash = false;
@@ -1451,27 +1509,27 @@ LWBackEnd<Impl>::updateExeInstStats(DynInstPtr &inst)
//
#ifdef TARGET_ALPHA
if (inst->isDataPrefetch())
- exe_swp[thread_number]++;
+ exeSwp[thread_number]++;
else
- exe_inst[thread_number]++;
+ exeInst[thread_number]++;
#else
- exe_inst[thread_number]++;
+ exeInst[thread_number]++;
#endif
//
// Control operations
//
if (inst->isControl())
- exe_branches[thread_number]++;
+ exeBranches[thread_number]++;
//
// Memory operations
//
if (inst->isMemRef()) {
- exe_refs[thread_number]++;
+ exeRefs[thread_number]++;
if (inst->isLoad())
- exe_loads[thread_number]++;
+ exeLoads[thread_number]++;
}
}
@@ -1491,33 +1549,33 @@ LWBackEnd<Impl>::updateComInstStats(DynInstPtr &inst)
//
#ifdef TARGET_ALPHA
if (inst->isDataPrefetch()) {
- stat_com_swp[tid]++;
+ statComSwp[tid]++;
} else {
- stat_com_inst[tid]++;
+ statComInst[tid]++;
}
#else
- stat_com_inst[tid]++;
+ statComInst[tid]++;
#endif
//
// Control Instructions
//
if (inst->isControl())
- stat_com_branches[tid]++;
+ statComBranches[tid]++;
//
// Memory references
//
if (inst->isMemRef()) {
- stat_com_refs[tid]++;
+ statComRefs[tid]++;
if (inst->isLoad()) {
- stat_com_loads[tid]++;
+ statComLoads[tid]++;
}
}
if (inst->isMemBarrier()) {
- stat_com_membars[tid]++;
+ statComMembars[tid]++;
}
}
@@ -1569,6 +1627,45 @@ LWBackEnd<Impl>::dumpInsts()
++num;
}
+ inst_list_it = --(writeback.end());
+
+ cprintf("Writeback list size: %i\n", writeback.size());
+
+ while (inst_list_it != writeback.end())
+ {
+ cprintf("Instruction:%i\n",
+ num);
+ if (!(*inst_list_it)->isSquashed()) {
+ if (!(*inst_list_it)->isIssued()) {
+ ++valid_num;
+ cprintf("Count:%i\n", valid_num);
+ } else if ((*inst_list_it)->isMemRef() &&
+ !(*inst_list_it)->memOpDone) {
+ // Loads that have not been marked as executed still count
+ // towards the total instructions.
+ ++valid_num;
+ cprintf("Count:%i\n", valid_num);
+ }
+ }
+
+ cprintf("PC:%#x\n[sn:%lli]\n[tid:%i]\n"
+ "Issued:%i\nSquashed:%i\n",
+ (*inst_list_it)->readPC(),
+ (*inst_list_it)->seqNum,
+ (*inst_list_it)->threadNumber,
+ (*inst_list_it)->isIssued(),
+ (*inst_list_it)->isSquashed());
+
+ if ((*inst_list_it)->isMemRef()) {
+ cprintf("MemOpDone:%i\n", (*inst_list_it)->memOpDone);
+ }
+
+ cprintf("\n");
+
+ inst_list_it--;
+ ++num;
+ }
+
cprintf("Waiting list size: %i\n", waitingList.size());
inst_list_it = --(waitingList.end());
diff --git a/src/cpu/ozone/lw_lsq.hh b/src/cpu/ozone/lw_lsq.hh
index 9a21a9d01..6640a9f34 100644
--- a/src/cpu/ozone/lw_lsq.hh
+++ b/src/cpu/ozone/lw_lsq.hh
@@ -84,6 +84,8 @@ class OzoneLWLSQ {
/** Returns the name of the LSQ unit. */
std::string name() const;
+ void regStats();
+
/** Sets the CPU pointer. */
void setCPU(OzoneCPU *cpu_ptr);
@@ -179,7 +181,7 @@ class OzoneLWLSQ {
int numLoads() { return loads; }
/** Returns the number of stores in the SQ. */
- int numStores() { return stores; }
+ int numStores() { return stores + storesInFlight; }
/** Returns if either the LQ or SQ is full. */
bool isFull() { return lqFull() || sqFull(); }
@@ -188,7 +190,7 @@ class OzoneLWLSQ {
bool lqFull() { return loads >= (LQEntries - 1); }
/** Returns if the SQ is full. */
- bool sqFull() { return stores >= (SQEntries - 1); }
+ bool sqFull() { return (stores + storesInFlight) >= (SQEntries - 1); }
/** Debugging function to dump instructions in the LSQ. */
void dumpInsts();
@@ -223,7 +225,9 @@ class OzoneLWLSQ {
void storePostSend(Packet *pkt, DynInstPtr &inst);
/** Completes the store at the specified index. */
- void completeStore(int store_idx);
+ void completeStore(DynInstPtr &inst);
+
+ void removeStore(int store_idx);
/** Handles doing the retry. */
void recvRetry();
@@ -394,6 +398,10 @@ class OzoneLWLSQ {
int storesToWB;
+ public:
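+    /** Stores that have been sent to memory but have not yet completed. */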
+ int storesInFlight;
+
+ private:
/// @todo Consider moving to a more advanced model with write vs read ports
/** The number of cache ports available each cycle. */
int cachePorts;
@@ -403,6 +411,9 @@ class OzoneLWLSQ {
//list<InstSeqNum> mshrSeqNums;
+ /** Total number of memory ordering violations. */
+ Stats::Scalar<> lsqMemOrderViolation;
+
//Stats::Scalar<> dcacheStallCycles;
Counter lastDcacheStall;
@@ -525,7 +536,7 @@ OzoneLWLSQ<Impl>::read(RequestPtr req, T &data, int load_idx)
store_size = (*sq_it).size;
- if (store_size == 0) {
+ if (store_size == 0 || (*sq_it).committed) {
sq_it++;
continue;
}
diff --git a/src/cpu/ozone/lw_lsq_impl.hh b/src/cpu/ozone/lw_lsq_impl.hh
index 7eef4b11f..4c96ad149 100644
--- a/src/cpu/ozone/lw_lsq_impl.hh
+++ b/src/cpu/ozone/lw_lsq_impl.hh
@@ -121,7 +121,7 @@ OzoneLWLSQ<Impl>::completeDataAccess(PacketPtr pkt)
}
if (inst->isStore()) {
- completeStore(state->idx);
+ completeStore(inst);
}
}
@@ -132,7 +132,7 @@ OzoneLWLSQ<Impl>::completeDataAccess(PacketPtr pkt)
template <class Impl>
OzoneLWLSQ<Impl>::OzoneLWLSQ()
: switchedOut(false), dcachePort(this), loads(0), stores(0),
- storesToWB(0), stalled(false), isStoreBlocked(false),
+ storesToWB(0), storesInFlight(0), stalled(false), isStoreBlocked(false),
isLoadBlocked(false), loadBlockedHandled(false)
{
}
@@ -173,6 +173,15 @@ OzoneLWLSQ<Impl>::name() const
template<class Impl>
void
+OzoneLWLSQ<Impl>::regStats()
+{
+ lsqMemOrderViolation
+ .name(name() + ".memOrderViolation")
+ .desc("Number of memory ordering violations");
+}
+
+template<class Impl>
+void
OzoneLWLSQ<Impl>::setCPU(OzoneCPU *cpu_ptr)
{
cpu = cpu_ptr;
@@ -321,7 +330,7 @@ unsigned
OzoneLWLSQ<Impl>::numFreeEntries()
{
unsigned free_lq_entries = LQEntries - loads;
- unsigned free_sq_entries = SQEntries - stores;
+ unsigned free_sq_entries = SQEntries - (stores + storesInFlight);
// Both the LQ and SQ entries have an extra dummy entry to differentiate
// empty/full conditions. Subtract 1 from the free entries.
@@ -385,6 +394,9 @@ OzoneLWLSQ<Impl>::executeLoad(DynInstPtr &inst)
// Actually probably want the oldest faulting load
if (load_fault != NoFault) {
DPRINTF(OzoneLSQ, "Load [sn:%lli] has a fault\n", inst->seqNum);
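+        // Mark the faulting load as executed unless it is an uncacheable
+        // access that has not yet reached commit.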
+ if (!((inst->req->getFlags() & UNCACHEABLE) && !inst->isAtCommit())) {
+ inst->setExecuted();
+ }
// Maybe just set it as can commit here, although that might cause
// some other problems with sending traps to the ROB too quickly.
be->instToCommit(inst);
@@ -461,6 +473,7 @@ OzoneLWLSQ<Impl>::executeStore(DynInstPtr &store_inst)
// A load incorrectly passed this store. Squash and refetch.
// For now return a fault to show that it was unsuccessful.
memDepViolator = (*lq_it);
+ ++lsqMemOrderViolation;
return TheISA::genMachineCheckFault();
}
@@ -553,8 +566,8 @@ OzoneLWLSQ<Impl>::writebackStores()
if ((*sq_it).size == 0 && !(*sq_it).completed) {
sq_it--;
- completeStore(inst->sqIdx);
-
+ removeStore(inst->sqIdx);
+ completeStore(inst);
continue;
}
@@ -626,6 +639,8 @@ OzoneLWLSQ<Impl>::writebackStores()
inst->sqIdx,inst->readPC(),
req->paddr, *(req->data),
inst->seqNum);
+ DPRINTF(OzoneLSQ, "StoresInFlight: %i\n",
+ storesInFlight + 1);
if (dcacheInterface) {
assert(!req->completionEvent);
@@ -687,6 +702,8 @@ OzoneLWLSQ<Impl>::writebackStores()
}
sq_it--;
}
+ ++storesInFlight;
+// removeStore(inst->sqIdx);
} else {
panic("Must HAVE DCACHE!!!!!\n");
}
@@ -704,7 +721,7 @@ void
OzoneLWLSQ<Impl>::squash(const InstSeqNum &squashed_num)
{
DPRINTF(OzoneLSQ, "Squashing until [sn:%lli]!"
- "(Loads:%i Stores:%i)\n",squashed_num,loads,stores);
+ "(Loads:%i Stores:%i)\n",squashed_num,loads,stores+storesInFlight);
LQIt lq_it = loadQueue.begin();
@@ -881,7 +898,7 @@ OzoneLWLSQ<Impl>::writeback(DynInstPtr &inst, PacketPtr pkt)
template <class Impl>
void
-OzoneLWLSQ<Impl>::completeStore(int store_idx)
+OzoneLWLSQ<Impl>::removeStore(int store_idx)
{
SQHashIt sq_hash_it = SQItHash.find(store_idx);
assert(sq_hash_it != SQItHash.end());
@@ -891,8 +908,6 @@ OzoneLWLSQ<Impl>::completeStore(int store_idx)
(*sq_it).completed = true;
DynInstPtr inst = (*sq_it).inst;
- --storesToWB;
-
if (isStalled() &&
inst->seqNum == stallingStoreIsn) {
DPRINTF(OzoneLSQ, "Unstalling, stalling store [sn:%lli] "
@@ -910,6 +925,13 @@ OzoneLWLSQ<Impl>::completeStore(int store_idx)
SQItHash.erase(sq_hash_it);
SQIndices.push(inst->sqIdx);
storeQueue.erase(sq_it);
+}
+
+template <class Impl>
+void
+OzoneLWLSQ<Impl>::completeStore(DynInstPtr &inst)
+{
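+    // The store's memory access has completed; update counts and mark
+    // the instruction completed.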
+ --storesToWB;
--stores;
inst->setCompleted();
@@ -935,9 +957,14 @@ OzoneLWLSQ<Impl>::switchOut()
switchedOut = true;
// Clear the queue to free up resources
+ assert(stores == 0);
+ assert(storeQueue.empty());
+ assert(loads == 0);
+ assert(loadQueue.empty());
+ assert(storesInFlight == 0);
storeQueue.clear();
loadQueue.clear();
- loads = stores = storesToWB = 0;
+ loads = stores = storesToWB = storesInFlight = 0;
}
template <class Impl>
diff --git a/src/cpu/ozone/simple_params.hh b/src/cpu/ozone/simple_params.hh
index 11cee716f..3f63d2e1d 100644
--- a/src/cpu/ozone/simple_params.hh
+++ b/src/cpu/ozone/simple_params.hh
@@ -71,10 +71,11 @@ class SimpleParams : public BaseCPU::Params
unsigned cachePorts;
unsigned width;
+ unsigned frontEndLatency;
unsigned frontEndWidth;
+ unsigned backEndLatency;
unsigned backEndWidth;
unsigned backEndSquashLatency;
- unsigned backEndLatency;
unsigned maxInstBufferSize;
unsigned numPhysicalRegs;
unsigned maxOutstandingMemOps;
@@ -150,6 +151,7 @@ class SimpleParams : public BaseCPU::Params
//
unsigned LQEntries;
unsigned SQEntries;
+ bool lsqLimits;
//
// Memory dependence
diff --git a/src/cpu/ozone/thread_state.hh b/src/cpu/ozone/thread_state.hh
index 8234cf938..c86f3552e 100644
--- a/src/cpu/ozone/thread_state.hh
+++ b/src/cpu/ozone/thread_state.hh
@@ -34,9 +34,12 @@
#include "arch/faults.hh"
#include "arch/types.hh"
#include "arch/regfile.hh"
+#include "base/callback.hh"
+#include "base/output.hh"
#include "cpu/thread_context.hh"
#include "cpu/thread_state.hh"
#include "sim/process.hh"
+#include "sim/sim_exit.hh"
class Event;
//class Process;
@@ -65,8 +68,21 @@ struct OzoneThreadState : public ThreadState {
#if FULL_SYSTEM
OzoneThreadState(CPUType *_cpu, int _thread_num)
: ThreadState(-1, _thread_num),
- intrflag(0), inSyscall(0), trapPending(0)
+ intrflag(0), cpu(_cpu), inSyscall(0), trapPending(0)
{
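+        // Set up kernel function profiling if requested, and dump the
+        // profile at simulator exit.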
+ if (cpu->params->profile) {
+ profile = new FunctionProfile(cpu->params->system->kernelSymtab);
+ Callback *cb =
+ new MakeCallback<OzoneThreadState,
+ &OzoneThreadState::dumpFuncProfile>(this);
+ registerExitCallback(cb);
+ }
+
+ // let's fill with a dummy node for now so we don't get a segfault
+ // on the first cycle when there's no node available.
+ static ProfileNode dummyNode;
+ profileNode = &dummyNode;
+ profilePC = 3;
miscRegFile.clear();
}
#else
@@ -130,6 +146,14 @@ struct OzoneThreadState : public ThreadState {
void setNextPC(uint64_t val)
{ nextPC = val; }
+
+#if FULL_SYSTEM
+ void dumpFuncProfile()
+ {
+ std::ostream *os = simout.create(csprintf("profile.%s.dat", cpu->name()));
+ profile->dump(tc, *os);
+ }
+#endif
};
#endif // __CPU_OZONE_THREAD_STATE_HH__
diff --git a/src/cpu/simple/atomic.cc b/src/cpu/simple/atomic.cc
index c396f5033..88698bfee 100644
--- a/src/cpu/simple/atomic.cc
+++ b/src/cpu/simple/atomic.cc
@@ -161,9 +161,9 @@ AtomicSimpleCPU::serialize(ostream &os)
{
SimObject::State so_state = SimObject::getState();
SERIALIZE_ENUM(so_state);
+ BaseSimpleCPU::serialize(os);
nameOut(os, csprintf("%s.tickEvent", name()));
tickEvent.serialize(os);
- BaseSimpleCPU::serialize(os);
}
void
@@ -171,8 +171,8 @@ AtomicSimpleCPU::unserialize(Checkpoint *cp, const string &section)
{
SimObject::State so_state;
UNSERIALIZE_ENUM(so_state);
- tickEvent.unserialize(cp, csprintf("%s.tickEvent", section));
BaseSimpleCPU::unserialize(cp, section);
+ tickEvent.unserialize(cp, csprintf("%s.tickEvent", section));
}
void
@@ -464,6 +464,7 @@ BEGIN_DECLARE_SIM_OBJECT_PARAMS(AtomicSimpleCPU)
Param<Counter> max_insts_all_threads;
Param<Counter> max_loads_any_thread;
Param<Counter> max_loads_all_threads;
+ Param<Tick> progress_interval;
SimObjectParam<MemObject *> mem;
SimObjectParam<System *> system;
@@ -496,6 +497,7 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(AtomicSimpleCPU)
"terminate when any thread reaches this load count"),
INIT_PARAM(max_loads_all_threads,
"terminate when all threads have reached this load count"),
+ INIT_PARAM(progress_interval, "Progress interval"),
INIT_PARAM(mem, "memory"),
INIT_PARAM(system, "system object"),
@@ -527,6 +529,7 @@ CREATE_SIM_OBJECT(AtomicSimpleCPU)
params->max_insts_all_threads = max_insts_all_threads;
params->max_loads_any_thread = max_loads_any_thread;
params->max_loads_all_threads = max_loads_all_threads;
+ params->progress_interval = progress_interval;
params->deferRegistration = defer_registration;
params->clock = clock;
params->functionTrace = function_trace;
diff --git a/src/cpu/simple/base.cc b/src/cpu/simple/base.cc
index f801b93fa..522fe79aa 100644
--- a/src/cpu/simple/base.cc
+++ b/src/cpu/simple/base.cc
@@ -170,7 +170,7 @@ BaseSimpleCPU::regStats()
void
BaseSimpleCPU::resetStats()
{
- startNumInst = numInst;
+// startNumInst = numInst;
// notIdleFraction = (_status != Idle);
}
diff --git a/src/cpu/simple/timing.cc b/src/cpu/simple/timing.cc
index 5c1654f7e..03ee27e04 100644
--- a/src/cpu/simple/timing.cc
+++ b/src/cpu/simple/timing.cc
@@ -589,6 +589,7 @@ BEGIN_DECLARE_SIM_OBJECT_PARAMS(TimingSimpleCPU)
Param<Counter> max_insts_all_threads;
Param<Counter> max_loads_any_thread;
Param<Counter> max_loads_all_threads;
+ Param<Tick> progress_interval;
SimObjectParam<MemObject *> mem;
SimObjectParam<System *> system;
@@ -621,6 +622,7 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(TimingSimpleCPU)
"terminate when any thread reaches this load count"),
INIT_PARAM(max_loads_all_threads,
"terminate when all threads have reached this load count"),
+ INIT_PARAM(progress_interval, "Progress interval"),
INIT_PARAM(mem, "memory"),
INIT_PARAM(system, "system object"),
@@ -652,6 +654,7 @@ CREATE_SIM_OBJECT(TimingSimpleCPU)
params->max_insts_all_threads = max_insts_all_threads;
params->max_loads_any_thread = max_loads_any_thread;
params->max_loads_all_threads = max_loads_all_threads;
+ params->progress_interval = progress_interval;
params->deferRegistration = defer_registration;
params->clock = clock;
params->functionTrace = function_trace;
diff --git a/src/cpu/simple_thread.cc b/src/cpu/simple_thread.cc
index 5f86cf2b7..4fc47c982 100644
--- a/src/cpu/simple_thread.cc
+++ b/src/cpu/simple_thread.cc
@@ -162,6 +162,11 @@ SimpleThread::takeOverFrom(ThreadContext *oldContext)
if (quiesceEvent) {
quiesceEvent->tc = tc;
}
+
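+    // Carry the kernel statistics over from the old context so the
+    // counts persist across the CPU switch.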
+ Kernel::Statistics *stats = oldContext->getKernelStats();
+ if (stats) {
+ kernelStats = stats;
+ }
#endif
storeCondFailures = 0;
diff --git a/src/cpu/thread_state.hh b/src/cpu/thread_state.hh
index 6e985054f..14e033b7f 100644
--- a/src/cpu/thread_state.hh
+++ b/src/cpu/thread_state.hh
@@ -32,6 +32,7 @@
#define __CPU_THREAD_STATE_HH__
#include "arch/types.hh"
+#include "cpu/profile.hh"
#include "cpu/thread_context.hh"
#if !FULL_SYSTEM
@@ -191,6 +192,7 @@ struct ThreadState {
// simulation only; all functional memory accesses should use
// one of the FunctionalMemory pointers above.
short asid;
+
#endif
/** Current instruction the thread is committing. Only set and