37 files changed, 685 insertions, 574 deletions
diff --git a/base/timebuf.hh b/base/timebuf.hh
index f6b5b2781..db34528d8 100644
--- a/base/timebuf.hh
+++ b/base/timebuf.hh
@@ -212,6 +212,11 @@ class TimeBuffer
     {
         return wire(this, 0);
     }
+
+    /**
+     * Returns the size of this time buffer, as held in the size member.
+     */
+    int getSize() const { return size; }
 };
 
 #endif // __BASE_TIMEBUF_HH__
diff --git a/cpu/base_dyn_inst.cc b/cpu/base_dyn_inst.cc
index 64a995689..1a52279cc 100644
--- a/cpu/base_dyn_inst.cc
+++ b/cpu/base_dyn_inst.cc
@@ -100,32 +100,15 @@ BaseDynInst<Impl>::initVars()
 
     readyRegs = 0;
 
-    completed = false;
-    resultReady = false;
-    canIssue = false;
-    issued = false;
-    executed = false;
-    canCommit = false;
-    committed = false;
-    squashed = false;
-    squashedInIQ = false;
-    squashedInLSQ = false;
-    squashedInROB = false;
+    instResult.integer = 0;
+
+    status.reset();
+
     eaCalcDone = false;
     memOpDone = false;
+
     lqIdx = -1;
     sqIdx = -1;
-    reachedCommit = false;
-
-    blockingInst = false;
-    recoverInst = false;
-
-    iqEntry = false;
-    robEntry = false;
-
-    serializeBefore = false;
-    serializeAfter = false;
-    serializeHandled = false;
 
     // Eventually make this a parameter.
     threadNumber = 0;
@@ -395,7 +378,7 @@ void
 BaseDynInst<Impl>::markSrcRegReady()
 {
     if (++readyRegs == numSrcRegs()) {
-        canIssue = true;
+        status.set(CanIssue);
     }
 }
 
@@ -403,13 +386,9 @@ template <class Impl>
 void
 BaseDynInst<Impl>::markSrcRegReady(RegIndex src_idx)
 {
-    ++readyRegs;
-
     _readySrcRegIdx[src_idx] = true;
 
-    if (readyRegs == numSrcRegs()) {
-        canIssue = true;
-    }
+    markSrcRegReady();
 }
 
 template <class Impl>
diff --git a/cpu/base_dyn_inst.hh b/cpu/base_dyn_inst.hh
index 388ea4a8d..01f6be185 100644
--- a/cpu/base_dyn_inst.hh
+++ b/cpu/base_dyn_inst.hh
@@ -127,56 +127,34 @@ class BaseDynInst : public FastAlloc, public RefCounted
     /** The sequence number of the instruction. */
     InstSeqNum seqNum;
 
-    /** Is the instruction in the IQ */
-    bool iqEntry;
-
-    /** Is the instruction in the ROB */
-    bool robEntry;
-
-    /** Is the instruction in the LSQ */
-    bool lsqEntry;
-
-    /** Is the instruction completed. */
-    bool completed;
-
-    /** Is the instruction's result ready. */
-    bool resultReady;
-
-    /** Can this instruction issue. */
-    bool canIssue;
-
-    /** Has this instruction issued. */
-    bool issued;
-
-    /** Has this instruction executed (or made it through execute) yet. */
-    bool executed;
-
-    /** Can this instruction commit. */
-    bool canCommit;
-
-    /** Is this instruction committed. */
-    bool committed;
-
-    /** Is this instruction squashed. */
-    bool squashed;
-
-    /** Is this instruction squashed in the instruction queue. */
-    bool squashedInIQ;
-
-    /** Is this instruction squashed in the instruction queue. */
-    bool squashedInLSQ;
-
-    /** Is this instruction squashed in the instruction queue. */
-    bool squashedInROB;
-
-    /** Is this a recover instruction. */
-    bool recoverInst;
-
-    /** Is this a thread blocking instruction. */
-    bool blockingInst;	/* this inst has called thread_block() */
+    enum Status {
+        IqEntry,                 ///< Instruction is in the IQ
+        RobEntry,                ///< Instruction is in the ROB
+        LsqEntry,                ///< Instruction is in the LSQ
+        Completed,               ///< Instruction has completed
+        ResultReady,             ///< Instruction has its result
+        CanIssue,                ///< Instruction can issue and execute
+        Issued,                  ///< Instruction has issued
+        Executed,                ///< Instruction has executed
+        CanCommit,               ///< Instruction can commit
+        AtCommit,                ///< Instruction has reached commit
+        Committed,               ///< Instruction has committed
+        Squashed,                ///< Instruction is squashed
+        SquashedInIQ,            ///< Instruction is squashed in the IQ
+        SquashedInLSQ,           ///< Instruction is squashed in the LSQ
+        SquashedInROB,           ///< Instruction is squashed in the ROB
+        RecoverInst,             ///< Is a recover instruction
+        BlockingInst,            ///< Is a blocking instruction
+        ThreadsyncWait,          ///< Is a thread synchronization instruction
+        SerializeBefore,         ///< Needs to serialize on
+                                 ///< instructions ahead of it
+        SerializeAfter,          ///< Needs to serialize instructions behind it
+        SerializeHandled,        ///< Serialization has been handled
+        NumStatus                ///< Number of flags; sizes the status bitset
+    };
 
-    /** Is this a thread syncrhonization instruction. */
-    bool threadsyncWait;
+    /** The status of this BaseDynInst.  Several bits can be set. */
+    std::bitset<NumStatus> status;
 
     /** The thread this instruction is from. */
     short threadNumber;
@@ -351,9 +329,9 @@ class BaseDynInst : public FastAlloc, public RefCounted
     bool isThreadSync()   const { return staticInst->isThreadSync(); }
     bool isSerializing()  const { return staticInst->isSerializing(); }
     bool isSerializeBefore() const
-    { return staticInst->isSerializeBefore() || serializeBefore; }
+    { return staticInst->isSerializeBefore() || status[SerializeBefore]; }
     bool isSerializeAfter() const
-    { return staticInst->isSerializeAfter() || serializeAfter; }
+    { return staticInst->isSerializeAfter() || status[SerializeAfter]; }
     bool isMemBarrier()   const { return staticInst->isMemBarrier(); }
     bool isWriteBarrier() const { return staticInst->isWriteBarrier(); }
     bool isNonSpeculative() const { return staticInst->isNonSpeculative(); }
@@ -362,41 +340,32 @@ class BaseDynInst : public FastAlloc, public RefCounted
     bool isUnverifiable() const { return staticInst->isUnverifiable(); }
 
     /** Temporarily sets this instruction as a serialize before instruction. */
-    void setSerializeBefore() { serializeBefore = true; }
+    void setSerializeBefore() { status.set(SerializeBefore); }
 
     /** Clears the serializeBefore part of this instruction. */
-    void clearSerializeBefore() { serializeBefore = false; }
+    void clearSerializeBefore() { status.reset(SerializeBefore); }
 
     /** Checks if this serializeBefore is only temporarily set. */
-    bool isTempSerializeBefore() { return serializeBefore; }
-
-    /** Tracks if instruction has been externally set as serializeBefore. */
-    bool serializeBefore;
+    bool isTempSerializeBefore() { return status[SerializeBefore]; }
 
     /** Temporarily sets this instruction as a serialize after instruction. */
-    void setSerializeAfter() { serializeAfter = true; }
+    void setSerializeAfter() { status.set(SerializeAfter); }
 
     /** Clears the serializeAfter part of this instruction.*/
-    void clearSerializeAfter() { serializeAfter = false; }
+    void clearSerializeAfter() { status.reset(SerializeAfter); }
 
     /** Checks if this serializeAfter is only temporarily set. */
-    bool isTempSerializeAfter() { return serializeAfter; }
+    bool isTempSerializeAfter() { return status[SerializeAfter]; }
 
-    /** Tracks if instruction has been externally set as serializeAfter. */
-    bool serializeAfter;
+    /** Sets the serialization part of this instruction as handled. */
+    void setSerializeHandled() { status.set(SerializeHandled); }
 
     /** Checks if the serialization part of this instruction has been
      *  handled.  This does not apply to the temporary serializing
      *  state; it only applies to this instruction's own permanent
      *  serializing state.
      */
-    bool isSerializeHandled() { return serializeHandled; }
-
-    /** Sets the serialization part of this instruction as handled. */
-    void setSerializeHandled() { serializeHandled = true; }
-
-    /** Whether or not the serialization of this instruction has been handled. */
-    bool serializeHandled;
+    bool isSerializeHandled() { return status[SerializeHandled]; }
 
     /** Returns the opclass of this instruction. */
     OpClass opClass() const { return staticInst->opClass(); }
@@ -463,106 +432,112 @@ class BaseDynInst : public FastAlloc, public RefCounted
     }
 
     /** Sets this instruction as completed. */
-    void setCompleted() { completed = true; }
+    void setCompleted() { status.set(Completed); }
 
     /** Returns whether or not this instruction is completed. */
-    bool isCompleted() const { return completed; }
+    bool isCompleted() const { return status[Completed]; }
 
-    void setResultReady() { resultReady = true; }
+    /** Marks the result as ready. */
+    void setResultReady() { status.set(ResultReady); }
 
-    bool isResultReady() const { return resultReady; }
+    /** Returns whether or not the result is ready. */
+    bool isResultReady() const { return status[ResultReady]; }
 
     /** Sets this instruction as ready to issue. */
-    void setCanIssue() { canIssue = true; }
+    void setCanIssue() { status.set(CanIssue); }
 
     /** Returns whether or not this instruction is ready to issue. */
-    bool readyToIssue() const { return canIssue; }
+    bool readyToIssue() const { return status[CanIssue]; }
 
     /** Sets this instruction as issued from the IQ. */
-    void setIssued() { issued = true; }
+    void setIssued() { status.set(Issued); }
 
     /** Returns whether or not this instruction has issued. */
-    bool isIssued() const { return issued; }
+    bool isIssued() const { return status[Issued]; }
 
     /** Sets this instruction as executed. */
-    void setExecuted() { executed = true; }
+    void setExecuted() { status.set(Executed); }
 
     /** Returns whether or not this instruction has executed. */
-    bool isExecuted() const { return executed; }
+    bool isExecuted() const { return status[Executed]; }
 
     /** Sets this instruction as ready to commit. */
-    void setCanCommit() { canCommit = true; }
+    void setCanCommit() { status.set(CanCommit); }
 
     /** Clears this instruction as being ready to commit. */
-    void clearCanCommit() { canCommit = false; }
+    void clearCanCommit() { status.reset(CanCommit); }
 
     /** Returns whether or not this instruction is ready to commit. */
-    bool readyToCommit() const { return canCommit; }
+    bool readyToCommit() const { return status[CanCommit]; }
+    /** Sets this instruction as having reached commit. */
+    void setAtCommit() { status.set(AtCommit); }
+    /** Returns whether or not this instruction has reached commit. */
+    bool isAtCommit() const { return status[AtCommit]; }
 
     /** Sets this instruction as committed. */
-    void setCommitted() { committed = true; }
+    void setCommitted() { status.set(Committed); }
 
     /** Returns whether or not this instruction is committed. */
-    bool isCommitted() const { return committed; }
+    bool isCommitted() const { return status[Committed]; }
 
     /** Sets this instruction as squashed. */
-    void setSquashed() { squashed = true; }
+    void setSquashed() { status.set(Squashed); }
 
     /** Returns whether or not this instruction is squashed. */
-    bool isSquashed() const { return squashed; }
+    bool isSquashed() const { return status[Squashed]; }
 
     //Instruction Queue Entry
     //-----------------------
     /** Sets this instruction as a entry the IQ. */
-    void setInIQ() { iqEntry = true; }
+    void setInIQ() { status.set(IqEntry); }
 
     /** Sets this instruction as a entry the IQ. */
-    void removeInIQ() { iqEntry = false; }
+    void clearInIQ() { status.reset(IqEntry); }
+
+    /** Returns whether or not this instruction is in the IQ. */
+    bool isInIQ() const { return status[IqEntry]; }
 
     /** Sets this instruction as squashed in the IQ. */
-    void setSquashedInIQ() { squashedInIQ = true; squashed = true;}
+    void setSquashedInIQ() { status.set(SquashedInIQ); status.set(Squashed);}
 
     /** Returns whether or not this instruction is squashed in the IQ. */
-    bool isSquashedInIQ() const { return squashedInIQ; }
-
-    /** Returns whether or not this instruction has issued. */
-    bool isInIQ() const { return iqEntry; }
+    bool isSquashedInIQ() const { return status[SquashedInIQ]; }
 
 
     //Load / Store Queue Functions
     //-----------------------
     /** Sets this instruction as a entry the LSQ. */
-    void setInLSQ() { lsqEntry = true; }
+    void setInLSQ() { status.set(LsqEntry); }
 
     /** Sets this instruction as a entry the LSQ. */
-    void removeInLSQ() { lsqEntry = false; }
+    void removeInLSQ() { status.reset(LsqEntry); }
+
+    /** Returns whether or not this instruction is in the LSQ. */
+    bool isInLSQ() const { return status[LsqEntry]; }
 
     /** Sets this instruction as squashed in the LSQ. */
-    void setSquashedInLSQ() { squashedInLSQ = true;}
+    void setSquashedInLSQ() { status.set(SquashedInLSQ);}
 
     /** Returns whether or not this instruction is squashed in the LSQ. */
-    bool isSquashedInLSQ() const { return squashedInLSQ; }
-
-    /** Returns whether or not this instruction is in the LSQ. */
-    bool isInLSQ() const { return lsqEntry; }
+    bool isSquashedInLSQ() const { return status[SquashedInLSQ]; }
 
 
     //Reorder Buffer Functions
     //-----------------------
     /** Sets this instruction as a entry the ROB. */
-    void setInROB() { robEntry = true; }
+    void setInROB() { status.set(RobEntry); }
 
     /** Sets this instruction as a entry the ROB. */
-    void removeInROB() { robEntry = false; }
+    void clearInROB() { status.reset(RobEntry); }
+
+    /** Returns whether or not this instruction is in the ROB. */
+    bool isInROB() const { return status[RobEntry]; }
 
     /** Sets this instruction as squashed in the ROB. */
-    void setSquashedInROB() { squashedInROB = true; }
+    void setSquashedInROB() { status.set(SquashedInROB); }
 
     /** Returns whether or not this instruction is squashed in the ROB. */
-    bool isSquashedInROB() const { return squashedInROB; }
-
-    /** Returns whether or not this instruction is in the ROB. */
-    bool isInROB() const { return robEntry; }
+    bool isSquashedInROB() const { return status[SquashedInROB]; }
 
     /** Read the PC of this instruction. */
     const Addr readPC() const { return PC; }
@@ -619,8 +594,6 @@ class BaseDynInst : public FastAlloc, public RefCounted
     /** Store queue index. */
     int16_t sqIdx;
 
-    bool reachedCommit;
-
     /** Iterator pointing to this BaseDynInst in the list of all insts. */
     ListIt instListIt;
 
@@ -636,7 +609,7 @@ template<class T>
 inline Fault
 BaseDynInst<Impl>::read(Addr addr, T &data, unsigned flags)
 {
-    if (executed) {
+    if (status[Executed]) {
         fault = cpu->read(req, data, lqIdx);
         return fault;
     }
diff --git a/cpu/o3/alpha_cpu_builder.cc b/cpu/o3/alpha_cpu_builder.cc
index 08d42cd46..c563fbef3 100644
--- a/cpu/o3/alpha_cpu_builder.cc
+++ b/cpu/o3/alpha_cpu_builder.cc
@@ -94,12 +94,10 @@ Param<unsigned> renameWidth;
 Param<unsigned> commitToIEWDelay;
 Param<unsigned> renameToIEWDelay;
 Param<unsigned> issueToExecuteDelay;
+Param<unsigned> dispatchWidth;
 Param<unsigned> issueWidth;
-Param<unsigned> executeWidth;
-Param<unsigned> executeIntWidth;
-Param<unsigned> executeFloatWidth;
-Param<unsigned> executeBranchWidth;
-Param<unsigned> executeMemoryWidth;
+Param<unsigned> wbWidth;
+Param<unsigned> wbDepth;
 SimObjectParam<FUPool *> fuPool;
 
 Param<unsigned> iewToCommitDelay;
@@ -109,6 +107,9 @@ Param<unsigned> squashWidth;
 Param<Tick> trapLatency;
 Param<Tick> fetchTrapLatency;
 
+Param<unsigned> backComSize;
+Param<unsigned> forwardComSize;
+
 Param<std::string> predType;
 Param<unsigned> localPredictorSize;
 Param<unsigned> localCtrBits;
@@ -219,12 +220,10 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivAlphaFullCPU)
                "Issue/Execute/Writeback delay"),
     INIT_PARAM(issueToExecuteDelay, "Issue to execute delay (internal"
                "to the IEW stage)"),
+    INIT_PARAM(dispatchWidth, "Dispatch width"),
     INIT_PARAM(issueWidth, "Issue width"),
-    INIT_PARAM(executeWidth, "Execute width"),
-    INIT_PARAM(executeIntWidth, "Integer execute width"),
-    INIT_PARAM(executeFloatWidth, "Floating point execute width"),
-    INIT_PARAM(executeBranchWidth, "Branch execute width"),
-    INIT_PARAM(executeMemoryWidth, "Memory execute width"),
+    INIT_PARAM(wbWidth, "Writeback width"),
+    INIT_PARAM(wbDepth, "Writeback depth (number of cycles it can buffer)"),
     INIT_PARAM_DFLT(fuPool, "Functional unit pool", NULL),
 
     INIT_PARAM(iewToCommitDelay, "Issue/Execute/Writeback to commit "
@@ -235,6 +234,9 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivAlphaFullCPU)
     INIT_PARAM_DFLT(trapLatency, "Number of cycles before the trap is handled", 6),
     INIT_PARAM_DFLT(fetchTrapLatency, "Number of cycles before the fetch trap is handled", 12),
 
+    INIT_PARAM(backComSize, "Time buffer size for backwards communication"),
+    INIT_PARAM(forwardComSize, "Time buffer size for forward communication"),
+
     INIT_PARAM(predType, "Type of branch predictor ('local', 'tournament')"),
     INIT_PARAM(localPredictorSize, "Size of local predictor"),
     INIT_PARAM(localCtrBits, "Bits per counter"),
@@ -353,12 +355,10 @@ CREATE_SIM_OBJECT(DerivAlphaFullCPU)
     params->commitToIEWDelay = commitToIEWDelay;
     params->renameToIEWDelay = renameToIEWDelay;
     params->issueToExecuteDelay = issueToExecuteDelay;
+    params->dispatchWidth = dispatchWidth;
     params->issueWidth = issueWidth;
-    params->executeWidth = executeWidth;
-    params->executeIntWidth = executeIntWidth;
-    params->executeFloatWidth = executeFloatWidth;
-    params->executeBranchWidth = executeBranchWidth;
-    params->executeMemoryWidth = executeMemoryWidth;
+    params->wbWidth = wbWidth;
+    params->wbDepth = wbDepth;
     params->fuPool = fuPool;
 
     params->iewToCommitDelay = iewToCommitDelay;
@@ -368,6 +368,9 @@ CREATE_SIM_OBJECT(DerivAlphaFullCPU)
     params->trapLatency = trapLatency;
     params->fetchTrapLatency = fetchTrapLatency;
 
+    params->backComSize = backComSize;
+    params->forwardComSize = forwardComSize;
+
     params->predType = predType;
     params->localPredictorSize = localPredictorSize;
     params->localCtrBits = localCtrBits;
diff --git a/cpu/o3/alpha_cpu_impl.hh b/cpu/o3/alpha_cpu_impl.hh
index f39fdf6b6..1bf0652cd 100644
--- a/cpu/o3/alpha_cpu_impl.hh
+++ b/cpu/o3/alpha_cpu_impl.hh
@@ -383,7 +383,7 @@ AlphaFullCPU<Impl>::AlphaXC::copyArchRegs(ExecContext *xc)
     }
 
     // Copy the misc regs.
-    cpu->regFile.miscRegs[tid].copyMiscRegs(xc);
+    TheISA::copyMiscRegs(xc, this);
 
     // Then finally set the PC and the next PC.
     cpu->setPC(xc->readPC(), tid);
diff --git a/cpu/o3/alpha_params.hh b/cpu/o3/alpha_params.hh
index f0836a9fd..4ab130d02 100644
--- a/cpu/o3/alpha_params.hh
+++ b/cpu/o3/alpha_params.hh
@@ -106,12 +106,10 @@ class AlphaSimpleParams : public BaseFullCPU::Params
     unsigned commitToIEWDelay;
     unsigned renameToIEWDelay;
     unsigned issueToExecuteDelay;
+    unsigned dispatchWidth;
     unsigned issueWidth;
-    unsigned executeWidth;
-    unsigned executeIntWidth;
-    unsigned executeFloatWidth;
-    unsigned executeBranchWidth;
-    unsigned executeMemoryWidth;
+    unsigned wbWidth;
+    unsigned wbDepth;
     FUPool *fuPool;
 
     //
@@ -125,6 +123,12 @@ class AlphaSimpleParams : public BaseFullCPU::Params
     Tick fetchTrapLatency;
 
     //
+    // Timebuffer sizes
+    //
+    unsigned backComSize;
+    unsigned forwardComSize;
+
+    //
     // Branch predictor (BP, BTB, RAS)
     //
     std::string predType;
diff --git a/cpu/o3/commit.hh b/cpu/o3/commit.hh
index d93822394..b153effc4 100644
--- a/cpu/o3/commit.hh
+++ b/cpu/o3/commit.hh
@@ -160,10 +160,6 @@ class DefaultCommit
     /** Sets the pointer to the queue coming from IEW. */
     void setIEWQueue(TimeBuffer<IEWStruct> *iq_ptr);
 
-    void setFetchStage(Fetch *fetch_stage);
-
-    Fetch *fetchStage;
-
     /** Sets the pointer to the IEW stage. */
     void setIEWStage(IEW *iew_stage);
 
@@ -367,11 +363,6 @@ class DefaultCommit
      */
     unsigned renameWidth;
 
-    /** IEW width, in instructions.  Used so ROB knows how many
-     *  instructions to get from the IEW instruction queue.
-     */
-    unsigned iewWidth;
-
     /** Commit width, in instructions. */
     unsigned commitWidth;
 
@@ -392,10 +383,6 @@ class DefaultCommit
      */
     Tick trapLatency;
 
-    Tick fetchTrapLatency;
-
-    Tick fetchFaultTick;
-
     /** The commit PC of each thread.  Refers to the instruction that
      * is currently being processed/committed.
      */
diff --git a/cpu/o3/commit_impl.hh b/cpu/o3/commit_impl.hh
index 798f30294..364e685c2 100644
--- a/cpu/o3/commit_impl.hh
+++ b/cpu/o3/commit_impl.hh
@@ -71,12 +71,10 @@ DefaultCommit<Impl>::DefaultCommit(Params *params)
       renameToROBDelay(params->renameToROBDelay),
       fetchToCommitDelay(params->commitToFetchDelay),
       renameWidth(params->renameWidth),
-      iewWidth(params->executeWidth),
       commitWidth(params->commitWidth),
       numThreads(params->numberOfThreads),
       switchedOut(false),
-      trapLatency(params->trapLatency),
-      fetchTrapLatency(params->fetchTrapLatency)
+      trapLatency(params->trapLatency)
 {
     _status = Active;
     _nextStatus = Inactive;
@@ -114,10 +112,8 @@ DefaultCommit<Impl>::DefaultCommit(Params *params)
         changedROBNumEntries[i] = false;
         trapSquash[i] = false;
         xcSquash[i] = false;
+        PC[i] = nextPC[i] = 0;
     }
-
-    fetchFaultTick = 0;
-    fetchTrapWait = 0;
 }
 
 template <class Impl>
@@ -240,7 +236,6 @@ DefaultCommit<Impl>::setCPU(FullCPU *cpu_ptr)
     cpu->activateStage(FullCPU::CommitIdx);
 
     trapLatency = cpu->cycles(trapLatency);
-    fetchTrapLatency = cpu->cycles(fetchTrapLatency);
 }
 
 template <class Impl>
@@ -299,13 +294,6 @@ DefaultCommit<Impl>::setIEWQueue(TimeBuffer<IEWStruct> *iq_ptr)
 
 template <class Impl>
 void
-DefaultCommit<Impl>::setFetchStage(Fetch *fetch_stage)
-{
-    fetchStage = fetch_stage;
-}
-
-template <class Impl>
-void
 DefaultCommit<Impl>::setIEWStage(IEW *iew_stage)
 {
     iewStage = iew_stage;
@@ -431,7 +419,7 @@ DefaultCommit<Impl>::setNextStatus()
         }
     }
 
-    assert(squashes == squashCounter);
+    squashCounter = squashes;
 
     // If commit is currently squashing, then it will have activity for the
     // next cycle. Set its next status as active.
@@ -536,8 +524,6 @@ DefaultCommit<Impl>::squashFromTrap(unsigned tid)
 
     commitStatus[tid] = ROBSquashing;
     cpu->activityThisCycle();
-
-    ++squashCounter;
 }
 
 template <class Impl>
@@ -555,8 +541,6 @@ DefaultCommit<Impl>::squashFromXC(unsigned tid)
     cpu->activityThisCycle();
 
     xcSquash[tid] = false;
-
-    ++squashCounter;
 }
 
 template <class Impl>
@@ -571,6 +555,9 @@ DefaultCommit<Impl>::tick()
         return;
     }
 
+    if ((*activeThreads).empty()) // Return early if no threads are active.
+        return;
+
     list<unsigned>::iterator threads = (*activeThreads).begin();
 
     // Check if any of the threads are done squashing.  Change the
@@ -582,10 +569,12 @@ DefaultCommit<Impl>::tick()
 
             if (rob->isDoneSquashing(tid)) {
                 commitStatus[tid] = Running;
-                --squashCounter;
             } else {
                 DPRINTF(Commit,"[tid:%u]: Still Squashing, cannot commit any"
-                        "insts this cycle.\n", tid);
+                        " insts this cycle.\n", tid);
+                rob->doSquash(tid);
+                toIEW->commitInfo[tid].robSquashing = true;
+                wroteToTimeBuffer = true;
             }
         }
     }
@@ -691,29 +680,7 @@ DefaultCommit<Impl>::commit()
 
     while (threads != (*activeThreads).end()) {
         unsigned tid = *threads++;
-/*
-        if (fromFetch->fetchFault && commitStatus[0] != TrapPending) {
-            // Record the fault.  Wait until it's empty in the ROB.
-            // Then handle the trap.  Ignore it if there's already a
-            // trap pending as fetch will be redirected.
-            fetchFault = fromFetch->fetchFault;
-            fetchFaultTick = curTick + fetchTrapLatency;
-            commitStatus[0] = FetchTrapPending;
-            DPRINTF(Commit, "Fault from fetch recorded.  Will trap if the "
-                    "ROB empties without squashing the fault.\n");
-            fetchTrapWait = 0;
-        }
 
-        // Fetch may tell commit to clear the trap if it's been squashed.
-        if (fromFetch->clearFetchFault) {
-            DPRINTF(Commit, "Received clear fetch fault signal\n");
-            fetchTrapWait = 0;
-            if (commitStatus[0] == FetchTrapPending) {
-                DPRINTF(Commit, "Clearing fault from fetch\n");
-                commitStatus[0] = Running;
-            }
-        }
-*/
         // Not sure which one takes priority.  I think if we have
         // both, that's a bad sign.
         if (trapSquash[tid] == true) {
@@ -741,8 +708,6 @@ DefaultCommit<Impl>::commit()
 
             commitStatus[tid] = ROBSquashing;
 
-            ++squashCounter;
-
             // If we want to include the squashing instruction in the squash,
             // then use one older sequence number.
             InstSeqNum squashed_inst = fromIEW->squashedSeqNum[tid];
@@ -944,7 +909,7 @@ DefaultCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num)
         // and committed this instruction.
         thread[tid]->funcExeInst--;
 
-        head_inst->reachedCommit = true;
+        head_inst->setAtCommit();
 
         if (head_inst->isNonSpeculative() ||
             head_inst->isStoreConditional() ||
@@ -1060,7 +1025,7 @@ DefaultCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num)
 
         // Generate trap squash event.
         generateTrapEvent(tid);
-
+//        warn("%lli fault (%d) handled @ PC %08p", curTick, inst_fault->name(), head_inst->readPC());
         return false;
 #else // !FULL_SYSTEM
         panic("fault (%d) detected @ PC %08p", inst_fault,
@@ -1083,6 +1048,9 @@ DefaultCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num)
                                  head_inst->renamedDestRegIdx(i));
     }
 
+    if (head_inst->isCopy())
+        panic("Should not commit any copy instructions!");
+
     // Finally clear the head ROB entry.
     rob->retireHead(tid);
 
diff --git a/cpu/o3/cpu.cc b/cpu/o3/cpu.cc
index 8d72bdc41..f1571e61b 100644
--- a/cpu/o3/cpu.cc
+++ b/cpu/o3/cpu.cc
@@ -108,12 +108,14 @@ FullO3CPU<Impl>::FullO3CPU(Params *params)
       // For now just have these time buffers be pretty big.
       // @todo: Make these time buffer sizes parameters or derived
       // from latencies
-      timeBuffer(5, 5),
-      fetchQueue(5, 5),
-      decodeQueue(5, 5),
-      renameQueue(5, 5),
-      iewQueue(5, 5),
-      activityRec(NumStages, 10, params->activity),
+      timeBuffer(params->backComSize, params->forwardComSize),
+      fetchQueue(params->backComSize, params->forwardComSize),
+      decodeQueue(params->backComSize, params->forwardComSize),
+      renameQueue(params->backComSize, params->forwardComSize),
+      iewQueue(params->backComSize, params->forwardComSize),
+      activityRec(NumStages,
+                  params->backComSize + params->forwardComSize,
+                  params->activity),
 
       globalSeqNum(1),
 
@@ -180,7 +182,6 @@ FullO3CPU<Impl>::FullO3CPU(Params *params)
     commit.setIEWQueue(&iewQueue);
     commit.setRenameQueue(&renameQueue);
 
-    commit.setFetchStage(&fetch);
     commit.setIEWStage(&iew);
     rename.setIEWStage(&iew);
     rename.setCommitStage(&commit);
@@ -709,7 +710,7 @@ void
 FullO3CPU<Impl>::takeOverFrom(BaseCPU *oldCPU)
 {
     // Flush out any old data from the time buffers.
-    for (int i = 0; i < 10; ++i) {
+    for (int i = 0; i < timeBuffer.getSize(); ++i) {
         timeBuffer.advance();
         fetchQueue.advance();
         decodeQueue.advance();
@@ -758,6 +759,46 @@ FullO3CPU<Impl>::takeOverFrom(BaseCPU *oldCPU)
         tickEvent.schedule(curTick);
 }
 
+/*
+template <class Impl>
+void
+FullO3CPU<Impl>::serialize(std::ostream &os)
+{
+    BaseCPU::serialize(os);
+    nameOut(os, csprintf("%s.tickEvent", name()));
+    tickEvent.serialize(os);
+
+    // Use SimpleThread's ability to checkpoint to make it easier to
+    // write out the registers.  Also make this static so it doesn't
+    // get instantiated multiple times (causes a panic in statistics).
+    static SimpleThread temp;
+
+    for (int i = 0; i < thread.size(); i++) {
+        nameOut(os, csprintf("%s.xc.%i", name(), i));
+        temp.copyXC(thread[i]->getXC());
+        temp.serialize(os);
+    }
+}
+
+template <class Impl>
+void
+FullO3CPU<Impl>::unserialize(Checkpoint *cp, const std::string &section)
+{
+    BaseCPU::unserialize(cp, section);
+    tickEvent.unserialize(cp, csprintf("%s.tickEvent", section));
+
+    // Use SimpleThread's ability to checkpoint to make it easier to
+    // read in the registers.  Also make this static so it doesn't
+    // get instantiated multiple times (causes a panic in statistics).
+    static SimpleThread temp;
+
+    for (int i = 0; i < thread.size(); i++) {
+        temp.copyXC(thread[i]->getXC());
+        temp.unserialize(cp, csprintf("%s.xc.%i", section, i));
+        thread[i]->getXC()->copyArchRegs(temp.getXC());
+    }
+}
+*/
 template <class Impl>
 uint64_t
 FullO3CPU<Impl>::readIntReg(int reg_idx)
@@ -866,7 +907,8 @@ template <class Impl>
 void
 FullO3CPU<Impl>::setArchFloatRegSingle(int reg_idx, float val, unsigned tid)
 {
-    PhysRegIndex phys_reg = commitRenameMap[tid].lookup(reg_idx);
+    int idx = reg_idx + TheISA::FP_Base_DepTag;
+    PhysRegIndex phys_reg = commitRenameMap[tid].lookup(idx);
 
     regFile.setFloatRegSingle(phys_reg, val);
 }
@@ -875,7 +917,8 @@ template <class Impl>
 void
 FullO3CPU<Impl>::setArchFloatRegDouble(int reg_idx, double val, unsigned tid)
 {
-    PhysRegIndex phys_reg = commitRenameMap[tid].lookup(reg_idx);
+    int idx = reg_idx + TheISA::FP_Base_DepTag;
+    PhysRegIndex phys_reg = commitRenameMap[tid].lookup(idx);
 
     regFile.setFloatRegDouble(phys_reg, val);
 }
@@ -884,7 +927,8 @@ template <class Impl>
 void
 FullO3CPU<Impl>::setArchFloatRegInt(int reg_idx, uint64_t val, unsigned tid)
 {
-    PhysRegIndex phys_reg = commitRenameMap[tid].lookup(reg_idx);
+    int idx = reg_idx + TheISA::FP_Base_DepTag;
+    PhysRegIndex phys_reg = commitRenameMap[tid].lookup(idx);
 
     regFile.setFloatRegInt(phys_reg, val);
 }
diff --git a/cpu/o3/cpu.hh b/cpu/o3/cpu.hh
index f4b19bfb3..ef5c9ae53 100644
--- a/cpu/o3/cpu.hh
+++ b/cpu/o3/cpu.hh
@@ -63,6 +63,12 @@ class BaseFullCPU : public BaseCPU
 
     void regStats();
 
+    /** Sets this CPU's ID. */
+    void setCpuId(int id) { cpu_id = id; }
+
+    /** Reads this CPU's ID.  Does not modify the CPU, so it is const. */
+    int readCpuId() const { return cpu_id; }
+
   protected:
     int cpu_id;
 };
diff --git a/cpu/o3/decode_impl.hh b/cpu/o3/decode_impl.hh
index 0b686375e..e1af4d423 100644
--- a/cpu/o3/decode_impl.hh
+++ b/cpu/o3/decode_impl.hh
@@ -278,7 +278,7 @@ DefaultDecode<Impl>::squash(DynInstPtr &inst, unsigned tid)
     toFetch->decodeInfo[tid].doneSeqNum = inst->seqNum;
     toFetch->decodeInfo[tid].predIncorrect = true;
     toFetch->decodeInfo[tid].squash = true;
-    toFetch->decodeInfo[tid].nextPC = inst->readNextPC();
+    toFetch->decodeInfo[tid].nextPC = inst->branchTarget();
     toFetch->decodeInfo[tid].branchTaken =
         inst->readNextPC() != (inst->readPC() + sizeof(TheISA::MachInst));
 
@@ -294,7 +294,7 @@ DefaultDecode<Impl>::squash(DynInstPtr &inst, unsigned tid)
     for (int i=0; i<fromFetch->size; i++) {
         if (fromFetch->insts[i]->threadNumber == tid &&
             fromFetch->insts[i]->seqNum > inst->seqNum) {
-            fromFetch->insts[i]->squashed = true;
+            fromFetch->insts[i]->setSquashed();
         }
     }
 
@@ -343,7 +343,7 @@ DefaultDecode<Impl>::squash(unsigned tid)
 
     for (int i=0; i<fromFetch->size; i++) {
         if (fromFetch->insts[i]->threadNumber == tid) {
-            fromFetch->insts[i]->squashed = true;
+            fromFetch->insts[i]->setSquashed();
             squash_count++;
         }
     }
@@ -721,9 +721,8 @@ DefaultDecode<Impl>::decodeInsts(unsigned tid)
         // Go ahead and compute any PC-relative branches.
         if (inst->isDirectCtrl() && inst->isUncondCtrl()) {
             ++decodeBranchResolved;
-            inst->setNextPC(inst->branchTarget());
 
-            if (inst->mispredicted()) {
+            if (inst->branchTarget() != inst->readPredTarg()) {
                 ++decodeBranchMispred;
 
                 // Might want to set some sort of boolean and just do
diff --git a/cpu/o3/fetch.hh b/cpu/o3/fetch.hh
index 92a87ab54..0bde56ce9 100644
--- a/cpu/o3/fetch.hh
+++ b/cpu/o3/fetch.hh
@@ -358,6 +358,12 @@ class DefaultFetch
     /** The cache line being fetched. */
     uint8_t *cacheData[Impl::MaxThreads];
 
+    /** The block-aligned PC of the most recently requested cache line. */
+    Addr cacheDataPC[Impl::MaxThreads];
+
+    /** Whether or not the cache data is valid. */
+    bool cacheDataValid[Impl::MaxThreads];
+
     /** Size of instructions. */
     int instSize;
 
diff --git a/cpu/o3/fetch_impl.hh b/cpu/o3/fetch_impl.hh
index a309bd49a..cc09c4a41 100644
--- a/cpu/o3/fetch_impl.hh
+++ b/cpu/o3/fetch_impl.hh
@@ -138,6 +138,8 @@ DefaultFetch<Impl>::DefaultFetch(Params *params)
 
         // Create space to store a cache line.
         cacheData[tid] = new uint8_t[cacheBlkSize];
+        cacheDataPC[tid] = 0;
+        cacheDataValid[tid] = false;
 
         stalls[tid].decode = 0;
         stalls[tid].rename = 0;
@@ -334,6 +336,7 @@ DefaultFetch<Impl>::processCacheCompletion(MemReqPtr &req)
     // Wake up the CPU (if it went to sleep and was waiting on this completion
     // event).
     cpu->wakeCPU();
+    cacheDataValid[tid] = true;
 
     DPRINTF(Activity, "[tid:%u] Activating fetch due to cache completion\n",
             tid);
@@ -466,7 +469,7 @@ DefaultFetch<Impl>::fetchCacheLine(Addr fetch_PC, Fault &ret_fault, unsigned tid
     unsigned flags = 0;
 #endif // FULL_SYSTEM
 
-    if (interruptPending && flags == 0 || switchedOut) {
+    if (interruptPending && flags == 0) {
         // Hold off fetch from getting new instructions while an interrupt
         // is pending.
         return false;
@@ -475,6 +478,11 @@ DefaultFetch<Impl>::fetchCacheLine(Addr fetch_PC, Fault &ret_fault, unsigned tid
     // Align the fetch PC so it's at the start of a cache block.
     fetch_PC = icacheBlockAlignPC(fetch_PC);
 
+    // If we've already got the block, no need to try to fetch it again.
+    if (cacheDataValid[tid] && fetch_PC == cacheDataPC[tid]) {
+        return true;
+    }
+
     // Setup the memReq to do a read of the first instruction's address.
     // Set the appropriate read size and flags as well.
     memReq[tid] = new MemReq();
@@ -525,6 +533,9 @@ DefaultFetch<Impl>::fetchCacheLine(Addr fetch_PC, Fault &ret_fault, unsigned tid
 
             MemAccessResult result = icacheInterface->access(memReq[tid]);
 
+            cacheDataPC[tid] = fetch_PC;
+            cacheDataValid[tid] = false;
+
             fetchedCacheLines++;
 
             // If the cache missed, then schedule an event to wake
@@ -1002,8 +1013,8 @@ DefaultFetch<Impl>::fetch(bool &status_change)
             fetch_PC = next_PC;
 
             if (instruction->isQuiesce()) {
-                warn("%lli: Quiesce instruction encountered, halting fetch!",
-                     curTick);
+//                warn("%lli: Quiesce instruction encountered, halting fetch!",
+//                     curTick);
                 fetchStatus[tid] = QuiescePending;
                 ++numInst;
                 status_change = true;
@@ -1067,7 +1078,7 @@ DefaultFetch<Impl>::fetch(bool &status_change)
         fetchStatus[tid] = TrapPending;
         status_change = true;
 
-        warn("%lli fault (%d) detected @ PC %08p", curTick, fault, PC[tid]);
+//        warn("%lli fault (%d) detected @ PC %08p", curTick, fault, PC[tid]);
 #else // !FULL_SYSTEM
         fatal("fault (%d) detected @ PC %08p", fault, PC[tid]);
 #endif // FULL_SYSTEM
diff --git a/cpu/o3/iew.hh b/cpu/o3/iew.hh
index eda6a6bc0..d21c573fe 100644
--- a/cpu/o3/iew.hh
+++ b/cpu/o3/iew.hh
@@ -224,6 +224,47 @@ class DefaultIEW
     /** Returns if the LSQ has any stores to writeback. */
     bool hasStoresToWB() { return ldstQueue.hasStoresToWB(); }
 
+    void incrWb(InstSeqNum &sn) // Counts one more outstanding writeback.
+    {
+        if (++wbOutstanding == wbMax) // Reached wbMax: stop issuing.
+            ableToIssue = false;
+        DPRINTF(IEW, "wbOutstanding: %i\n", wbOutstanding);
+        assert(wbOutstanding <= wbMax);
+#ifdef DEBUG
+        wbList.insert(sn); // DEBUG builds also record which sn is outstanding.
+#endif
+    }
+
+    void decrWb(InstSeqNum &sn) // Retires one outstanding writeback.
+    {
+        if (wbOutstanding-- == wbMax) // Was at wbMax before this decrement.
+            ableToIssue = true;
+        DPRINTF(IEW, "wbOutstanding: %i\n", wbOutstanding);
+        assert(wbOutstanding >= 0);
+#ifdef DEBUG
+        assert(wbList.find(sn) != wbList.end()); // sn must be in wbList.
+        wbList.erase(sn);
+#endif
+    }
+
+#ifdef DEBUG
+    std::set<InstSeqNum> wbList;
+
+    void dumpWb() // Prints every sequence number currently held in wbList.
+    {
+        std::set<InstSeqNum>::iterator wb_it = wbList.begin();
+        while (wb_it != wbList.end()) {
+            cprintf("[sn:%lli]\n",
+                    (*wb_it));
+            wb_it++;
+        }
+    }
+#endif
+
+    bool canIssue() { return ableToIssue; } // Reports whether issue is allowed.
+
+    bool ableToIssue; // Cleared by incrWb() at wbMax, set again by decrWb().
+
   private:
     /** Sends commit proper information for a squash due to a branch
      * mispredict.
@@ -281,6 +322,9 @@ class DefaultIEW
     /** Processes inputs and changes state accordingly. */
     void checkSignalsAndUpdate(unsigned tid);
 
+    /** Empties a thread's list of instructions received from rename. */
+    void emptyRenameInsts(unsigned tid);
+
     /** Sorts instructions coming from rename into lists separated by thread. */
     void sortInsts();
 
@@ -401,20 +445,12 @@ class DefaultIEW
      */
     unsigned issueToExecuteDelay;
 
-    /** Width of issue's read path, in instructions.  The read path is both
-     *  the skid buffer and the rename instruction queue.
-     *  Note to self: is this really different than issueWidth?
-     */
-    unsigned issueReadWidth;
+    /** Width of dispatch, in instructions. */
+    unsigned dispatchWidth;
 
     /** Width of issue, in instructions. */
     unsigned issueWidth;
 
-    /** Width of execute, in instructions.  Might make more sense to break
-     *  down into FP vs int.
-     */
-    unsigned executeWidth;
-
     /** Index into queue of instructions being written back. */
     unsigned wbNumInst;
 
@@ -425,6 +461,17 @@ class DefaultIEW
      */
     unsigned wbCycle;
 
+    /** Number of instructions in flight that will writeback. */
+    int wbOutstanding;
+
+    /** Writeback width. */
+    unsigned wbWidth;
+
+    /** Writeback width * writeback depth, where writeback depth is
+     * the number of cycles of writing back instructions that can be
+     * buffered. */
+    unsigned wbMax;
+
     /** Number of active threads. */
     unsigned numThreads;
 
@@ -459,14 +506,6 @@ class DefaultIEW
     Stats::Scalar<> iewIQFullEvents;
     /** Stat for number of times the LSQ becomes full. */
     Stats::Scalar<> iewLSQFullEvents;
-    /** Stat for total number of executed instructions. */
-    Stats::Scalar<> iewExecutedInsts;
-    /** Stat for total number of executed load instructions. */
-    Stats::Vector<> iewExecLoadInsts;
-    /** Stat for total number of executed store instructions. */
-//    Stats::Scalar<> iewExecStoreInsts;
-    /** Stat for total number of squashed instructions skipped at execute. */
-    Stats::Scalar<> iewExecSquashedInsts;
     /** Stat for total number of memory ordering violation events. */
     Stats::Scalar<> memOrderViolationEvents;
     /** Stat for total number of incorrect predicted taken branches. */
@@ -476,28 +515,27 @@ class DefaultIEW
     /** Stat for total number of mispredicted branches detected at execute. */
     Stats::Formula branchMispredicts;
 
+    /** Stat for total number of executed instructions. */
+    Stats::Scalar<> iewExecutedInsts;
+    /** Stat for total number of executed load instructions. */
+    Stats::Vector<> iewExecLoadInsts;
+    /** Stat for total number of executed store instructions. */
+//    Stats::Scalar<> iewExecStoreInsts;
+    /** Stat for total number of squashed instructions skipped at execute. */
+    Stats::Scalar<> iewExecSquashedInsts;
     /** Number of executed software prefetches. */
-    Stats::Vector<> exeSwp;
+    Stats::Vector<> iewExecutedSwp;
     /** Number of executed nops. */
-    Stats::Vector<> exeNop;
+    Stats::Vector<> iewExecutedNop;
     /** Number of executed meomory references. */
-    Stats::Vector<> exeRefs;
+    Stats::Vector<> iewExecutedRefs;
     /** Number of executed branches. */
-    Stats::Vector<> exeBranches;
-
-//    Stats::Vector<> issued_ops;
-/*
-    Stats::Vector<> stat_fu_busy;
-    Stats::Vector2d<> stat_fuBusy;
-    Stats::Vector<> dist_unissued;
-    Stats::Vector2d<> stat_issued_inst_type;
-*/
-    /** Number of instructions issued per cycle. */
-    Stats::Formula issueRate;
+    Stats::Vector<> iewExecutedBranches;
     /** Number of executed store instructions. */
     Stats::Formula iewExecStoreInsts;
-//    Stats::Formula issue_op_rate;
-//    Stats::Formula fu_busy_rate;
+    /** Number of instructions executed per cycle. */
+    Stats::Formula iewExecRate;
+
     /** Number of instructions sent to commit. */
     Stats::Vector<> iewInstsToCommit;
     /** Number of instructions that writeback. */
@@ -510,7 +548,6 @@ class DefaultIEW
      * to resource contention.
      */
     Stats::Vector<> wbPenalized;
-
     /** Number of instructions per cycle written back. */
     Stats::Formula wbRate;
     /** Average number of woken instructions per writeback. */
diff --git a/cpu/o3/iew_impl.hh b/cpu/o3/iew_impl.hh
index 3ed20cb75..102be4f8d 100644
--- a/cpu/o3/iew_impl.hh
+++ b/cpu/o3/iew_impl.hh
@@ -56,9 +56,11 @@ DefaultIEW<Impl>::LdWritebackEvent::process()
     //iewStage->ldstQueue.removeMSHR(inst->threadNumber,inst->seqNum);
 
     if (iewStage->isSwitchedOut()) {
+        iewStage->decrWb(inst->seqNum);
         inst = NULL;
         return;
     } else if (inst->isSquashed()) {
+        iewStage->decrWb(inst->seqNum);
         iewStage->wakeCPU();
         inst = NULL;
         return;
@@ -93,16 +95,17 @@ DefaultIEW<Impl>::LdWritebackEvent::description()
 template<class Impl>
 DefaultIEW<Impl>::DefaultIEW(Params *params)
     : // @todo: Make this into a parameter.
-      issueToExecQueue(5, 5),
+      issueToExecQueue(params->backComSize, params->forwardComSize),
       instQueue(params),
       ldstQueue(params),
       fuPool(params->fuPool),
       commitToIEWDelay(params->commitToIEWDelay),
       renameToIEWDelay(params->renameToIEWDelay),
       issueToExecuteDelay(params->issueToExecuteDelay),
-      issueReadWidth(params->issueWidth),
+      dispatchWidth(params->dispatchWidth),
       issueWidth(params->issueWidth),
-      executeWidth(params->executeWidth),
+      wbOutstanding(0),
+      wbWidth(params->wbWidth),
       numThreads(params->numberOfThreads),
       switchedOut(false)
 {
@@ -125,8 +128,12 @@ DefaultIEW<Impl>::DefaultIEW(Params *params)
         fetchRedirect[i] = false;
     }
 
+    wbMax = wbWidth * params->wbDepth;
+
     updateLSQNextCycle = false;
 
+    ableToIssue = true;
+
     skidBufferMax = (3 * (renameToIEWDelay * params->renameWidth)) + issueWidth;
 }
 
@@ -144,6 +151,7 @@ DefaultIEW<Impl>::regStats()
     using namespace Stats;
 
     instQueue.regStats();
+    ldstQueue.regStats();
 
     iewIdleCycles
         .name(name() + ".iewIdleCycles")
@@ -189,20 +197,6 @@ DefaultIEW<Impl>::regStats()
         .name(name() + ".iewLSQFullEvents")
         .desc("Number of times the LSQ has become full, causing a stall");
 
-    iewExecutedInsts
-        .name(name() + ".iewExecutedInsts")
-        .desc("Number of executed instructions");
-
-    iewExecLoadInsts
-        .init(cpu->number_of_threads)
-        .name(name() + ".iewExecLoadInsts")
-        .desc("Number of load instructions executed")
-        .flags(total);
-
-    iewExecSquashedInsts
-        .name(name() + ".iewExecSquashedInsts")
-        .desc("Number of squashed instructions skipped in execute");
-
     memOrderViolationEvents
         .name(name() + ".memOrderViolationEvents")
         .desc("Number of memory order violations");
@@ -221,47 +215,49 @@ DefaultIEW<Impl>::regStats()
 
     branchMispredicts = predictedTakenIncorrect + predictedNotTakenIncorrect;
 
-    exeSwp
+    iewExecutedInsts
+        .name(name() + ".iewExecutedInsts")
+        .desc("Number of executed instructions");
+
+    iewExecLoadInsts
+        .init(cpu->number_of_threads)
+        .name(name() + ".iewExecLoadInsts")
+        .desc("Number of load instructions executed")
+        .flags(total);
+
+    iewExecSquashedInsts
+        .name(name() + ".iewExecSquashedInsts")
+        .desc("Number of squashed instructions skipped in execute");
+
+    iewExecutedSwp
         .init(cpu->number_of_threads)
         .name(name() + ".EXEC:swp")
         .desc("number of swp insts executed")
-        .flags(total)
-        ;
+        .flags(total);
 
-    exeNop
+    iewExecutedNop
         .init(cpu->number_of_threads)
         .name(name() + ".EXEC:nop")
         .desc("number of nop insts executed")
-        .flags(total)
-        ;
+        .flags(total);
 
-    exeRefs
+    iewExecutedRefs
         .init(cpu->number_of_threads)
         .name(name() + ".EXEC:refs")
         .desc("number of memory reference insts executed")
-        .flags(total)
-        ;
+        .flags(total);
 
-    exeBranches
+    iewExecutedBranches
         .init(cpu->number_of_threads)
         .name(name() + ".EXEC:branches")
         .desc("Number of branches executed")
-        .flags(total)
-        ;
-
-    issueRate
-        .name(name() + ".EXEC:rate")
-        .desc("Inst execution rate")
-        .flags(total)
-        ;
-    issueRate = iewExecutedInsts / cpu->numCycles;
+        .flags(total);
 
     iewExecStoreInsts
         .name(name() + ".EXEC:stores")
         .desc("Number of stores executed")
-        .flags(total)
-        ;
-    iewExecStoreInsts = exeRefs - iewExecLoadInsts;
+        .flags(total);
+    iewExecStoreInsts = iewExecutedRefs - iewExecLoadInsts;
 /*
     for (int i=0; i<Num_OpClasses; ++i) {
         stringstream subname;
@@ -277,58 +273,50 @@ DefaultIEW<Impl>::regStats()
         .init(cpu->number_of_threads)
         .name(name() + ".WB:sent")
         .desc("cumulative count of insts sent to commit")
-        .flags(total)
-        ;
+        .flags(total);
 
     writebackCount
         .init(cpu->number_of_threads)
         .name(name() + ".WB:count")
         .desc("cumulative count of insts written-back")
-        .flags(total)
-        ;
+        .flags(total);
 
     producerInst
         .init(cpu->number_of_threads)
         .name(name() + ".WB:producers")
         .desc("num instructions producing a value")
-        .flags(total)
-        ;
+        .flags(total);
 
     consumerInst
         .init(cpu->number_of_threads)
         .name(name() + ".WB:consumers")
         .desc("num instructions consuming a value")
-        .flags(total)
-        ;
+        .flags(total);
 
     wbPenalized
         .init(cpu->number_of_threads)
         .name(name() + ".WB:penalized")
         .desc("number of instrctions required to write to 'other' IQ")
-        .flags(total)
-        ;
+        .flags(total);
 
     wbPenalizedRate
         .name(name() + ".WB:penalized_rate")
         .desc ("fraction of instructions written-back that wrote to 'other' IQ")
-        .flags(total)
-        ;
+        .flags(total);
 
     wbPenalizedRate = wbPenalized / writebackCount;
 
     wbFanout
         .name(name() + ".WB:fanout")
         .desc("average fanout of values written-back")
-        .flags(total)
-        ;
+        .flags(total);
 
     wbFanout = producerInst / consumerInst;
 
     wbRate
         .name(name() + ".WB:rate")
         .desc("insts written-back per cycle")
-        .flags(total)
-        ;
+        .flags(total);
     wbRate = writebackCount / cpu->numCycles;
 }
 
@@ -481,8 +469,7 @@ DefaultIEW<Impl>::takeOverFrom()
 
     updateLSQNextCycle = false;
 
-    // @todo: Fix hardcoded number
-    for (int i = 0; i < 6; ++i) {
+    for (int i = 0; i < issueToExecQueue.getSize(); ++i) {
         issueToExecQueue.advance();
     }
 }
@@ -515,16 +502,7 @@ DefaultIEW<Impl>::squash(unsigned tid)
         skidBuffer[tid].pop();
     }
 
-    while (!insts[tid].empty()) {
-        if (insts[tid].front()->isLoad() ||
-            insts[tid].front()->isStore() ) {
-            toRename->iewInfo[tid].dispatchedToLSQ++;
-        }
-
-        toRename->iewInfo[tid].dispatched++;
-
-        insts[tid].pop();
-    }
+    emptyRenameInsts(tid);
 }
 
 template<class Impl>
@@ -650,14 +628,16 @@ DefaultIEW<Impl>::instToCommit(DynInstPtr &inst)
     // free slot.
     while ((*iewQueue)[wbCycle].insts[wbNumInst]) {
         ++wbNumInst;
-        if (wbNumInst == issueWidth) {
+        if (wbNumInst == wbWidth) {
             ++wbCycle;
             wbNumInst = 0;
         }
 
-        assert(wbCycle < 5);
+        assert((wbCycle * wbWidth + wbNumInst) <= wbMax);
     }
 
+    DPRINTF(IEW, "Current wb cycle: %i, width: %i, numInst: %i\nwbActual:%i\n",
+            wbCycle, wbWidth, wbNumInst, wbCycle * wbWidth + wbNumInst);
     // Add finished instruction to queue to commit.
     (*iewQueue)[wbCycle].insts[wbNumInst] = inst;
     (*iewQueue)[wbCycle].size++;
@@ -670,7 +650,7 @@ DefaultIEW<Impl>::validInstsFromRename()
     unsigned inst_count = 0;
 
     for (int i=0; i<fromRename->size; i++) {
-        if (!fromRename->insts[i]->squashed)
+        if (!fromRename->insts[i]->isSquashed())
             inst_count++;
     }
 
@@ -858,10 +838,12 @@ DefaultIEW<Impl>::checkSignalsAndUpdate(unsigned tid)
     }
 
     if (fromCommit->commitInfo[tid].robSquashing) {
-        DPRINTF(IEW, "[tid:%i]: ROB is still squashing.\n");
+        DPRINTF(IEW, "[tid:%i]: ROB is still squashing.\n", tid);
 
         dispatchStatus[tid] = Squashing;
 
+        emptyRenameInsts(tid);
+        wroteToTimeBuffer = true;
         return;
     }
 
@@ -912,6 +894,22 @@ DefaultIEW<Impl>::sortInsts()
 
 template <class Impl>
 void
+DefaultIEW<Impl>::emptyRenameInsts(unsigned tid) // Drains insts[tid].
+{
+    while (!insts[tid].empty()) {
+        if (insts[tid].front()->isLoad() ||
+            insts[tid].front()->isStore() ) {
+            toRename->iewInfo[tid].dispatchedToLSQ++; // Loads/stores are also tallied here.
+        }
+
+        toRename->iewInfo[tid].dispatched++; // Every dropped inst is tallied in the toRename info.
+
+        insts[tid].pop();
+    }
+}
+
+template <class Impl>
+void
 DefaultIEW<Impl>::wakeCPU()
 {
     cpu->wakeCPU();
@@ -1010,7 +1008,7 @@ DefaultIEW<Impl>::dispatchInsts(unsigned tid)
     // Loop through the instructions, putting them in the instruction
     // queue.
     for ( ; dis_num_inst < insts_to_add &&
-              dis_num_inst < issueReadWidth;
+              dis_num_inst < dispatchWidth;
           ++dis_num_inst)
     {
         inst = insts_to_dispatch.front();
@@ -1149,7 +1147,7 @@ DefaultIEW<Impl>::dispatchInsts(unsigned tid)
 
             instQueue.recordProducer(inst);
 
-            exeNop[tid]++;
+            iewExecutedNop[tid]++;
 
             add_to_iq = false;
         } else if (inst->isExecuted()) {
@@ -1263,6 +1261,7 @@ DefaultIEW<Impl>::executeInsts()
 
             ++iewExecSquashedInsts;
 
+            decrWb(inst->seqNum);
             continue;
         }
 
@@ -1399,8 +1398,8 @@ DefaultIEW<Impl>::writebackInsts()
         DynInstPtr inst = toCommit->insts[inst_num];
         int tid = inst->threadNumber;
 
-        DPRINTF(IEW, "Sending instructions to commit, PC %#x.\n",
-                inst->readPC());
+        DPRINTF(IEW, "Sending instructions to commit, [sn:%lli] PC %#x.\n",
+                inst->seqNum, inst->readPC());
 
         iewInstsToCommit[tid]++;
 
@@ -1425,6 +1424,8 @@ DefaultIEW<Impl>::writebackInsts()
             }
             writebackCount[tid]++;
         }
+
+        decrWb(inst->seqNum);
     }
 }
 
@@ -1561,7 +1562,7 @@ DefaultIEW<Impl>::updateExeInstStats(DynInstPtr &inst)
     //
 #ifdef TARGET_ALPHA
     if (inst->isDataPrefetch())
-        exeSwp[thread_number]++;
+        iewExecutedSwp[thread_number]++;
     else
         iewExecutedInsts++;
 #else
@@ -1572,13 +1573,13 @@ DefaultIEW<Impl>::updateExeInstStats(DynInstPtr &inst)
     //  Control operations
     //
     if (inst->isControl())
-        exeBranches[thread_number]++;
+        iewExecutedBranches[thread_number]++;
 
     //
     //  Memory operations
     //
     if (inst->isMemRef()) {
-        exeRefs[thread_number]++;
+        iewExecutedRefs[thread_number]++;
 
         if (inst->isLoad()) {
             iewExecLoadInsts[thread_number]++;
diff --git a/cpu/o3/inst_queue.hh b/cpu/o3/inst_queue.hh
index 4802cbaf4..80cd71f0d 100644
--- a/cpu/o3/inst_queue.hh
+++ b/cpu/o3/inst_queue.hh
@@ -490,8 +490,6 @@ class InstructionQueue
 
     /** Number of instructions issued per cycle. */
     Stats::Formula issueRate;
-//    Stats::Formula issue_stores;
-//    Stats::Formula issue_op_rate;
     /** Number of times the FU was busy. */
     Stats::Vector<> fuBusy;
     /** Number of times the FU was busy per instruction issued. */
diff --git a/cpu/o3/inst_queue_impl.hh b/cpu/o3/inst_queue_impl.hh
index d677a259c..72cb0d708 100644
--- a/cpu/o3/inst_queue_impl.hh
+++ b/cpu/o3/inst_queue_impl.hh
@@ -288,22 +288,7 @@ InstructionQueue<Impl>::regStats()
         .flags(total)
         ;
     issueRate = iqInstsIssued / cpu->numCycles;
-/*
-    issue_stores
-        .name(name() + ".ISSUE:stores")
-        .desc("Number of stores issued")
-        .flags(total)
-        ;
-    issue_stores = exe_refs - exe_loads;
-*/
-/*
-    issue_op_rate
-        .name(name() + ".ISSUE:op_rate")
-        .desc("Operation issue rate")
-        .flags(total)
-        ;
-    issue_op_rate = issued_ops / numCycles;
-*/
+
     statFuBusy
         .init(Num_OpClasses)
         .name(name() + ".ISSUE:fu_full")
@@ -700,6 +685,7 @@ InstructionQueue<Impl>::scheduleReadyInsts()
     int total_issued = 0;
 
     while (total_issued < totalWidth &&
+           iewStage->canIssue() &&
            order_it != order_end_it) {
         OpClass op_class = (*order_it).queueType;
 
@@ -790,13 +776,14 @@ InstructionQueue<Impl>::scheduleReadyInsts()
                 // complete.
                 ++freeEntries;
                 count[tid]--;
-                issuing_inst->removeInIQ();
+                issuing_inst->clearInIQ();
             } else {
                 memDepUnit[tid].issue(issuing_inst);
             }
 
             listOrder.erase(order_it++);
             statIssuedInstType[tid][op_class]++;
+            iewStage->incrWb(issuing_inst->seqNum);
         } else {
             statFuBusy[op_class]++;
             fuBusy[tid]++;
@@ -1096,7 +1083,7 @@ InstructionQueue<Impl>::doSquash(unsigned tid)
             // inst will flow through the rest of the pipeline.
             squashed_inst->setIssued();
             squashed_inst->setCanCommit();
-            squashed_inst->removeInIQ();
+            squashed_inst->clearInIQ();
 
             //Update Thread IQ Count
             count[squashed_inst->threadNumber]--;
diff --git a/cpu/o3/lsq.hh b/cpu/o3/lsq.hh
index b321d4590..c67225bc0 100644
--- a/cpu/o3/lsq.hh
+++ b/cpu/o3/lsq.hh
@@ -62,6 +62,9 @@ class LSQ {
     /** Returns the name of the LSQ. */
     std::string name() const;
 
+    /** Registers the statistics for each LSQ Unit. */
+    void regStats();
+
     /** Sets the pointer to the list of active threads. */
     void setActiveThreads(std::list<unsigned> *at_ptr);
     /** Sets the CPU pointer. */
diff --git a/cpu/o3/lsq_impl.hh b/cpu/o3/lsq_impl.hh
index a6ad27522..a8a55af1a 100644
--- a/cpu/o3/lsq_impl.hh
+++ b/cpu/o3/lsq_impl.hh
@@ -106,6 +106,16 @@ LSQ<Impl>::name() const
 
 template<class Impl>
 void
+LSQ<Impl>::regStats()
+{
+    // Register the statistics of each thread's LSQ unit.
+    for (int tid=0; tid < numThreads; tid++) {
+        thread[tid].regStats();
+    }
+}
+
+template<class Impl>
+void
 LSQ<Impl>::setActiveThreads(list<unsigned> *at_ptr)
 {
     activeThreads = at_ptr;
diff --git a/cpu/o3/lsq_unit.hh b/cpu/o3/lsq_unit.hh
index a6afff743..fe174a97d 100644
--- a/cpu/o3/lsq_unit.hh
+++ b/cpu/o3/lsq_unit.hh
@@ -101,6 +101,9 @@ class LSQUnit {
     /** Returns the name of the LSQ unit. */
     std::string name() const;
 
+    /** Registers statistics. */
+    void regStats();
+
     /** Sets the CPU pointer. */
     void setCPU(FullCPU *cpu_ptr)
     { cpu = cpu_ptr; }
@@ -153,9 +156,6 @@ class LSQUnit {
     /** Writes back stores. */
     void writebackStores();
 
-    // @todo: Include stats in the LSQ unit.
-    //void regStats();
-
     /** Clears all the entries in the LQ. */
     void clearLQ();
 
@@ -369,25 +369,34 @@ class LSQUnit {
     // Will also need how many read/write ports the Dcache has.  Or keep track
     // of that in stage that is one level up, and only call executeLoad/Store
     // the appropriate number of times.
-/*
-    // total number of loads forwaded from LSQ stores
-    Stats::Vector<> lsq_forw_loads;
+    /** Total number of loads forwarded from LSQ stores. */
+    Stats::Scalar<> lsqForwLoads;
+
+    /** Total number of loads ignored due to invalid addresses. */
+    Stats::Scalar<> invAddrLoads;
+
+    /** Total number of squashed loads. */
+    Stats::Scalar<> lsqSquashedLoads;
 
-    // total number of loads ignored due to invalid addresses
-    Stats::Vector<> inv_addr_loads;
+    /** Total number of responses from the memory system that are
+     * ignored due to the instruction already being squashed. */
+    Stats::Scalar<> lsqIgnoredResponses;
 
-    // total number of software prefetches ignored due to invalid addresses
-    Stats::Vector<> inv_addr_swpfs;
+    /** Total number of squashed stores. */
+    Stats::Scalar<> lsqSquashedStores;
 
-    // total non-speculative bogus addresses seen (debug var)
-    Counter sim_invalid_addrs;
-    Stats::Vector<> fu_busy;  //cumulative fu busy
+    /** Total number of software prefetches ignored due to invalid addresses. */
+    Stats::Scalar<> invAddrSwpfs;
 
-    // ready loads blocked due to memory disambiguation
-    Stats::Vector<> lsq_blocked_loads;
+    /** Ready loads blocked due to partial store-forwarding. */
+    Stats::Scalar<> lsqBlockedLoads;
+
+    /** Number of loads that were rescheduled. */
+    Stats::Scalar<> lsqRescheduledLoads;
+
+    /** Number of times the LSQ is blocked due to the cache. */
+    Stats::Scalar<> lsqCacheBlocked;
 
-    Stats::Scalar<> lsqInversion;
-*/
   public:
     /** Executes the load at the given index. */
     template <class T>
@@ -441,8 +450,9 @@ LSQUnit<Impl>::read(MemReqPtr &req, T &data, int load_idx)
     // at the head of the LSQ and are ready to commit (at the head of the ROB
     // too).
     if (req->flags & UNCACHEABLE &&
-        (load_idx != loadHead || !loadQueue[load_idx]->reachedCommit)) {
+        (load_idx != loadHead || !loadQueue[load_idx]->isAtCommit())) {
         iewStage->rescheduleMemInst(loadQueue[load_idx]);
+        ++lsqRescheduledLoads;
         return TheISA::genMachineCheckFault();
     }
 
@@ -552,6 +562,8 @@ LSQUnit<Impl>::read(MemReqPtr &req, T &data, int load_idx)
             // Tell IQ/mem dep unit that this instruction will need to be
             // rescheduled eventually
             iewStage->rescheduleMemInst(loadQueue[load_idx]);
+            iewStage->decrWb(loadQueue[load_idx]->seqNum);
+            ++lsqRescheduledLoads;
 
             // Do not generate a writeback event as this instruction is not
             // complete.
@@ -559,6 +571,7 @@ LSQUnit<Impl>::read(MemReqPtr &req, T &data, int load_idx)
                     "Store idx %i to load addr %#x\n",
                     store_idx, req->vaddr);
 
+            ++lsqBlockedLoads;
             return NoFault;
         }
     }
@@ -579,6 +592,10 @@ LSQUnit<Impl>::read(MemReqPtr &req, T &data, int load_idx)
     // if we have a cache, do cache access too
     if (fault == NoFault && dcacheInterface) {
         if (dcacheInterface->isBlocked()) {
+            ++lsqCacheBlocked;
+
+            iewStage->decrWb(inst->seqNum);
+
             // There's an older load that's already going to squash.
             if (isLoadBlocked && blockedLoadSeqNum < inst->seqNum)
                 return NoFault;
diff --git a/cpu/o3/lsq_unit_impl.hh b/cpu/o3/lsq_unit_impl.hh
index 4ee8bb234..5cc3078f8 100644
--- a/cpu/o3/lsq_unit_impl.hh
+++ b/cpu/o3/lsq_unit_impl.hh
@@ -126,6 +126,47 @@ LSQUnit<Impl>::name() const
 
 template<class Impl>
 void
+LSQUnit<Impl>::regStats()
+{ // Gives each statistic a name prefixed with this unit's name() and a description.
+    lsqForwLoads
+        .name(name() + ".forwLoads")
+        .desc("Number of loads that had data forwarded from stores");
+
+    invAddrLoads
+        .name(name() + ".invAddrLoads")
+        .desc("Number of loads ignored due to an invalid address");
+
+    lsqSquashedLoads
+        .name(name() + ".squashedLoads")
+        .desc("Number of loads squashed");
+
+    lsqIgnoredResponses
+        .name(name() + ".ignoredResponses")
+        .desc("Number of memory responses ignored because the instruction is squashed");
+
+    lsqSquashedStores
+        .name(name() + ".squashedStores")
+        .desc("Number of stores squashed");
+
+    invAddrSwpfs
+        .name(name() + ".invAddrSwpfs")
+        .desc("Number of software prefetches ignored due to an invalid address");
+
+    lsqBlockedLoads
+        .name(name() + ".blockedLoads")
+        .desc("Number of blocked loads due to partial load-store forwarding");
+
+    lsqRescheduledLoads
+        .name(name() + ".rescheduledLoads")
+        .desc("Number of loads that were rescheduled");
+
+    lsqCacheBlocked
+        .name(name() + ".cacheBlocked")
+        .desc("Number of times an access to memory failed due to the cache being blocked");
+}
+
+template<class Impl>
+void
 LSQUnit<Impl>::clearLQ()
 {
     loadQueue.clear();
@@ -548,6 +589,7 @@ LSQUnit<Impl>::writebackStores()
         if (dcacheInterface && dcacheInterface->isBlocked()) {
             DPRINTF(LSQUnit, "Unable to write back any more stores, cache"
                     " is blocked!\n");
+            ++lsqCacheBlocked;
             break;
         }
 
@@ -705,7 +747,7 @@ LSQUnit<Impl>::squash(const InstSeqNum &squashed_num)
         }
 
         // Clear the smart pointer to make sure it is decremented.
-        loadQueue[load_idx]->squashed = true;
+        loadQueue[load_idx]->setSquashed();
         loadQueue[load_idx] = NULL;
         --loads;
 
@@ -748,7 +790,7 @@ LSQUnit<Impl>::squash(const InstSeqNum &squashed_num)
         }
 
         // Clear the smart pointer to make sure it is decremented.
-        storeQueue[store_idx].inst->squashed = true;
+        storeQueue[store_idx].inst->setSquashed();
         storeQueue[store_idx].inst = NULL;
         storeQueue[store_idx].canWB = 0;
 
@@ -765,6 +807,7 @@ LSQUnit<Impl>::squash(const InstSeqNum &squashed_num)
         storeTail = store_idx;
 
         decrStIdx(store_idx);
+        ++lsqSquashedStores;
     }
 }
 
diff --git a/cpu/o3/mem_dep_unit.cc b/cpu/o3/mem_dep_unit.cc
index ccdd1a515..b0f91d44f 100644
--- a/cpu/o3/mem_dep_unit.cc
+++ b/cpu/o3/mem_dep_unit.cc
@@ -35,6 +35,7 @@
 // AlphaSimpleImpl.
 template class MemDepUnit<StoreSet, AlphaSimpleImpl>;
 
+#ifdef DEBUG
 template <>
 int
 MemDepUnit<StoreSet, AlphaSimpleImpl>::MemDepEntry::memdep_count = 0;
@@ -44,3 +45,4 @@ MemDepUnit<StoreSet, AlphaSimpleImpl>::MemDepEntry::memdep_insert = 0;
 template <>
 int
 MemDepUnit<StoreSet, AlphaSimpleImpl>::MemDepEntry::memdep_erase = 0;
+#endif
diff --git a/cpu/o3/mem_dep_unit_impl.hh b/cpu/o3/mem_dep_unit_impl.hh
index 595e9293f..bfe694bd8 100644
--- a/cpu/o3/mem_dep_unit_impl.hh
+++ b/cpu/o3/mem_dep_unit_impl.hh
@@ -59,7 +59,9 @@ MemDepUnit<MemDepPred, Impl>::~MemDepUnit()
         }
     }
 
+#ifdef DEBUG
     assert(MemDepEntry::memdep_count == 0);
+#endif
 }
 
 template <class MemDepPred, class Impl>
@@ -141,7 +143,9 @@ MemDepUnit<MemDepPred, Impl>::insert(DynInstPtr &inst)
     // Add the MemDepEntry to the hash.
     memDepHash.insert(
         std::pair<InstSeqNum, MemDepEntryPtr>(inst->seqNum, inst_entry));
+#ifdef DEBUG
     MemDepEntry::memdep_insert++;
+#endif
 
     instList[tid].push_back(inst);
 
@@ -227,7 +231,9 @@ MemDepUnit<MemDepPred, Impl>::insertNonSpec(DynInstPtr &inst)
     // Insert the MemDepEntry into the hash.
     memDepHash.insert(
         std::pair<InstSeqNum, MemDepEntryPtr>(inst->seqNum, inst_entry));
+#ifdef DEBUG
     MemDepEntry::memdep_insert++;
+#endif
 
     // Add the instruction to the list.
     instList[tid].push_back(inst);
@@ -275,7 +281,9 @@ MemDepUnit<MemDepPred, Impl>::insertBarrier(DynInstPtr &barr_inst)
     // Add the MemDepEntry to the hash.
     memDepHash.insert(
         std::pair<InstSeqNum, MemDepEntryPtr>(barr_sn, inst_entry));
+#ifdef DEBUG
     MemDepEntry::memdep_insert++;
+#endif
 
     // Add the instruction to the instruction list.
     instList[tid].push_back(barr_inst);
@@ -375,7 +383,9 @@ MemDepUnit<MemDepPred, Impl>::completed(DynInstPtr &inst)
     (*hash_it).second = NULL;
 
     memDepHash.erase(hash_it);
+#ifdef DEBUG
     MemDepEntry::memdep_erase++;
+#endif
 }
 
 template <class MemDepPred, class Impl>
@@ -470,7 +480,9 @@ MemDepUnit<MemDepPred, Impl>::squash(const InstSeqNum &squashed_num,
         (*hash_it).second = NULL;
 
         memDepHash.erase(hash_it);
+#ifdef DEBUG
         MemDepEntry::memdep_erase++;
+#endif
 
         instList[tid].erase(squash_it--);
     }
@@ -551,5 +563,7 @@ MemDepUnit<MemDepPred, Impl>::dumpLists()
 
     cprintf("Memory dependence hash size: %i\n", memDepHash.size());
 
+#ifdef DEBUG
     cprintf("Memory dependence entries: %i\n", MemDepEntry::memdep_count);
+#endif
 }
diff --git a/cpu/o3/regfile.hh b/cpu/o3/regfile.hh
index ed1238d36..76c43d3a1 100644
--- a/cpu/o3/regfile.hh
+++ b/cpu/o3/regfile.hh
@@ -223,10 +223,10 @@ class PhysRegFile
 
   public:
     /** (signed) integer register file. */
-    std::vector<IntReg> intRegFile;
+    IntReg *intRegFile;
 
     /** Floating point register file. */
-    std::vector<FloatReg> floatRegFile;
+    FloatReg *floatRegFile;
 
     /** Miscellaneous register file. */
     MiscRegFile miscRegs[Impl::MaxThreads];
@@ -256,11 +256,17 @@ PhysRegFile<Impl>::PhysRegFile(unsigned _numPhysicalIntRegs,
     : numPhysicalIntRegs(_numPhysicalIntRegs),
       numPhysicalFloatRegs(_numPhysicalFloatRegs)
 {
-    intRegFile.resize(numPhysicalIntRegs);
-    floatRegFile.resize(numPhysicalFloatRegs);
+    intRegFile = new IntReg[numPhysicalIntRegs];
+    floatRegFile = new FloatReg[numPhysicalFloatRegs];
 
-    //memset(intRegFile, 0, sizeof(*intRegFile));
-    //memset(floatRegFile, 0, sizeof(*floatRegFile));
+    for (int i = 0; i < Impl::MaxThreads; ++i) {
+        miscRegs[i].clear();
+    }
+
+    // memset takes a byte count: scale by the number of registers so
+    // the whole array is zeroed, not just its first element.
+    memset(intRegFile, 0, sizeof(IntReg) * numPhysicalIntRegs);
+    memset(floatRegFile, 0, sizeof(FloatReg) * numPhysicalFloatRegs);
 }
 
 #endif
diff --git a/cpu/o3/rename_impl.hh b/cpu/o3/rename_impl.hh
index 829c99584..93f5b3504 100644
--- a/cpu/o3/rename_impl.hh
+++ b/cpu/o3/rename_impl.hh
@@ -348,7 +348,7 @@ DefaultRename<Impl>::squash(unsigned tid)
 
     for (int i=0; i<fromDecode->size; i++) {
         if (fromDecode->insts[i]->threadNumber == tid) {
-            fromDecode->insts[i]->squashed = true;
+            fromDecode->insts[i]->setSquashed();
             wroteToTimeBuffer = true;
             squashCount++;
         }
@@ -1029,7 +1029,7 @@ DefaultRename<Impl>::validInsts()
     unsigned inst_count = 0;
 
     for (int i=0; i<fromDecode->size; i++) {
-        if (!fromDecode->insts[i]->squashed)
+        if (!fromDecode->insts[i]->isSquashed())
             inst_count++;
     }
 
diff --git a/cpu/o3/rob.hh b/cpu/o3/rob.hh
index bdbdde32f..2043e0b34 100644
--- a/cpu/o3/rob.hh
+++ b/cpu/o3/rob.hh
@@ -305,7 +305,7 @@ class ROB
 
   private:
     /** The sequence number of the squashed instruction. */
-    InstSeqNum squashedSeqNum;
+    InstSeqNum squashedSeqNum[Impl::MaxThreads];
 
     /** Is the ROB done squashing. */
     bool doneSquashing[Impl::MaxThreads];
diff --git a/cpu/o3/rob_impl.hh b/cpu/o3/rob_impl.hh
index 25e0c80fd..62c4d9cf7 100644
--- a/cpu/o3/rob_impl.hh
+++ b/cpu/o3/rob_impl.hh
@@ -38,10 +38,10 @@ ROB<Impl>::ROB(unsigned _numEntries, unsigned _squashWidth,
     : numEntries(_numEntries),
       squashWidth(_squashWidth),
       numInstsInROB(0),
-      squashedSeqNum(0),
       numThreads(_numThreads)
 {
     for (int tid=0; tid  < numThreads; tid++) {
+        squashedSeqNum[tid] = 0;
         doneSquashing[tid] = true;
         threadEntries[tid] = 0;
     }
@@ -274,7 +274,7 @@ ROB<Impl>::retireHead(unsigned tid)
     --numInstsInROB;
     --threadEntries[tid];
 
-    head_inst->removeInROB();
+    head_inst->clearInROB();
     head_inst->setCommitted();
 
     instList[tid].erase(head_it);
@@ -349,11 +349,11 @@ void
 ROB<Impl>::doSquash(unsigned tid)
 {
     DPRINTF(ROB, "[tid:%u]: Squashing instructions until [sn:%i].\n",
-            tid, squashedSeqNum);
+            tid, squashedSeqNum[tid]);
 
     assert(squashIt[tid] != instList[tid].end());
 
-    if ((*squashIt[tid])->seqNum < squashedSeqNum) {
+    if ((*squashIt[tid])->seqNum < squashedSeqNum[tid]) {
         DPRINTF(ROB, "[tid:%u]: Done squashing instructions.\n",
                 tid);
 
@@ -368,7 +368,7 @@ ROB<Impl>::doSquash(unsigned tid)
     for (int numSquashed = 0;
          numSquashed < squashWidth &&
          squashIt[tid] != instList[tid].end() &&
-         (*squashIt[tid])->seqNum > squashedSeqNum;
+         (*squashIt[tid])->seqNum > squashedSeqNum[tid];
          ++numSquashed)
     {
         DPRINTF(ROB, "[tid:%u]: Squashing instruction PC %#x, seq num %i.\n",
@@ -405,7 +405,7 @@ ROB<Impl>::doSquash(unsigned tid)
 
 
     // Check if ROB is done squashing.
-    if ((*squashIt[tid])->seqNum <= squashedSeqNum) {
+    if ((*squashIt[tid])->seqNum <= squashedSeqNum[tid]) {
         DPRINTF(ROB, "[tid:%u]: Done squashing instructions.\n",
                 tid);
 
@@ -517,7 +517,7 @@ ROB<Impl>::squash(InstSeqNum squash_num,unsigned tid)
 
     doneSquashing[tid] = false;
 
-    squashedSeqNum = squash_num;
+    squashedSeqNum[tid] = squash_num;
 
     if (!instList[tid].empty()) {
         InstIt tail_thread = instList[tid].end();
diff --git a/cpu/ozone/cpu.hh b/cpu/ozone/cpu.hh
index 5af2b02b2..c272528b1 100644
--- a/cpu/ozone/cpu.hh
+++ b/cpu/ozone/cpu.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2005 The Regents of The University of Michigan
+ * Copyright (c) 2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -79,13 +79,13 @@ template <class>
 class Checker;
 
 /**
- * Declaration of Out-of-Order CPU class.  Basically it is a SimpleCPU with
- * simple out-of-order capabilities added to it.  It is still a 1 CPI machine
- * (?), but is capable of handling cache misses.  Basically it models having
- * a ROB/IQ by only allowing a certain amount of instructions to execute while
- * the cache miss is outstanding.
+ * Lightweight CPU model that approximates an out-of-order CPU.  It
+ * is separated into a front end and a back end, with the template
+ * parameter Impl describing the classes used for each.  The goal is
+ * to be able to specify through the Impl the class to use for the
+ * front end and back end, with different classes used to model
+ * different levels of detail.
  */
-
 template <class Impl>
 class OzoneCPU : public BaseCPU
 {
@@ -98,6 +98,11 @@ class OzoneCPU : public BaseCPU
     typedef TheISA::MiscReg MiscReg;
 
   public:
+    /**
+     * The ExecContext for this CPU, which is used to provide the
+     * CPU's interface to any external objects.  Internally most of
+     * the CPU state is stored within the OzoneThreadState class.
+     */
     class OzoneXC : public ExecContext {
       public:
         OzoneCPU<Impl> *cpu;
@@ -235,14 +240,19 @@ class OzoneCPU : public BaseCPU
 #endif
     };
 
-    // execution context proxy
+    // ExecContext for OzoneCPU
     OzoneXC ozoneXC;
+
+    // ExecContext pointer that will be given to any external objects.
     ExecContext *xcProxy;
+
+    // ExecContext pointer to the CheckerCPU's ExecContext.
     ExecContext *checkerXC;
 
     typedef OzoneThreadState<Impl> ImplState;
 
   private:
+    // Committed thread state for the OzoneCPU.
     OzoneThreadState<Impl> thread;
 
   public:
@@ -280,12 +290,6 @@ class OzoneCPU : public BaseCPU
             tickEvent.squash();
     }
 
-  private:
-    Trace::InstRecord *traceData;
-
-    template<typename T>
-    void trace_data(T data);
-
   public:
     enum Status {
         Running,
@@ -361,6 +365,7 @@ class OzoneCPU : public BaseCPU
     FrontEnd *frontEnd;
 
     BackEnd *backEnd;
+
   private:
     Status status() const { return _status; }
     void setStatus(Status new_status) { _status = new_status; }
@@ -392,12 +397,11 @@ class OzoneCPU : public BaseCPU
     // number of idle cycles
     Stats::Average<> notIdleFraction;
     Stats::Formula idleFraction;
-  public:
 
+  public:
     virtual void serialize(std::ostream &os);
     virtual void unserialize(Checkpoint *cp, const std::string &section);
 
-
 #if FULL_SYSTEM
     bool validInstAddr(Addr addr) { return true; }
     bool validDataAddr(Addr addr) { return true; }
@@ -585,12 +589,9 @@ class OzoneCPU : public BaseCPU
 
     Fault copy(Addr dest);
 
-    InstSeqNum globalSeqNum;
-
   public:
     void squashFromXC();
 
-    // @todo: This can be a useful debug function.  Implement it.
     void dumpInsts() { frontEnd->dumpInsts(); }
 
 #if FULL_SYSTEM
@@ -608,7 +609,6 @@ class OzoneCPU : public BaseCPU
 
     ExecContext *xcBase() { return xcProxy; }
 
-    bool decoupledFrontEnd;
     struct CommStruct {
         InstSeqNum doneSeqNum;
         InstSeqNum nonSpecSeqNum;
@@ -617,8 +617,13 @@ class OzoneCPU : public BaseCPU
 
         bool stall;
     };
+
+    InstSeqNum globalSeqNum;
+
     TimeBuffer<CommStruct> comm;
 
+    bool decoupledFrontEnd;
+
     bool lockFlag;
 
     Stats::Scalar<> quiesceCycles;
diff --git a/cpu/ozone/cpu_impl.hh b/cpu/ozone/cpu_impl.hh
index 5675da3a8..4f41f220a 100644
--- a/cpu/ozone/cpu_impl.hh
+++ b/cpu/ozone/cpu_impl.hh
@@ -26,9 +26,6 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-//#include <cstdio>
-//#include <cstdlib>
-
 #include "arch/isa_traits.hh" // For MachInst
 #include "base/trace.hh"
 #include "config/full_system.hh"
@@ -39,7 +36,6 @@
 #include "cpu/ozone/cpu.hh"
 #include "cpu/quiesce_event.hh"
 #include "cpu/static_inst.hh"
-//#include "mem/base_mem.hh"
 #include "mem/mem_interface.hh"
 #include "sim/sim_object.hh"
 #include "sim/stats.hh"
@@ -50,7 +46,6 @@
 #include "arch/alpha/tlb.hh"
 #include "arch/vtophys.hh"
 #include "base/callback.hh"
-//#include "base/remote_gdb.hh"
 #include "cpu/profile.hh"
 #include "kern/kernel_stats.hh"
 #include "mem/functional/memory_control.hh"
@@ -67,15 +62,6 @@
 using namespace TheISA;
 
 template <class Impl>
-template<typename T>
-void
-OzoneCPU<Impl>::trace_data(T data) {
-    if (traceData) {
-        traceData->setData(data);
-    }
-}
-
-template <class Impl>
 OzoneCPU<Impl>::TickEvent::TickEvent(OzoneCPU *c, int w)
     : Event(&mainEventQueue, CPU_Tick_Pri), cpu(c), width(w)
 {
@@ -104,7 +90,7 @@ OzoneCPU<Impl>::OzoneCPU(Params *p)
     : BaseCPU(p), thread(this, 0, p->workload[0], 0), tickEvent(this, p->width),
       mem(p->workload[0]->getMemory()),
 #endif
-      comm(5, 5)
+      comm(5, 5), decoupledFrontEnd(p->decoupledFrontEnd)
 {
     frontEnd = new FrontEnd(p);
     backEnd = new BackEnd(p);
@@ -112,6 +98,9 @@ OzoneCPU<Impl>::OzoneCPU(Params *p)
     _status = Idle;
 
     if (p->checker) {
+        // If checker is being used, get the checker from the params
+        // pointer, make the Checker's ExecContext, and setup the
+        // xcProxy to point to it.
         BaseCPU *temp_checker = p->checker;
         checker = dynamic_cast<Checker<DynInstPtr> *>(temp_checker);
         checker->setMemory(mem);
@@ -122,11 +111,17 @@ OzoneCPU<Impl>::OzoneCPU(Params *p)
         thread.xcProxy = checkerXC;
         xcProxy = checkerXC;
     } else {
+        // If checker is not being used, then the xcProxy points
+        // directly to the CPU's ExecContext.
         checker = NULL;
         thread.xcProxy = &ozoneXC;
         xcProxy = &ozoneXC;
     }
 
+    // Add xcProxy to CPU list of ExecContexts.
+    execContexts.push_back(xcProxy);
+
+    // Give the OzoneXC pointers to the CPU and the thread state.
     ozoneXC.cpu = this;
     ozoneXC.thread = &thread;
 
@@ -134,7 +129,7 @@ OzoneCPU<Impl>::OzoneCPU(Params *p)
 
     thread.setStatus(ExecContext::Suspended);
 #if FULL_SYSTEM
-    /***** All thread state stuff *****/
+    // Setup thread state stuff.
     thread.cpu = this;
     thread.tid = 0;
     thread.mem = p->mem;
@@ -171,8 +166,7 @@ OzoneCPU<Impl>::OzoneCPU(Params *p)
     numInst = 0;
     startNumInst = 0;
 
-    execContexts.push_back(xcProxy);
-
+    // Hand the front end and back end pointers to everything they may need.
     frontEnd->setCPU(this);
     backEnd->setCPU(this);
 
@@ -188,12 +182,13 @@ OzoneCPU<Impl>::OzoneCPU(Params *p)
     frontEnd->setBackEnd(backEnd);
     backEnd->setFrontEnd(frontEnd);
 
-    decoupledFrontEnd = p->decoupledFrontEnd;
-
     globalSeqNum = 1;
 
     checkInterrupts = false;
 
+    lockFlag = 0;
+
+    // Setup rename table, initializing all values to ready.
     for (int i = 0; i < TheISA::TotalNumRegs; ++i) {
         thread.renameTable[i] = new DynInst(this);
         thread.renameTable[i]->setResultReady();
@@ -206,8 +201,6 @@ OzoneCPU<Impl>::OzoneCPU(Params *p)
 //    pTable = p->pTable;
 #endif
 
-    lockFlag = 0;
-
     DPRINTF(OzoneCPU, "OzoneCPU: Created Ozone cpu object.\n");
 }
 
@@ -231,14 +224,20 @@ template <class Impl>
 void
 OzoneCPU<Impl>::signalSwitched()
 {
+    // Only complete the switchout when both the front end and back
+    // end have signalled they are ready to switch.
     if (++switchCount == 2) {
         backEnd->doSwitchOut();
         frontEnd->doSwitchOut();
+
         if (checker)
             checker->switchOut(sampler);
+
         _status = SwitchedOut;
+
         if (tickEvent.scheduled())
             tickEvent.squash();
+
         sampler->signalSwitched();
     }
     assert(switchCount <= 2);
@@ -793,6 +792,7 @@ OzoneCPU<Impl>::OzoneXC::takeOverFrom(ExecContext *old_context)
         thread->quiesceEvent->xc = this;
     }
 
+    // Copy kernel stats pointer from old context.
     thread->kernelStats = old_context->getKernelStats();
 //    storeCondFailures = 0;
     cpu->lockFlag = false;
@@ -814,7 +814,11 @@ OzoneCPU<Impl>::OzoneXC::regStats(const std::string &name)
 template <class Impl>
 void
 OzoneCPU<Impl>::OzoneXC::serialize(std::ostream &os)
-{ }
+{
+    // Once serialization is added, serialize the quiesce event and
+    // kernel stats.  Will need to make sure there aren't multiple
+    // things that serialize them.
+}
 
 template <class Impl>
 void
@@ -867,7 +871,6 @@ OzoneCPU<Impl>::OzoneXC::getThreadNum()
     return thread->tid;
 }
 
-// Also somewhat obnoxious.  Really only used for the TLB fault.
 template <class Impl>
 TheISA::MachInst
 OzoneCPU<Impl>::OzoneXC::getInst()
@@ -901,7 +904,7 @@ OzoneCPU<Impl>::OzoneXC::copyArchRegs(ExecContext *xc)
 
     // Need to copy the XC values into the current rename table,
     // copy the misc regs.
-    thread->regs.miscRegs.copyMiscRegs(xc);
+    TheISA::copyMiscRegs(xc, this);
 }
 
 template <class Impl>
diff --git a/cpu/ozone/inorder_back_end_impl.hh b/cpu/ozone/inorder_back_end_impl.hh
index 5a378ec76..cc92ec92f 100644
--- a/cpu/ozone/inorder_back_end_impl.hh
+++ b/cpu/ozone/inorder_back_end_impl.hh
@@ -257,7 +257,7 @@ InorderBackEnd<Impl>::executeInsts()
         }
 
         inst->setExecuted();
-        inst->setCompleted();
+        inst->setResultReady();
         inst->setCanCommit();
 
         instList.pop_front();
diff --git a/cpu/ozone/inst_queue_impl.hh b/cpu/ozone/inst_queue_impl.hh
index 0523c68d6..1b9fcdc84 100644
--- a/cpu/ozone/inst_queue_impl.hh
+++ b/cpu/ozone/inst_queue_impl.hh
@@ -848,13 +848,13 @@ template <class Impl>
 void
 InstQueue<Impl>::addReadyMemInst(DynInstPtr &ready_inst)
 {
-    OpClass op_class = ready_inst->opClass();
+//    OpClass op_class = ready_inst->opClass();
 
     readyInsts.push(ready_inst);
 
     DPRINTF(IQ, "Instruction is ready to issue, putting it onto "
             "the ready list, PC %#x opclass:%i [sn:%lli].\n",
-            ready_inst->readPC(), op_class, ready_inst->seqNum);
+            ready_inst->readPC(), ready_inst->opClass(), ready_inst->seqNum);
 }
 /*
 template <class Impl>
@@ -1175,11 +1175,11 @@ InstQueue<Impl>::addIfReady(DynInstPtr &inst)
             return;
         }
 
-        OpClass op_class = inst->opClass();
+//        OpClass op_class = inst->opClass();
 
         DPRINTF(IQ, "Instruction is ready to issue, putting it onto "
                 "the ready list, PC %#x opclass:%i [sn:%lli].\n",
-                inst->readPC(), op_class, inst->seqNum);
+                inst->readPC(), inst->opClass(), inst->seqNum);
 
         readyInsts.push(inst);
     }
diff --git a/cpu/ozone/lw_back_end.hh b/cpu/ozone/lw_back_end.hh
index 1c03ffb73..19f2b2b61 100644
--- a/cpu/ozone/lw_back_end.hh
+++ b/cpu/ozone/lw_back_end.hh
@@ -369,37 +369,37 @@ class LWBackEnd
 /*    Stats::Scalar<> dcacheStallCycles;
       Counter lastDcacheStall;
 */
-    Stats::Vector<> rob_cap_events;
-    Stats::Vector<> rob_cap_inst_count;
-    Stats::Vector<> iq_cap_events;
-    Stats::Vector<> iq_cap_inst_count;
+    Stats::Vector<> robCapEvents;
+    Stats::Vector<> robCapInstCount;
+    Stats::Vector<> iqCapEvents;
+    Stats::Vector<> iqCapInstCount;
     // total number of instructions executed
-    Stats::Vector<> exe_inst;
-    Stats::Vector<> exe_swp;
-    Stats::Vector<> exe_nop;
-    Stats::Vector<> exe_refs;
-    Stats::Vector<> exe_loads;
-    Stats::Vector<> exe_branches;
+    Stats::Vector<> exeInst;
+    Stats::Vector<> exeSwp;
+    Stats::Vector<> exeNop;
+    Stats::Vector<> exeRefs;
+    Stats::Vector<> exeLoads;
+    Stats::Vector<> exeBranches;
 
-    Stats::Vector<> issued_ops;
+    Stats::Vector<> issuedOps;
 
     // total number of loads forwaded from LSQ stores
-    Stats::Vector<> lsq_forw_loads;
+    Stats::Vector<> lsqForwLoads;
 
     // total number of loads ignored due to invalid addresses
-    Stats::Vector<> inv_addr_loads;
+    Stats::Vector<> invAddrLoads;
 
     // total number of software prefetches ignored due to invalid addresses
-    Stats::Vector<> inv_addr_swpfs;
+    Stats::Vector<> invAddrSwpfs;
     // ready loads blocked due to memory disambiguation
-    Stats::Vector<> lsq_blocked_loads;
+    Stats::Vector<> lsqBlockedLoads;
 
     Stats::Scalar<> lsqInversion;
 
-    Stats::Vector<> n_issued_dist;
-    Stats::VectorDistribution<> issue_delay_dist;
+    Stats::Vector<> nIssuedDist;
+    Stats::VectorDistribution<> issueDelayDist;
 
-    Stats::VectorDistribution<> queue_res_dist;
+    Stats::VectorDistribution<> queueResDist;
 /*
     Stats::Vector<> stat_fu_busy;
     Stats::Vector2d<> stat_fuBusy;
@@ -417,37 +417,37 @@ class LWBackEnd
     Stats::Formula commit_ipb;
     Stats::Formula lsq_inv_rate;
 */
-    Stats::Vector<> writeback_count;
-    Stats::Vector<> producer_inst;
-    Stats::Vector<> consumer_inst;
-    Stats::Vector<> wb_penalized;
+    Stats::Vector<> writebackCount;
+    Stats::Vector<> producerInst;
+    Stats::Vector<> consumerInst;
+    Stats::Vector<> wbPenalized;
 
-    Stats::Formula wb_rate;
-    Stats::Formula wb_fanout;
-    Stats::Formula wb_penalized_rate;
+    Stats::Formula wbRate;
+    Stats::Formula wbFanout;
+    Stats::Formula wbPenalizedRate;
 
     // total number of instructions committed
-    Stats::Vector<> stat_com_inst;
-    Stats::Vector<> stat_com_swp;
-    Stats::Vector<> stat_com_refs;
-    Stats::Vector<> stat_com_loads;
-    Stats::Vector<> stat_com_membars;
-    Stats::Vector<> stat_com_branches;
+    Stats::Vector<> statComInst;
+    Stats::Vector<> statComSwp;
+    Stats::Vector<> statComRefs;
+    Stats::Vector<> statComLoads;
+    Stats::Vector<> statComMembars;
+    Stats::Vector<> statComBranches;
 
-    Stats::Distribution<> n_committed_dist;
+    Stats::Distribution<> nCommittedDist;
 
-    Stats::Scalar<> commit_eligible_samples;
-    Stats::Vector<> commit_eligible;
+    Stats::Scalar<> commitEligibleSamples;
+    Stats::Vector<> commitEligible;
 
     Stats::Vector<> squashedInsts;
     Stats::Vector<> ROBSquashedInsts;
 
-    Stats::Scalar<> ROB_fcount;
-    Stats::Formula ROB_full_rate;
+    Stats::Scalar<> ROBFcount;
+    Stats::Formula ROBFullRate;
 
-    Stats::Vector<>  ROB_count;	 // cumulative ROB occupancy
-    Stats::Formula ROB_occ_rate;
-    Stats::VectorDistribution<> ROB_occ_dist;
+    Stats::Vector<>  ROBCount;	 // cumulative ROB occupancy
+    Stats::Formula ROBOccRate;
+    Stats::VectorDistribution<> ROBOccDist;
   public:
     void dumpInsts();
 
diff --git a/cpu/ozone/lw_back_end_impl.hh b/cpu/ozone/lw_back_end_impl.hh
index 41b4ea24b..18b2e8f47 100644
--- a/cpu/ozone/lw_back_end_impl.hh
+++ b/cpu/ozone/lw_back_end_impl.hh
@@ -251,78 +251,77 @@ void
 LWBackEnd<Impl>::regStats()
 {
     using namespace Stats;
-    rob_cap_events
+    robCapEvents
         .init(cpu->number_of_threads)
         .name(name() + ".ROB:cap_events")
         .desc("number of cycles where ROB cap was active")
         .flags(total)
         ;
 
-    rob_cap_inst_count
+    robCapInstCount
         .init(cpu->number_of_threads)
         .name(name() + ".ROB:cap_inst")
         .desc("number of instructions held up by ROB cap")
         .flags(total)
         ;
 
-    iq_cap_events
+    iqCapEvents
         .init(cpu->number_of_threads)
         .name(name() +".IQ:cap_events" )
         .desc("number of cycles where IQ cap was active")
         .flags(total)
         ;
 
-    iq_cap_inst_count
+    iqCapInstCount
         .init(cpu->number_of_threads)
         .name(name() + ".IQ:cap_inst")
         .desc("number of instructions held up by IQ cap")
         .flags(total)
         ;
 
-
-    exe_inst
+    exeInst
         .init(cpu->number_of_threads)
         .name(name() + ".ISSUE:count")
         .desc("number of insts issued")
         .flags(total)
         ;
 
-    exe_swp
+    exeSwp
         .init(cpu->number_of_threads)
         .name(name() + ".ISSUE:swp")
         .desc("number of swp insts issued")
         .flags(total)
         ;
 
-    exe_nop
+    exeNop
         .init(cpu->number_of_threads)
         .name(name() + ".ISSUE:nop")
         .desc("number of nop insts issued")
         .flags(total)
         ;
 
-    exe_refs
+    exeRefs
         .init(cpu->number_of_threads)
         .name(name() + ".ISSUE:refs")
         .desc("number of memory reference insts issued")
         .flags(total)
         ;
 
-    exe_loads
+    exeLoads
         .init(cpu->number_of_threads)
         .name(name() + ".ISSUE:loads")
         .desc("number of load insts issued")
         .flags(total)
         ;
 
-    exe_branches
+    exeBranches
         .init(cpu->number_of_threads)
         .name(name() + ".ISSUE:branches")
         .desc("Number of branches issued")
         .flags(total)
         ;
 
-    issued_ops
+    issuedOps
         .init(cpu->number_of_threads)
         .name(name() + ".ISSUE:op_count")
         .desc("number of insts issued")
@@ -339,28 +338,28 @@ LWBackEnd<Impl>::regStats()
     //
     //  Other stats
     //
-    lsq_forw_loads
+    lsqForwLoads
         .init(cpu->number_of_threads)
         .name(name() + ".LSQ:forw_loads")
         .desc("number of loads forwarded via LSQ")
         .flags(total)
         ;
 
-    inv_addr_loads
+    invAddrLoads
         .init(cpu->number_of_threads)
         .name(name() + ".ISSUE:addr_loads")
         .desc("number of invalid-address loads")
         .flags(total)
         ;
 
-    inv_addr_swpfs
+    invAddrSwpfs
         .init(cpu->number_of_threads)
         .name(name() + ".ISSUE:addr_swpfs")
         .desc("number of invalid-address SW prefetches")
         .flags(total)
         ;
 
-    lsq_blocked_loads
+    lsqBlockedLoads
         .init(cpu->number_of_threads)
         .name(name() + ".LSQ:blocked_loads")
         .desc("number of ready loads not issued due to memory disambiguation")
@@ -372,51 +371,51 @@ LWBackEnd<Impl>::regStats()
         .desc("Number of times LSQ instruction issued early")
         ;
 
-    n_issued_dist
+    nIssuedDist
         .init(issueWidth + 1)
         .name(name() + ".ISSUE:issued_per_cycle")
         .desc("Number of insts issued each cycle")
         .flags(total | pdf | dist)
         ;
-    issue_delay_dist
+    issueDelayDist
         .init(Num_OpClasses,0,99,2)
         .name(name() + ".ISSUE:")
         .desc("cycles from operands ready to issue")
         .flags(pdf | cdf)
         ;
 
-    queue_res_dist
+    queueResDist
         .init(Num_OpClasses, 0, 99, 2)
         .name(name() + ".IQ:residence:")
         .desc("cycles from dispatch to issue")
         .flags(total | pdf | cdf )
         ;
     for (int i = 0; i < Num_OpClasses; ++i) {
-        queue_res_dist.subname(i, opClassStrings[i]);
+        queueResDist.subname(i, opClassStrings[i]);
     }
 
-    writeback_count
+    writebackCount
         .init(cpu->number_of_threads)
         .name(name() + ".WB:count")
         .desc("cumulative count of insts written-back")
         .flags(total)
         ;
 
-    producer_inst
+    producerInst
         .init(cpu->number_of_threads)
         .name(name() + ".WB:producers")
         .desc("num instructions producing a value")
         .flags(total)
         ;
 
-    consumer_inst
+    consumerInst
         .init(cpu->number_of_threads)
         .name(name() + ".WB:consumers")
         .desc("num instructions consuming a value")
         .flags(total)
         ;
 
-    wb_penalized
+    wbPenalized
         .init(cpu->number_of_threads)
         .name(name() + ".WB:penalized")
         .desc("number of instrctions required to write to 'other' IQ")
@@ -424,71 +423,71 @@ LWBackEnd<Impl>::regStats()
         ;
 
 
-    wb_penalized_rate
+    wbPenalizedRate
         .name(name() + ".WB:penalized_rate")
         .desc ("fraction of instructions written-back that wrote to 'other' IQ")
         .flags(total)
         ;
 
-    wb_penalized_rate = wb_penalized / writeback_count;
+    wbPenalizedRate = wbPenalized / writebackCount;
 
-    wb_fanout
+    wbFanout
         .name(name() + ".WB:fanout")
         .desc("average fanout of values written-back")
         .flags(total)
         ;
 
-    wb_fanout = producer_inst / consumer_inst;
+    wbFanout = producerInst / consumerInst;
 
-    wb_rate
+    wbRate
         .name(name() + ".WB:rate")
         .desc("insts written-back per cycle")
         .flags(total)
         ;
-    wb_rate = writeback_count / cpu->numCycles;
+    wbRate = writebackCount / cpu->numCycles;
 
-    stat_com_inst
+    statComInst
         .init(cpu->number_of_threads)
         .name(name() + ".COM:count")
         .desc("Number of instructions committed")
         .flags(total)
         ;
 
-    stat_com_swp
+    statComSwp
         .init(cpu->number_of_threads)
         .name(name() + ".COM:swp_count")
         .desc("Number of s/w prefetches committed")
         .flags(total)
         ;
 
-    stat_com_refs
+    statComRefs
         .init(cpu->number_of_threads)
         .name(name() +  ".COM:refs")
         .desc("Number of memory references committed")
         .flags(total)
         ;
 
-    stat_com_loads
+    statComLoads
         .init(cpu->number_of_threads)
         .name(name() +  ".COM:loads")
         .desc("Number of loads committed")
         .flags(total)
         ;
 
-    stat_com_membars
+    statComMembars
         .init(cpu->number_of_threads)
         .name(name() +  ".COM:membars")
         .desc("Number of memory barriers committed")
         .flags(total)
         ;
 
-    stat_com_branches
+    statComBranches
         .init(cpu->number_of_threads)
         .name(name() + ".COM:branches")
         .desc("Number of branches committed")
         .flags(total)
         ;
-    n_committed_dist
+    nCommittedDist
         .init(0,commitWidth,1)
         .name(name() + ".COM:committed_per_cycle")
         .desc("Number of insts commited each cycle")
@@ -508,14 +507,14 @@ LWBackEnd<Impl>::regStats()
     //  -> The standard deviation is computed only over cycles where
     //  we reached the BW limit
     //
-    commit_eligible
+    commitEligible
         .init(cpu->number_of_threads)
         .name(name() + ".COM:bw_limited")
         .desc("number of insts not committed due to BW limits")
         .flags(total)
         ;
 
-    commit_eligible_samples
+    commitEligibleSamples
         .name(name() + ".COM:bw_lim_events")
         .desc("number cycles where commit BW limit reached")
         ;
@@ -532,32 +531,32 @@ LWBackEnd<Impl>::regStats()
         .desc("Number of instructions removed from inst list when they reached the head of the ROB")
         ;
 
-    ROB_fcount
+    ROBFcount
         .name(name() + ".ROB:full_count")
         .desc("number of cycles where ROB was full")
         ;
 
-    ROB_count
+    ROBCount
         .init(cpu->number_of_threads)
         .name(name() + ".ROB:occupancy")
         .desc(name() + ".ROB occupancy (cumulative)")
         .flags(total)
         ;
 
-    ROB_full_rate
+    ROBFullRate
         .name(name() + ".ROB:full_rate")
         .desc("ROB full per cycle")
         ;
-    ROB_full_rate = ROB_fcount / cpu->numCycles;
+    ROBFullRate = ROBFcount / cpu->numCycles;
 
-    ROB_occ_rate
+    ROBOccRate
         .name(name() + ".ROB:occ_rate")
         .desc("ROB occupancy rate")
         .flags(total)
         ;
-    ROB_occ_rate = ROB_count / cpu->numCycles;
+    ROBOccRate = ROBCount / cpu->numCycles;
 
-    ROB_occ_dist
+    ROBOccDist
         .init(cpu->number_of_threads,0,numROBEntries,2)
         .name(name() + ".ROB:occ_dist")
         .desc("ROB Occupancy per cycle")
@@ -660,7 +659,7 @@ LWBackEnd<Impl>::tick()
         return;
     }
 
-    ROB_count[0]+= numInsts;
+    ROBCount[0]+= numInsts;
 
     wbCycle = 0;
 
@@ -980,8 +979,8 @@ LWBackEnd<Impl>::executeInsts()
         }
     }
 
-    issued_ops[0]+= num_executed;
-    n_issued_dist[num_executed]++;
+    issuedOps[0]+= num_executed;
+    nIssuedDist[num_executed]++;
 }
 
 template<class Impl>
@@ -1002,13 +1001,13 @@ LWBackEnd<Impl>::instToCommit(DynInstPtr &inst)
             inst->setResultReady();
             int dependents = wakeDependents(inst);
             if (dependents) {
-                producer_inst[0]++;
-                consumer_inst[0]+= dependents;
+                producerInst[0]++;
+                consumerInst[0]+= dependents;
             }
         }
     }
 
-    writeback_count[0]++;
+    writebackCount[0]++;
 }
 #if 0
 template <class Impl>
@@ -1076,7 +1075,7 @@ LWBackEnd<Impl>::commitInst(int inst_num)
 
     thread->setPC(inst->readPC());
     thread->setNextPC(inst->readNextPC());
-    inst->reachedCommit = true;
+    inst->setAtCommit();
 
     // If the instruction is not executed yet, then it is a non-speculative
     // or store inst.  Signal backwards that it should be executed.
@@ -1229,6 +1228,9 @@ LWBackEnd<Impl>::commitInst(int inst_num)
         inst->traceData = NULL;
     }
 
+    if (inst->isCopy())
+        panic("Should not commit any copy instructions!");
+
     inst->clearDependents();
 
     frontEnd->addFreeRegs(freed_regs);
@@ -1292,7 +1294,7 @@ LWBackEnd<Impl>::commitInsts()
             break;
         }
     }
-    n_committed_dist.sample(inst_num);
+    nCommittedDist.sample(inst_num);
 }
 
 template <class Impl>
@@ -1344,7 +1346,7 @@ LWBackEnd<Impl>::squash(const InstSeqNum &sn)
 
         (*insts_it)->setCanCommit();
 
-        (*insts_it)->removeInROB();
+        (*insts_it)->clearInROB();
 
         for (int i = 0; i < (*insts_it)->numDestRegs(); ++i) {
             DynInstPtr prev_dest = (*insts_it)->getPrevDestInst(i);
@@ -1522,27 +1524,27 @@ LWBackEnd<Impl>::updateExeInstStats(DynInstPtr &inst)
     //
 #ifdef TARGET_ALPHA
     if (inst->isDataPrefetch())
-        exe_swp[thread_number]++;
+        exeSwp[thread_number]++;
     else
-        exe_inst[thread_number]++;
+        exeInst[thread_number]++;
 #else
-    exe_inst[thread_number]++;
+    exeInst[thread_number]++;
 #endif
 
     //
     //  Control operations
     //
     if (inst->isControl())
-        exe_branches[thread_number]++;
+        exeBranches[thread_number]++;
 
     //
     //  Memory operations
     //
     if (inst->isMemRef()) {
-        exe_refs[thread_number]++;
+        exeRefs[thread_number]++;
 
         if (inst->isLoad())
-            exe_loads[thread_number]++;
+            exeLoads[thread_number]++;
     }
 }
 
@@ -1562,33 +1564,33 @@ LWBackEnd<Impl>::updateComInstStats(DynInstPtr &inst)
     //
 #ifdef TARGET_ALPHA
     if (inst->isDataPrefetch()) {
-        stat_com_swp[tid]++;
+        statComSwp[tid]++;
     } else {
-        stat_com_inst[tid]++;
+        statComInst[tid]++;
     }
 #else
-    stat_com_inst[tid]++;
+    statComInst[tid]++;
 #endif
 
     //
     //  Control Instructions
     //
     if (inst->isControl())
-        stat_com_branches[tid]++;
+        statComBranches[tid]++;
 
     //
     //  Memory references
     //
     if (inst->isMemRef()) {
-        stat_com_refs[tid]++;
+        statComRefs[tid]++;
 
         if (inst->isLoad()) {
-            stat_com_loads[tid]++;
+            statComLoads[tid]++;
         }
     }
 
     if (inst->isMemBarrier()) {
-        stat_com_membars[tid]++;
+        statComMembars[tid]++;
     }
 }
 
diff --git a/cpu/ozone/lw_lsq.hh b/cpu/ozone/lw_lsq.hh
index 6fe343b42..c0bf0b0fe 100644
--- a/cpu/ozone/lw_lsq.hh
+++ b/cpu/ozone/lw_lsq.hh
@@ -447,7 +447,7 @@ OzoneLWLSQ<Impl>::read(MemReqPtr &req, T &data, int load_idx)
     // too).
     // @todo: Fix uncached accesses.
     if (req->flags & UNCACHEABLE &&
-        (inst != loadQueue.back() || !inst->reachedCommit)) {
+        (inst != loadQueue.back() || !inst->isAtCommit())) {
         DPRINTF(OzoneLSQ, "[sn:%lli] Uncached load and not head of "
                 "commit/LSQ!\n",
                 inst->seqNum);
diff --git a/cpu/ozone/thread_state.hh b/cpu/ozone/thread_state.hh
index c86c3a720..f104dff23 100644
--- a/cpu/ozone/thread_state.hh
+++ b/cpu/ozone/thread_state.hh
@@ -182,8 +182,6 @@ struct OzoneThreadState : public ThreadState {
     void setNextPC(uint64_t val)
     { nextPC = val; }
 
-    bool misspeculating() { return false; }
-
     void setInst(TheISA::MachInst _inst) { inst = _inst; }
 
     Counter readFuncExeInst() { return funcExeInst; }
diff --git a/cpu/thread_state.hh b/cpu/thread_state.hh
index e09cb12fd..12146bd11 100644
--- a/cpu/thread_state.hh
+++ b/cpu/thread_state.hh
@@ -60,6 +60,7 @@ struct ThreadState {
         : cpuId(_cpuId), tid(_tid), mem(_mem), process(_process), asid(_asid)
 #endif
     {
+        numInst = 0;
         funcExeInst = 0;
         storeCondFailures = 0;
     }
diff --git a/python/m5/objects/AlphaFullCPU.py b/python/m5/objects/AlphaFullCPU.py
index 043c3c08f..015e9d872 100644
--- a/python/m5/objects/AlphaFullCPU.py
+++ b/python/m5/objects/AlphaFullCPU.py
@@ -39,12 +39,10 @@ class DerivAlphaFullCPU(BaseCPU):
                "Issue/Execute/Writeback delay")
     issueToExecuteDelay = Param.Unsigned("Issue to execute delay (internal "
               "to the IEW stage)")
-    issueWidth = Param.Unsigned("Issue width")
-    executeWidth = Param.Unsigned("Execute width")
-    executeIntWidth = Param.Unsigned("Integer execute width")
-    executeFloatWidth = Param.Unsigned("Floating point execute width")
-    executeBranchWidth = Param.Unsigned("Branch execute width")
-    executeMemoryWidth = Param.Unsigned("Memory execute width")
+    dispatchWidth = Param.Unsigned(8, "Dispatch width")
+    issueWidth = Param.Unsigned(8, "Issue width")
+    wbWidth = Param.Unsigned(8, "Writeback width")
+    wbDepth = Param.Unsigned(1, "Writeback depth")
     fuPool = Param.FUPool(NULL, "Functional Unit pool")
 
     iewToCommitDelay = Param.Unsigned("Issue/Execute/Writeback to commit "
@@ -55,6 +53,9 @@ class DerivAlphaFullCPU(BaseCPU):
     trapLatency = Param.Tick("Trap latency")
     fetchTrapLatency = Param.Tick("Fetch trap latency")
 
+    backComSize = Param.Unsigned(5, "Time buffer size for backwards communication")
+    forwardComSize = Param.Unsigned(5, "Time buffer size for forward communication")
+
     predType = Param.String("Branch predictor type ('local', 'tournament')")
     localPredictorSize = Param.Unsigned("Size of local predictor")
     localCtrBits = Param.Unsigned("Bits per counter")