24 files changed, 414 insertions, 270 deletions
diff --git a/cpu/o3/alpha_cpu_builder.cc b/cpu/o3/alpha_cpu_builder.cc
index 08d42cd46..c563fbef3 100644
--- a/cpu/o3/alpha_cpu_builder.cc
+++ b/cpu/o3/alpha_cpu_builder.cc
@@ -94,12 +94,10 @@ Param<unsigned> renameWidth;
 Param<unsigned> commitToIEWDelay;
 Param<unsigned> renameToIEWDelay;
 Param<unsigned> issueToExecuteDelay;
+Param<unsigned> dispatchWidth;
 Param<unsigned> issueWidth;
-Param<unsigned> executeWidth;
-Param<unsigned> executeIntWidth;
-Param<unsigned> executeFloatWidth;
-Param<unsigned> executeBranchWidth;
-Param<unsigned> executeMemoryWidth;
+Param<unsigned> wbWidth;
+Param<unsigned> wbDepth;
 SimObjectParam<FUPool *> fuPool;
 
 Param<unsigned> iewToCommitDelay;
@@ -109,6 +107,9 @@ Param<unsigned> squashWidth;
 Param<Tick> trapLatency;
 Param<Tick> fetchTrapLatency;
 
+Param<unsigned> backComSize;
+Param<unsigned> forwardComSize;
+
 Param<std::string> predType;
 Param<unsigned> localPredictorSize;
 Param<unsigned> localCtrBits;
@@ -219,12 +220,10 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivAlphaFullCPU)
                "Issue/Execute/Writeback delay"),
     INIT_PARAM(issueToExecuteDelay, "Issue to execute delay (internal"
                "to the IEW stage)"),
+    INIT_PARAM(dispatchWidth, "Dispatch width"),
     INIT_PARAM(issueWidth, "Issue width"),
-    INIT_PARAM(executeWidth, "Execute width"),
-    INIT_PARAM(executeIntWidth, "Integer execute width"),
-    INIT_PARAM(executeFloatWidth, "Floating point execute width"),
-    INIT_PARAM(executeBranchWidth, "Branch execute width"),
-    INIT_PARAM(executeMemoryWidth, "Memory execute width"),
+    INIT_PARAM(wbWidth, "Writeback width"),
+    INIT_PARAM(wbDepth, "Writeback depth (number of cycles it can buffer)"),
     INIT_PARAM_DFLT(fuPool, "Functional unit pool", NULL),
 
     INIT_PARAM(iewToCommitDelay, "Issue/Execute/Writeback to commit "
@@ -235,6 +234,9 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivAlphaFullCPU)
     INIT_PARAM_DFLT(trapLatency, "Number of cycles before the trap is handled", 6),
     INIT_PARAM_DFLT(fetchTrapLatency, "Number of cycles before the fetch trap is handled", 12),
 
+    INIT_PARAM(backComSize, "Time buffer size for backwards communication"),
+    INIT_PARAM(forwardComSize, "Time buffer size for forward communication"),
+
     INIT_PARAM(predType, "Type of branch predictor ('local', 'tournament')"),
     INIT_PARAM(localPredictorSize, "Size of local predictor"),
     INIT_PARAM(localCtrBits, "Bits per counter"),
@@ -353,12 +355,10 @@ CREATE_SIM_OBJECT(DerivAlphaFullCPU)
     params->commitToIEWDelay = commitToIEWDelay;
     params->renameToIEWDelay = renameToIEWDelay;
     params->issueToExecuteDelay = issueToExecuteDelay;
+    params->dispatchWidth = dispatchWidth;
     params->issueWidth = issueWidth;
-    params->executeWidth = executeWidth;
-    params->executeIntWidth = executeIntWidth;
-    params->executeFloatWidth = executeFloatWidth;
-    params->executeBranchWidth = executeBranchWidth;
-    params->executeMemoryWidth = executeMemoryWidth;
+    params->wbWidth = wbWidth;
+    params->wbDepth = wbDepth;
     params->fuPool = fuPool;
 
     params->iewToCommitDelay = iewToCommitDelay;
@@ -368,6 +368,9 @@ CREATE_SIM_OBJECT(DerivAlphaFullCPU)
     params->trapLatency = trapLatency;
     params->fetchTrapLatency = fetchTrapLatency;
 
+    params->backComSize = backComSize;
+    params->forwardComSize = forwardComSize;
+
     params->predType = predType;
     params->localPredictorSize = localPredictorSize;
     params->localCtrBits = localCtrBits;
diff --git a/cpu/o3/alpha_cpu_impl.hh b/cpu/o3/alpha_cpu_impl.hh
index f39fdf6b6..1bf0652cd 100644
--- a/cpu/o3/alpha_cpu_impl.hh
+++ b/cpu/o3/alpha_cpu_impl.hh
@@ -383,7 +383,7 @@ AlphaFullCPU<Impl>::AlphaXC::copyArchRegs(ExecContext *xc)
     }
 
     // Copy the misc regs.
-    cpu->regFile.miscRegs[tid].copyMiscRegs(xc);
+    TheISA::copyMiscRegs(xc, this);
 
     // Then finally set the PC and the next PC.
     cpu->setPC(xc->readPC(), tid);
diff --git a/cpu/o3/alpha_params.hh b/cpu/o3/alpha_params.hh
index f0836a9fd..4ab130d02 100644
--- a/cpu/o3/alpha_params.hh
+++ b/cpu/o3/alpha_params.hh
@@ -106,12 +106,10 @@ class AlphaSimpleParams : public BaseFullCPU::Params
     unsigned commitToIEWDelay;
     unsigned renameToIEWDelay;
     unsigned issueToExecuteDelay;
+    unsigned dispatchWidth;
     unsigned issueWidth;
-    unsigned executeWidth;
-    unsigned executeIntWidth;
-    unsigned executeFloatWidth;
-    unsigned executeBranchWidth;
-    unsigned executeMemoryWidth;
+    unsigned wbWidth;
+    unsigned wbDepth;
     FUPool *fuPool;
 
     //
@@ -125,6 +123,12 @@ class AlphaSimpleParams : public BaseFullCPU::Params
     Tick fetchTrapLatency;
 
     //
+    // Timebuffer sizes
+    //
+    unsigned backComSize;
+    unsigned forwardComSize;
+
+    //
     // Branch predictor (BP, BTB, RAS)
     //
     std::string predType;
diff --git a/cpu/o3/commit.hh b/cpu/o3/commit.hh
index d93822394..b153effc4 100644
--- a/cpu/o3/commit.hh
+++ b/cpu/o3/commit.hh
@@ -160,10 +160,6 @@ class DefaultCommit
     /** Sets the pointer to the queue coming from IEW. */
     void setIEWQueue(TimeBuffer<IEWStruct> *iq_ptr);
 
-    void setFetchStage(Fetch *fetch_stage);
-
-    Fetch *fetchStage;
-
     /** Sets the pointer to the IEW stage. */
     void setIEWStage(IEW *iew_stage);
 
@@ -367,11 +363,6 @@ class DefaultCommit
      */
     unsigned renameWidth;
 
-    /** IEW width, in instructions.  Used so ROB knows how many
-     *  instructions to get from the IEW instruction queue.
-     */
-    unsigned iewWidth;
-
     /** Commit width, in instructions. */
     unsigned commitWidth;
 
@@ -392,10 +383,6 @@ class DefaultCommit
      */
     Tick trapLatency;
 
-    Tick fetchTrapLatency;
-
-    Tick fetchFaultTick;
-
     /** The commit PC of each thread.  Refers to the instruction that
      * is currently being processed/committed.
      */
diff --git a/cpu/o3/commit_impl.hh b/cpu/o3/commit_impl.hh
index 798f30294..364e685c2 100644
--- a/cpu/o3/commit_impl.hh
+++ b/cpu/o3/commit_impl.hh
@@ -71,12 +71,10 @@ DefaultCommit<Impl>::DefaultCommit(Params *params)
       renameToROBDelay(params->renameToROBDelay),
       fetchToCommitDelay(params->commitToFetchDelay),
       renameWidth(params->renameWidth),
-      iewWidth(params->executeWidth),
       commitWidth(params->commitWidth),
       numThreads(params->numberOfThreads),
       switchedOut(false),
-      trapLatency(params->trapLatency),
-      fetchTrapLatency(params->fetchTrapLatency)
+      trapLatency(params->trapLatency)
 {
     _status = Active;
     _nextStatus = Inactive;
@@ -114,10 +112,8 @@ DefaultCommit<Impl>::DefaultCommit(Params *params)
         changedROBNumEntries[i] = false;
         trapSquash[i] = false;
         xcSquash[i] = false;
+        PC[i] = nextPC[i] = 0;
     }
-
-    fetchFaultTick = 0;
-    fetchTrapWait = 0;
 }
 
 template <class Impl>
@@ -240,7 +236,6 @@ DefaultCommit<Impl>::setCPU(FullCPU *cpu_ptr)
     cpu->activateStage(FullCPU::CommitIdx);
 
     trapLatency = cpu->cycles(trapLatency);
-    fetchTrapLatency = cpu->cycles(fetchTrapLatency);
 }
 
 template <class Impl>
@@ -299,13 +294,6 @@ DefaultCommit<Impl>::setIEWQueue(TimeBuffer<IEWStruct> *iq_ptr)
 
 template <class Impl>
 void
-DefaultCommit<Impl>::setFetchStage(Fetch *fetch_stage)
-{
-    fetchStage = fetch_stage;
-}
-
-template <class Impl>
-void
 DefaultCommit<Impl>::setIEWStage(IEW *iew_stage)
 {
     iewStage = iew_stage;
@@ -431,7 +419,7 @@ DefaultCommit<Impl>::setNextStatus()
         }
     }
 
-    assert(squashes == squashCounter);
+    squashCounter = squashes;
 
     // If commit is currently squashing, then it will have activity for the
     // next cycle. Set its next status as active.
@@ -536,8 +524,6 @@ DefaultCommit<Impl>::squashFromTrap(unsigned tid)
 
     commitStatus[tid] = ROBSquashing;
     cpu->activityThisCycle();
-
-    ++squashCounter;
 }
 
 template <class Impl>
@@ -555,8 +541,6 @@ DefaultCommit<Impl>::squashFromXC(unsigned tid)
     cpu->activityThisCycle();
 
     xcSquash[tid] = false;
-
-    ++squashCounter;
 }
 
 template <class Impl>
@@ -571,6 +555,9 @@ DefaultCommit<Impl>::tick()
         return;
     }
 
+    if ((*activeThreads).empty()) // No active threads: nothing to do.
+        return;
+
     list<unsigned>::iterator threads = (*activeThreads).begin();
 
     // Check if any of the threads are done squashing.  Change the
@@ -582,10 +569,12 @@ DefaultCommit<Impl>::tick()
 
             if (rob->isDoneSquashing(tid)) {
                 commitStatus[tid] = Running;
-                --squashCounter;
             } else {
                 DPRINTF(Commit,"[tid:%u]: Still Squashing, cannot commit any"
-                        "insts this cycle.\n", tid);
+                        " insts this cycle.\n", tid);
+                rob->doSquash(tid);
+                toIEW->commitInfo[tid].robSquashing = true;
+                wroteToTimeBuffer = true;
             }
         }
     }
@@ -691,29 +680,7 @@ DefaultCommit<Impl>::commit()
 
     while (threads != (*activeThreads).end()) {
         unsigned tid = *threads++;
-/*
-        if (fromFetch->fetchFault && commitStatus[0] != TrapPending) {
-            // Record the fault.  Wait until it's empty in the ROB.
-            // Then handle the trap.  Ignore it if there's already a
-            // trap pending as fetch will be redirected.
-            fetchFault = fromFetch->fetchFault;
-            fetchFaultTick = curTick + fetchTrapLatency;
-            commitStatus[0] = FetchTrapPending;
-            DPRINTF(Commit, "Fault from fetch recorded.  Will trap if the "
-                    "ROB empties without squashing the fault.\n");
-            fetchTrapWait = 0;
-        }
 
-        // Fetch may tell commit to clear the trap if it's been squashed.
-        if (fromFetch->clearFetchFault) {
-            DPRINTF(Commit, "Received clear fetch fault signal\n");
-            fetchTrapWait = 0;
-            if (commitStatus[0] == FetchTrapPending) {
-                DPRINTF(Commit, "Clearing fault from fetch\n");
-                commitStatus[0] = Running;
-            }
-        }
-*/
         // Not sure which one takes priority.  I think if we have
         // both, that's a bad sign.
         if (trapSquash[tid] == true) {
@@ -741,8 +708,6 @@ DefaultCommit<Impl>::commit()
 
             commitStatus[tid] = ROBSquashing;
 
-            ++squashCounter;
-
             // If we want to include the squashing instruction in the squash,
             // then use one older sequence number.
             InstSeqNum squashed_inst = fromIEW->squashedSeqNum[tid];
@@ -944,7 +909,7 @@ DefaultCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num)
         // and committed this instruction.
         thread[tid]->funcExeInst--;
 
-        head_inst->reachedCommit = true;
+        head_inst->setAtCommit();
 
         if (head_inst->isNonSpeculative() ||
             head_inst->isStoreConditional() ||
@@ -1060,7 +1025,7 @@ DefaultCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num)
 
         // Generate trap squash event.
         generateTrapEvent(tid);
-
+//        warn("%lli fault (%d) handled @ PC %08p", curTick, inst_fault->name(), head_inst->readPC());
         return false;
 #else // !FULL_SYSTEM
         panic("fault (%d) detected @ PC %08p", inst_fault,
@@ -1083,6 +1048,9 @@ DefaultCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num)
                                  head_inst->renamedDestRegIdx(i));
     }
 
+    if (head_inst->isCopy())
+        panic("Should not commit any copy instructions!");
+
     // Finally clear the head ROB entry.
     rob->retireHead(tid);
 
diff --git a/cpu/o3/cpu.cc b/cpu/o3/cpu.cc
index 8d72bdc41..f1571e61b 100644
--- a/cpu/o3/cpu.cc
+++ b/cpu/o3/cpu.cc
@@ -108,12 +108,14 @@ FullO3CPU<Impl>::FullO3CPU(Params *params)
       // For now just have these time buffers be pretty big.
       // @todo: Make these time buffer sizes parameters or derived
       // from latencies
-      timeBuffer(5, 5),
-      fetchQueue(5, 5),
-      decodeQueue(5, 5),
-      renameQueue(5, 5),
-      iewQueue(5, 5),
-      activityRec(NumStages, 10, params->activity),
+      timeBuffer(params->backComSize, params->forwardComSize),
+      fetchQueue(params->backComSize, params->forwardComSize),
+      decodeQueue(params->backComSize, params->forwardComSize),
+      renameQueue(params->backComSize, params->forwardComSize),
+      iewQueue(params->backComSize, params->forwardComSize),
+      activityRec(NumStages,
+                  params->backComSize + params->forwardComSize,
+                  params->activity),
 
       globalSeqNum(1),
 
@@ -180,7 +182,6 @@ FullO3CPU<Impl>::FullO3CPU(Params *params)
     commit.setIEWQueue(&iewQueue);
     commit.setRenameQueue(&renameQueue);
 
-    commit.setFetchStage(&fetch);
     commit.setIEWStage(&iew);
     rename.setIEWStage(&iew);
     rename.setCommitStage(&commit);
@@ -709,7 +710,7 @@ void
 FullO3CPU<Impl>::takeOverFrom(BaseCPU *oldCPU)
 {
     // Flush out any old data from the time buffers.
-    for (int i = 0; i < 10; ++i) {
+    for (int i = 0; i < timeBuffer.getSize(); ++i) {
         timeBuffer.advance();
         fetchQueue.advance();
         decodeQueue.advance();
@@ -758,6 +759,46 @@ FullO3CPU<Impl>::takeOverFrom(BaseCPU *oldCPU)
         tickEvent.schedule(curTick);
 }
 
+/*
+template <class Impl>
+void
+FullO3CPU<Impl>::serialize(std::ostream &os)
+{
+    BaseCPU::serialize(os);
+    nameOut(os, csprintf("%s.tickEvent", name()));
+    tickEvent.serialize(os);
+
+    // Use SimpleThread's ability to checkpoint to make it easier to
+    // write out the registers.  Also make this static so it doesn't
+    // get instantiated multiple times (causes a panic in statistics).
+    static SimpleThread temp;
+
+    for (int i = 0; i < thread.size(); i++) {
+        nameOut(os, csprintf("%s.xc.%i", name(), i));
+        temp.copyXC(thread[i]->getXC());
+        temp.serialize(os);
+    }
+}
+
+template <class Impl>
+void
+FullO3CPU<Impl>::unserialize(Checkpoint *cp, const std::string &section)
+{
+    BaseCPU::unserialize(cp, section);
+    tickEvent.unserialize(cp, csprintf("%s.tickEvent", section));
+
+    // Use SimpleThread's ability to checkpoint to make it easier to
+    // read in the registers.  Also make this static so it doesn't
+    // get instantiated multiple times (causes a panic in statistics).
+    static SimpleThread temp;
+
+    for (int i = 0; i < thread.size(); i++) {
+        temp.copyXC(thread[i]->getXC());
+        temp.unserialize(cp, csprintf("%s.xc.%i", section, i));
+        thread[i]->getXC()->copyArchRegs(temp.getXC());
+    }
+}
+*/
 template <class Impl>
 uint64_t
 FullO3CPU<Impl>::readIntReg(int reg_idx)
@@ -866,7 +907,8 @@ template <class Impl>
 void
 FullO3CPU<Impl>::setArchFloatRegSingle(int reg_idx, float val, unsigned tid)
 {
-    PhysRegIndex phys_reg = commitRenameMap[tid].lookup(reg_idx);
+    int idx = reg_idx + TheISA::FP_Base_DepTag;
+    PhysRegIndex phys_reg = commitRenameMap[tid].lookup(idx);
 
     regFile.setFloatRegSingle(phys_reg, val);
 }
@@ -875,7 +917,8 @@ template <class Impl>
 void
 FullO3CPU<Impl>::setArchFloatRegDouble(int reg_idx, double val, unsigned tid)
 {
-    PhysRegIndex phys_reg = commitRenameMap[tid].lookup(reg_idx);
+    int idx = reg_idx + TheISA::FP_Base_DepTag;
+    PhysRegIndex phys_reg = commitRenameMap[tid].lookup(idx);
 
     regFile.setFloatRegDouble(phys_reg, val);
 }
@@ -884,7 +927,8 @@ template <class Impl>
 void
 FullO3CPU<Impl>::setArchFloatRegInt(int reg_idx, uint64_t val, unsigned tid)
 {
-    PhysRegIndex phys_reg = commitRenameMap[tid].lookup(reg_idx);
+    int idx = reg_idx + TheISA::FP_Base_DepTag;
+    PhysRegIndex phys_reg = commitRenameMap[tid].lookup(idx);
 
     regFile.setFloatRegInt(phys_reg, val);
 }
diff --git a/cpu/o3/cpu.hh b/cpu/o3/cpu.hh
index f4b19bfb3..ef5c9ae53 100644
--- a/cpu/o3/cpu.hh
+++ b/cpu/o3/cpu.hh
@@ -63,6 +63,12 @@ class BaseFullCPU : public BaseCPU
 
     void regStats();
 
+    /** Sets this CPU's ID by storing id in the cpu_id member. */
+    void setCpuId(int id) { cpu_id = id; }
+
+    /** Reads this CPU's ID, i.e. returns the current cpu_id value. */
+    int readCpuId() { return cpu_id; }
+
   protected:
     int cpu_id;
 };
diff --git a/cpu/o3/decode_impl.hh b/cpu/o3/decode_impl.hh
index 0b686375e..e1af4d423 100644
--- a/cpu/o3/decode_impl.hh
+++ b/cpu/o3/decode_impl.hh
@@ -278,7 +278,7 @@ DefaultDecode<Impl>::squash(DynInstPtr &inst, unsigned tid)
     toFetch->decodeInfo[tid].doneSeqNum = inst->seqNum;
     toFetch->decodeInfo[tid].predIncorrect = true;
     toFetch->decodeInfo[tid].squash = true;
-    toFetch->decodeInfo[tid].nextPC = inst->readNextPC();
+    toFetch->decodeInfo[tid].nextPC = inst->branchTarget();
     toFetch->decodeInfo[tid].branchTaken =
         inst->readNextPC() != (inst->readPC() + sizeof(TheISA::MachInst));
 
@@ -294,7 +294,7 @@ DefaultDecode<Impl>::squash(DynInstPtr &inst, unsigned tid)
     for (int i=0; i<fromFetch->size; i++) {
         if (fromFetch->insts[i]->threadNumber == tid &&
             fromFetch->insts[i]->seqNum > inst->seqNum) {
-            fromFetch->insts[i]->squashed = true;
+            fromFetch->insts[i]->setSquashed();
         }
     }
 
@@ -343,7 +343,7 @@ DefaultDecode<Impl>::squash(unsigned tid)
 
     for (int i=0; i<fromFetch->size; i++) {
         if (fromFetch->insts[i]->threadNumber == tid) {
-            fromFetch->insts[i]->squashed = true;
+            fromFetch->insts[i]->setSquashed();
             squash_count++;
         }
     }
@@ -721,9 +721,8 @@ DefaultDecode<Impl>::decodeInsts(unsigned tid)
         // Go ahead and compute any PC-relative branches.
         if (inst->isDirectCtrl() && inst->isUncondCtrl()) {
             ++decodeBranchResolved;
-            inst->setNextPC(inst->branchTarget());
 
-            if (inst->mispredicted()) {
+            if (inst->branchTarget() != inst->readPredTarg()) {
                 ++decodeBranchMispred;
 
                 // Might want to set some sort of boolean and just do
diff --git a/cpu/o3/fetch.hh b/cpu/o3/fetch.hh
index 92a87ab54..0bde56ce9 100644
--- a/cpu/o3/fetch.hh
+++ b/cpu/o3/fetch.hh
@@ -358,6 +358,12 @@ class DefaultFetch
     /** The cache line being fetched. */
     uint8_t *cacheData[Impl::MaxThreads];
 
+    /** The PC of the cacheline that has been loaded. */
+    Addr cacheDataPC[Impl::MaxThreads];
+
+    /** Whether or not the cache data is valid. */
+    bool cacheDataValid[Impl::MaxThreads];
+
     /** Size of instructions. */
     int instSize;
 
diff --git a/cpu/o3/fetch_impl.hh b/cpu/o3/fetch_impl.hh
index a309bd49a..cc09c4a41 100644
--- a/cpu/o3/fetch_impl.hh
+++ b/cpu/o3/fetch_impl.hh
@@ -138,6 +138,8 @@ DefaultFetch<Impl>::DefaultFetch(Params *params)
 
         // Create space to store a cache line.
         cacheData[tid] = new uint8_t[cacheBlkSize];
+        cacheDataPC[tid] = 0;
+        cacheDataValid[tid] = false;
 
         stalls[tid].decode = 0;
         stalls[tid].rename = 0;
@@ -334,6 +336,7 @@ DefaultFetch<Impl>::processCacheCompletion(MemReqPtr &req)
     // Wake up the CPU (if it went to sleep and was waiting on this completion
     // event).
     cpu->wakeCPU();
+    cacheDataValid[tid] = true;
 
     DPRINTF(Activity, "[tid:%u] Activating fetch due to cache completion\n",
             tid);
@@ -466,7 +469,7 @@ DefaultFetch<Impl>::fetchCacheLine(Addr fetch_PC, Fault &ret_fault, unsigned tid
     unsigned flags = 0;
 #endif // FULL_SYSTEM
 
-    if (interruptPending && flags == 0 || switchedOut) {
+    if (interruptPending && flags == 0) {
         // Hold off fetch from getting new instructions while an interrupt
         // is pending.
         return false;
@@ -475,6 +478,11 @@ DefaultFetch<Impl>::fetchCacheLine(Addr fetch_PC, Fault &ret_fault, unsigned tid
     // Align the fetch PC so it's at the start of a cache block.
     fetch_PC = icacheBlockAlignPC(fetch_PC);
 
+    // If we've already got the block, no need to try to fetch it again.
+    if (cacheDataValid[tid] && fetch_PC == cacheDataPC[tid]) {
+        return true;
+    }
+
     // Setup the memReq to do a read of the first instruction's address.
     // Set the appropriate read size and flags as well.
     memReq[tid] = new MemReq();
@@ -525,6 +533,9 @@ DefaultFetch<Impl>::fetchCacheLine(Addr fetch_PC, Fault &ret_fault, unsigned tid
 
             MemAccessResult result = icacheInterface->access(memReq[tid]);
 
+            cacheDataPC[tid] = fetch_PC;
+            cacheDataValid[tid] = false;
+
             fetchedCacheLines++;
 
             // If the cache missed, then schedule an event to wake
@@ -1002,8 +1013,8 @@ DefaultFetch<Impl>::fetch(bool &status_change)
             fetch_PC = next_PC;
 
             if (instruction->isQuiesce()) {
-                warn("%lli: Quiesce instruction encountered, halting fetch!",
-                     curTick);
+//                warn("%lli: Quiesce instruction encountered, halting fetch!",
+//                     curTick);
                 fetchStatus[tid] = QuiescePending;
                 ++numInst;
                 status_change = true;
@@ -1067,7 +1078,7 @@ DefaultFetch<Impl>::fetch(bool &status_change)
         fetchStatus[tid] = TrapPending;
         status_change = true;
 
-        warn("%lli fault (%d) detected @ PC %08p", curTick, fault, PC[tid]);
+//        warn("%lli fault (%d) detected @ PC %08p", curTick, fault, PC[tid]);
 #else // !FULL_SYSTEM
         fatal("fault (%d) detected @ PC %08p", fault, PC[tid]);
 #endif // FULL_SYSTEM
diff --git a/cpu/o3/iew.hh b/cpu/o3/iew.hh
index eda6a6bc0..d21c573fe 100644
--- a/cpu/o3/iew.hh
+++ b/cpu/o3/iew.hh
@@ -224,6 +224,47 @@ class DefaultIEW
     /** Returns if the LSQ has any stores to writeback. */
     bool hasStoresToWB() { return ldstQueue.hasStoresToWB(); }
 
+    void incrWb(InstSeqNum &sn) // Counts one more in-flight writeback.
+    {
+        if (++wbOutstanding == wbMax) // Reached the wbMax cap, so
+            ableToIssue = false;      // canIssue() now returns false.
+        DPRINTF(IEW, "wbOutstanding: %i\n", wbOutstanding);
+        assert(wbOutstanding <= wbMax); // Must never pass the cap.
+#ifdef DEBUG
+        wbList.insert(sn); // DEBUG builds also record the seq number.
+#endif
+    }
+
+    void decrWb(InstSeqNum &sn) // Removes one in-flight writeback.
+    {
+        if (wbOutstanding-- == wbMax) // Was at the cap before this
+            ableToIssue = true;       // decrement; canIssue() is true.
+        DPRINTF(IEW, "wbOutstanding: %i\n", wbOutstanding);
+        assert(wbOutstanding >= 0); // Count must not go negative.
+#ifdef DEBUG
+        assert(wbList.find(sn) != wbList.end()); // sn must be recorded.
+        wbList.erase(sn);
+#endif
+    }
+
+#ifdef DEBUG
+    std::set<InstSeqNum> wbList; // Seq numbers added by incrWb().
+
+    void dumpWb() // Prints every sequence number held in wbList.
+    {
+        std::set<InstSeqNum>::iterator wb_it = wbList.begin();
+        while (wb_it != wbList.end()) {
+            cprintf("[sn:%lli]\n", // One "[sn:N]" line per entry.
+                    (*wb_it));
+            wb_it++;
+        }
+    }
+#endif
+
+    bool canIssue() { return ableToIssue; } // Reads the ableToIssue flag.
+
+    bool ableToIssue; // Cleared by incrWb() at wbMax, set by decrWb().
+
   private:
     /** Sends commit proper information for a squash due to a branch
      * mispredict.
@@ -281,6 +322,9 @@ class DefaultIEW
     /** Processes inputs and changes state accordingly. */
     void checkSignalsAndUpdate(unsigned tid);
 
+    /** Removes instructions from rename from a thread's instruction list. */
+    void emptyRenameInsts(unsigned tid);
+
     /** Sorts instructions coming from rename into lists separated by thread. */
     void sortInsts();
 
@@ -401,20 +445,12 @@ class DefaultIEW
      */
     unsigned issueToExecuteDelay;
 
-    /** Width of issue's read path, in instructions.  The read path is both
-     *  the skid buffer and the rename instruction queue.
-     *  Note to self: is this really different than issueWidth?
-     */
-    unsigned issueReadWidth;
+    /** Width of dispatch, in instructions. */
+    unsigned dispatchWidth;
 
     /** Width of issue, in instructions. */
     unsigned issueWidth;
 
-    /** Width of execute, in instructions.  Might make more sense to break
-     *  down into FP vs int.
-     */
-    unsigned executeWidth;
-
     /** Index into queue of instructions being written back. */
     unsigned wbNumInst;
 
@@ -425,6 +461,17 @@ class DefaultIEW
      */
     unsigned wbCycle;
 
+    /** Number of instructions in flight that will writeback. */
+    int wbOutstanding;
+
+    /** Writeback width. */
+    unsigned wbWidth;
+
+    /** Writeback width * writeback depth, where writeback depth is
+     * the number of cycles of writing back instructions that can be
+     * buffered. */
+    unsigned wbMax;
+
     /** Number of active threads. */
     unsigned numThreads;
 
@@ -459,14 +506,6 @@ class DefaultIEW
     Stats::Scalar<> iewIQFullEvents;
     /** Stat for number of times the LSQ becomes full. */
     Stats::Scalar<> iewLSQFullEvents;
-    /** Stat for total number of executed instructions. */
-    Stats::Scalar<> iewExecutedInsts;
-    /** Stat for total number of executed load instructions. */
-    Stats::Vector<> iewExecLoadInsts;
-    /** Stat for total number of executed store instructions. */
-//    Stats::Scalar<> iewExecStoreInsts;
-    /** Stat for total number of squashed instructions skipped at execute. */
-    Stats::Scalar<> iewExecSquashedInsts;
     /** Stat for total number of memory ordering violation events. */
     Stats::Scalar<> memOrderViolationEvents;
     /** Stat for total number of incorrect predicted taken branches. */
@@ -476,28 +515,27 @@ class DefaultIEW
     /** Stat for total number of mispredicted branches detected at execute. */
     Stats::Formula branchMispredicts;
 
+    /** Stat for total number of executed instructions. */
+    Stats::Scalar<> iewExecutedInsts;
+    /** Stat for total number of executed load instructions. */
+    Stats::Vector<> iewExecLoadInsts;
+    /** Stat for total number of executed store instructions. */
+//    Stats::Scalar<> iewExecStoreInsts;
+    /** Stat for total number of squashed instructions skipped at execute. */
+    Stats::Scalar<> iewExecSquashedInsts;
     /** Number of executed software prefetches. */
-    Stats::Vector<> exeSwp;
+    Stats::Vector<> iewExecutedSwp;
     /** Number of executed nops. */
-    Stats::Vector<> exeNop;
+    Stats::Vector<> iewExecutedNop;
     /** Number of executed meomory references. */
-    Stats::Vector<> exeRefs;
+    Stats::Vector<> iewExecutedRefs;
     /** Number of executed branches. */
-    Stats::Vector<> exeBranches;
-
-//    Stats::Vector<> issued_ops;
-/*
-    Stats::Vector<> stat_fu_busy;
-    Stats::Vector2d<> stat_fuBusy;
-    Stats::Vector<> dist_unissued;
-    Stats::Vector2d<> stat_issued_inst_type;
-*/
-    /** Number of instructions issued per cycle. */
-    Stats::Formula issueRate;
+    Stats::Vector<> iewExecutedBranches;
     /** Number of executed store instructions. */
     Stats::Formula iewExecStoreInsts;
-//    Stats::Formula issue_op_rate;
-//    Stats::Formula fu_busy_rate;
+    /** Number of instructions executed per cycle. */
+    Stats::Formula iewExecRate;
+
     /** Number of instructions sent to commit. */
     Stats::Vector<> iewInstsToCommit;
     /** Number of instructions that writeback. */
@@ -510,7 +548,6 @@ class DefaultIEW
      * to resource contention.
      */
     Stats::Vector<> wbPenalized;
-
     /** Number of instructions per cycle written back. */
     Stats::Formula wbRate;
     /** Average number of woken instructions per writeback. */
diff --git a/cpu/o3/iew_impl.hh b/cpu/o3/iew_impl.hh
index 3ed20cb75..102be4f8d 100644
--- a/cpu/o3/iew_impl.hh
+++ b/cpu/o3/iew_impl.hh
@@ -56,9 +56,11 @@ DefaultIEW<Impl>::LdWritebackEvent::process()
     //iewStage->ldstQueue.removeMSHR(inst->threadNumber,inst->seqNum);
 
     if (iewStage->isSwitchedOut()) {
+        iewStage->decrWb(inst->seqNum);
         inst = NULL;
         return;
     } else if (inst->isSquashed()) {
+        iewStage->decrWb(inst->seqNum);
         iewStage->wakeCPU();
         inst = NULL;
         return;
@@ -93,16 +95,17 @@ DefaultIEW<Impl>::LdWritebackEvent::description()
 template<class Impl>
 DefaultIEW<Impl>::DefaultIEW(Params *params)
     : // @todo: Make this into a parameter.
-      issueToExecQueue(5, 5),
+      issueToExecQueue(params->backComSize, params->forwardComSize),
       instQueue(params),
       ldstQueue(params),
       fuPool(params->fuPool),
       commitToIEWDelay(params->commitToIEWDelay),
       renameToIEWDelay(params->renameToIEWDelay),
       issueToExecuteDelay(params->issueToExecuteDelay),
-      issueReadWidth(params->issueWidth),
+      dispatchWidth(params->dispatchWidth),
       issueWidth(params->issueWidth),
-      executeWidth(params->executeWidth),
+      wbOutstanding(0),
+      wbWidth(params->wbWidth),
       numThreads(params->numberOfThreads),
       switchedOut(false)
 {
@@ -125,8 +128,12 @@ DefaultIEW<Impl>::DefaultIEW(Params *params)
         fetchRedirect[i] = false;
     }
 
+    wbMax = wbWidth * params->wbDepth;
+
     updateLSQNextCycle = false;
 
+    ableToIssue = true;
+
     skidBufferMax = (3 * (renameToIEWDelay * params->renameWidth)) + issueWidth;
 }
 
@@ -144,6 +151,7 @@ DefaultIEW<Impl>::regStats()
     using namespace Stats;
 
     instQueue.regStats();
+    ldstQueue.regStats();
 
     iewIdleCycles
         .name(name() + ".iewIdleCycles")
@@ -189,20 +197,6 @@ DefaultIEW<Impl>::regStats()
         .name(name() + ".iewLSQFullEvents")
         .desc("Number of times the LSQ has become full, causing a stall");
 
-    iewExecutedInsts
-        .name(name() + ".iewExecutedInsts")
-        .desc("Number of executed instructions");
-
-    iewExecLoadInsts
-        .init(cpu->number_of_threads)
-        .name(name() + ".iewExecLoadInsts")
-        .desc("Number of load instructions executed")
-        .flags(total);
-
-    iewExecSquashedInsts
-        .name(name() + ".iewExecSquashedInsts")
-        .desc("Number of squashed instructions skipped in execute");
-
     memOrderViolationEvents
         .name(name() + ".memOrderViolationEvents")
         .desc("Number of memory order violations");
@@ -221,47 +215,49 @@ DefaultIEW<Impl>::regStats()
 
     branchMispredicts = predictedTakenIncorrect + predictedNotTakenIncorrect;
 
-    exeSwp
+    iewExecutedInsts
+        .name(name() + ".iewExecutedInsts")
+        .desc("Number of executed instructions");
+
+    iewExecLoadInsts
+        .init(cpu->number_of_threads)
+        .name(name() + ".iewExecLoadInsts")
+        .desc("Number of load instructions executed")
+        .flags(total);
+
+    iewExecSquashedInsts
+        .name(name() + ".iewExecSquashedInsts")
+        .desc("Number of squashed instructions skipped in execute");
+
+    iewExecutedSwp
         .init(cpu->number_of_threads)
         .name(name() + ".EXEC:swp")
         .desc("number of swp insts executed")
-        .flags(total)
-        ;
+        .flags(total);
 
-    exeNop
+    iewExecutedNop
         .init(cpu->number_of_threads)
         .name(name() + ".EXEC:nop")
         .desc("number of nop insts executed")
-        .flags(total)
-        ;
+        .flags(total);
 
-    exeRefs
+    iewExecutedRefs
         .init(cpu->number_of_threads)
         .name(name() + ".EXEC:refs")
         .desc("number of memory reference insts executed")
-        .flags(total)
-        ;
+        .flags(total);
 
-    exeBranches
+    iewExecutedBranches
         .init(cpu->number_of_threads)
         .name(name() + ".EXEC:branches")
         .desc("Number of branches executed")
-        .flags(total)
-        ;
-
-    issueRate
-        .name(name() + ".EXEC:rate")
-        .desc("Inst execution rate")
-        .flags(total)
-        ;
-    issueRate = iewExecutedInsts / cpu->numCycles;
+        .flags(total);
 
     iewExecStoreInsts
         .name(name() + ".EXEC:stores")
         .desc("Number of stores executed")
-        .flags(total)
-        ;
-    iewExecStoreInsts = exeRefs - iewExecLoadInsts;
+        .flags(total);
+    iewExecStoreInsts = iewExecutedRefs - iewExecLoadInsts;
 /*
     for (int i=0; i<Num_OpClasses; ++i) {
         stringstream subname;
@@ -277,58 +273,50 @@ DefaultIEW<Impl>::regStats()
         .init(cpu->number_of_threads)
         .name(name() + ".WB:sent")
         .desc("cumulative count of insts sent to commit")
-        .flags(total)
-        ;
+        .flags(total);
 
     writebackCount
         .init(cpu->number_of_threads)
         .name(name() + ".WB:count")
         .desc("cumulative count of insts written-back")
-        .flags(total)
-        ;
+        .flags(total);
 
     producerInst
         .init(cpu->number_of_threads)
         .name(name() + ".WB:producers")
         .desc("num instructions producing a value")
-        .flags(total)
-        ;
+        .flags(total);
 
     consumerInst
         .init(cpu->number_of_threads)
         .name(name() + ".WB:consumers")
         .desc("num instructions consuming a value")
-        .flags(total)
-        ;
+        .flags(total);
 
     wbPenalized
         .init(cpu->number_of_threads)
         .name(name() + ".WB:penalized")
         .desc("number of instrctions required to write to 'other' IQ")
-        .flags(total)
-        ;
+        .flags(total);
 
     wbPenalizedRate
         .name(name() + ".WB:penalized_rate")
         .desc ("fraction of instructions written-back that wrote to 'other' IQ")
-        .flags(total)
-        ;
+        .flags(total);
 
     wbPenalizedRate = wbPenalized / writebackCount;
 
     wbFanout
         .name(name() + ".WB:fanout")
         .desc("average fanout of values written-back")
-        .flags(total)
-        ;
+        .flags(total);
 
     wbFanout = producerInst / consumerInst;
 
     wbRate
         .name(name() + ".WB:rate")
         .desc("insts written-back per cycle")
-        .flags(total)
-        ;
+        .flags(total);
     wbRate = writebackCount / cpu->numCycles;
 }
 
@@ -481,8 +469,7 @@ DefaultIEW<Impl>::takeOverFrom()
 
     updateLSQNextCycle = false;
 
-    // @todo: Fix hardcoded number
-    for (int i = 0; i < 6; ++i) {
+    for (int i = 0; i < issueToExecQueue.getSize(); ++i) {
         issueToExecQueue.advance();
     }
 }
@@ -515,16 +502,7 @@ DefaultIEW<Impl>::squash(unsigned tid)
         skidBuffer[tid].pop();
     }
 
-    while (!insts[tid].empty()) {
-        if (insts[tid].front()->isLoad() ||
-            insts[tid].front()->isStore() ) {
-            toRename->iewInfo[tid].dispatchedToLSQ++;
-        }
-
-        toRename->iewInfo[tid].dispatched++;
-
-        insts[tid].pop();
-    }
+    emptyRenameInsts(tid);
 }
 
 template<class Impl>
@@ -650,14 +628,16 @@ DefaultIEW<Impl>::instToCommit(DynInstPtr &inst)
     // free slot.
     while ((*iewQueue)[wbCycle].insts[wbNumInst]) {
         ++wbNumInst;
-        if (wbNumInst == issueWidth) {
+        if (wbNumInst == wbWidth) {
             ++wbCycle;
             wbNumInst = 0;
         }
 
-        assert(wbCycle < 5);
+        assert((wbCycle * wbWidth + wbNumInst) <= wbMax);
     }
 
+    DPRINTF(IEW, "Current wb cycle: %i, width: %i, numInst: %i\nwbActual:%i\n",
+            wbCycle, wbWidth, wbNumInst, wbCycle * wbWidth + wbNumInst);
     // Add finished instruction to queue to commit.
     (*iewQueue)[wbCycle].insts[wbNumInst] = inst;
     (*iewQueue)[wbCycle].size++;
@@ -670,7 +650,7 @@ DefaultIEW<Impl>::validInstsFromRename()
     unsigned inst_count = 0;
 
     for (int i=0; i<fromRename->size; i++) {
-        if (!fromRename->insts[i]->squashed)
+        if (!fromRename->insts[i]->isSquashed())
             inst_count++;
     }
 
@@ -858,10 +838,12 @@ DefaultIEW<Impl>::checkSignalsAndUpdate(unsigned tid)
     }
 
     if (fromCommit->commitInfo[tid].robSquashing) {
-        DPRINTF(IEW, "[tid:%i]: ROB is still squashing.\n");
+        DPRINTF(IEW, "[tid:%i]: ROB is still squashing.\n", tid);
 
         dispatchStatus[tid] = Squashing;
 
+        emptyRenameInsts(tid);
+        wroteToTimeBuffer = true;
         return;
     }
 
@@ -912,6 +894,22 @@ DefaultIEW<Impl>::sortInsts()
 
 template <class Impl>
 void
+DefaultIEW<Impl>::emptyRenameInsts(unsigned tid)
+{
+    // Drain insts[tid], counting each popped instruction in toRename's
+    // dispatched total (and in dispatchedToLSQ when it is a load or store).
+    for (; !insts[tid].empty(); insts[tid].pop()) {
+        const bool mem_inst = insts[tid].front()->isLoad() ||
+                              insts[tid].front()->isStore();
+        if (mem_inst)
+            toRename->iewInfo[tid].dispatchedToLSQ++;
+
+        toRename->iewInfo[tid].dispatched++;
+    }
+}
+
+template <class Impl>
+void
 DefaultIEW<Impl>::wakeCPU()
 {
     cpu->wakeCPU();
@@ -1010,7 +1008,7 @@ DefaultIEW<Impl>::dispatchInsts(unsigned tid)
     // Loop through the instructions, putting them in the instruction
     // queue.
     for ( ; dis_num_inst < insts_to_add &&
-              dis_num_inst < issueReadWidth;
+              dis_num_inst < dispatchWidth;
           ++dis_num_inst)
     {
         inst = insts_to_dispatch.front();
@@ -1149,7 +1147,7 @@ DefaultIEW<Impl>::dispatchInsts(unsigned tid)
 
             instQueue.recordProducer(inst);
 
-            exeNop[tid]++;
+            iewExecutedNop[tid]++;
 
             add_to_iq = false;
         } else if (inst->isExecuted()) {
@@ -1263,6 +1261,7 @@ DefaultIEW<Impl>::executeInsts()
 
             ++iewExecSquashedInsts;
 
+            decrWb(inst->seqNum);
             continue;
         }
 
@@ -1399,8 +1398,8 @@ DefaultIEW<Impl>::writebackInsts()
         DynInstPtr inst = toCommit->insts[inst_num];
         int tid = inst->threadNumber;
 
-        DPRINTF(IEW, "Sending instructions to commit, PC %#x.\n",
-                inst->readPC());
+        DPRINTF(IEW, "Sending instructions to commit, [sn:%lli] PC %#x.\n",
+                inst->seqNum, inst->readPC());
 
         iewInstsToCommit[tid]++;
 
@@ -1425,6 +1424,8 @@ DefaultIEW<Impl>::writebackInsts()
             }
             writebackCount[tid]++;
         }
+
+        decrWb(inst->seqNum);
     }
 }
 
@@ -1561,7 +1562,7 @@ DefaultIEW<Impl>::updateExeInstStats(DynInstPtr &inst)
     //
 #ifdef TARGET_ALPHA
     if (inst->isDataPrefetch())
-        exeSwp[thread_number]++;
+        iewExecutedSwp[thread_number]++;
     else
         iewExecutedInsts++;
 #else
@@ -1572,13 +1573,13 @@ DefaultIEW<Impl>::updateExeInstStats(DynInstPtr &inst)
     //  Control operations
     //
     if (inst->isControl())
-        exeBranches[thread_number]++;
+        iewExecutedBranches[thread_number]++;
 
     //
     //  Memory operations
     //
     if (inst->isMemRef()) {
-        exeRefs[thread_number]++;
+        iewExecutedRefs[thread_number]++;
 
         if (inst->isLoad()) {
             iewExecLoadInsts[thread_number]++;
diff --git a/cpu/o3/inst_queue.hh b/cpu/o3/inst_queue.hh
index 4802cbaf4..80cd71f0d 100644
--- a/cpu/o3/inst_queue.hh
+++ b/cpu/o3/inst_queue.hh
@@ -490,8 +490,6 @@ class InstructionQueue
 
     /** Number of instructions issued per cycle. */
     Stats::Formula issueRate;
-//    Stats::Formula issue_stores;
-//    Stats::Formula issue_op_rate;
     /** Number of times the FU was busy. */
     Stats::Vector<> fuBusy;
     /** Number of times the FU was busy per instruction issued. */
diff --git a/cpu/o3/inst_queue_impl.hh b/cpu/o3/inst_queue_impl.hh
index d677a259c..72cb0d708 100644
--- a/cpu/o3/inst_queue_impl.hh
+++ b/cpu/o3/inst_queue_impl.hh
@@ -288,22 +288,7 @@ InstructionQueue<Impl>::regStats()
         .flags(total)
         ;
     issueRate = iqInstsIssued / cpu->numCycles;
-/*
-    issue_stores
-        .name(name() + ".ISSUE:stores")
-        .desc("Number of stores issued")
-        .flags(total)
-        ;
-    issue_stores = exe_refs - exe_loads;
-*/
-/*
-    issue_op_rate
-        .name(name() + ".ISSUE:op_rate")
-        .desc("Operation issue rate")
-        .flags(total)
-        ;
-    issue_op_rate = issued_ops / numCycles;
-*/
+
     statFuBusy
         .init(Num_OpClasses)
         .name(name() + ".ISSUE:fu_full")
@@ -700,6 +685,7 @@ InstructionQueue<Impl>::scheduleReadyInsts()
     int total_issued = 0;
 
     while (total_issued < totalWidth &&
+           iewStage->canIssue() &&
            order_it != order_end_it) {
         OpClass op_class = (*order_it).queueType;
 
@@ -790,13 +776,14 @@ InstructionQueue<Impl>::scheduleReadyInsts()
                 // complete.
                 ++freeEntries;
                 count[tid]--;
-                issuing_inst->removeInIQ();
+                issuing_inst->clearInIQ();
             } else {
                 memDepUnit[tid].issue(issuing_inst);
             }
 
             listOrder.erase(order_it++);
             statIssuedInstType[tid][op_class]++;
+            iewStage->incrWb(issuing_inst->seqNum);
         } else {
             statFuBusy[op_class]++;
             fuBusy[tid]++;
@@ -1096,7 +1083,7 @@ InstructionQueue<Impl>::doSquash(unsigned tid)
             // inst will flow through the rest of the pipeline.
             squashed_inst->setIssued();
             squashed_inst->setCanCommit();
-            squashed_inst->removeInIQ();
+            squashed_inst->clearInIQ();
 
             //Update Thread IQ Count
             count[squashed_inst->threadNumber]--;
diff --git a/cpu/o3/lsq.hh b/cpu/o3/lsq.hh
index b321d4590..c67225bc0 100644
--- a/cpu/o3/lsq.hh
+++ b/cpu/o3/lsq.hh
@@ -62,6 +62,9 @@ class LSQ {
     /** Returns the name of the LSQ. */
     std::string name() const;
 
+    /** Registers the statistics for each LSQ Unit. */
+    void regStats();
+
     /** Sets the pointer to the list of active threads. */
     void setActiveThreads(std::list<unsigned> *at_ptr);
     /** Sets the CPU pointer. */
diff --git a/cpu/o3/lsq_impl.hh b/cpu/o3/lsq_impl.hh
index a6ad27522..a8a55af1a 100644
--- a/cpu/o3/lsq_impl.hh
+++ b/cpu/o3/lsq_impl.hh
@@ -106,6 +106,16 @@ LSQ<Impl>::name() const
 
 template<class Impl>
 void
+LSQ<Impl>::regStats()
+{
+    // Have each thread's LSQ unit register its own statistics.
+    for (int tid = 0; tid < numThreads; ++tid) {
+        thread[tid].regStats();
+    }
+}
+
+template<class Impl>
+void
 LSQ<Impl>::setActiveThreads(list<unsigned> *at_ptr)
 {
     activeThreads = at_ptr;
diff --git a/cpu/o3/lsq_unit.hh b/cpu/o3/lsq_unit.hh
index a6afff743..fe174a97d 100644
--- a/cpu/o3/lsq_unit.hh
+++ b/cpu/o3/lsq_unit.hh
@@ -101,6 +101,9 @@ class LSQUnit {
     /** Returns the name of the LSQ unit. */
     std::string name() const;
 
+    /** Registers statistics. */
+    void regStats();
+
     /** Sets the CPU pointer. */
     void setCPU(FullCPU *cpu_ptr)
     { cpu = cpu_ptr; }
@@ -153,9 +156,6 @@ class LSQUnit {
     /** Writes back stores. */
     void writebackStores();
 
-    // @todo: Include stats in the LSQ unit.
-    //void regStats();
-
     /** Clears all the entries in the LQ. */
     void clearLQ();
 
@@ -369,25 +369,34 @@ class LSQUnit {
     // Will also need how many read/write ports the Dcache has.  Or keep track
     // of that in stage that is one level up, and only call executeLoad/Store
     // the appropriate number of times.
-/*
-    // total number of loads forwaded from LSQ stores
-    Stats::Vector<> lsq_forw_loads;
+    /** Total number of loads forwarded from LSQ stores. */
+    Stats::Scalar<> lsqForwLoads;
+
+    /** Total number of loads ignored due to invalid addresses. */
+    Stats::Scalar<> invAddrLoads;
+
+    /** Total number of squashed loads. */
+    Stats::Scalar<> lsqSquashedLoads;
 
-    // total number of loads ignored due to invalid addresses
-    Stats::Vector<> inv_addr_loads;
+    /** Total number of responses from the memory system that are
+     * ignored due to the instruction already being squashed. */
+    Stats::Scalar<> lsqIgnoredResponses;
 
-    // total number of software prefetches ignored due to invalid addresses
-    Stats::Vector<> inv_addr_swpfs;
+    /** Total number of squashed stores. */
+    Stats::Scalar<> lsqSquashedStores;
 
-    // total non-speculative bogus addresses seen (debug var)
-    Counter sim_invalid_addrs;
-    Stats::Vector<> fu_busy;  //cumulative fu busy
+    /** Total number of software prefetches ignored due to invalid addresses. */
+    Stats::Scalar<> invAddrSwpfs;
 
-    // ready loads blocked due to memory disambiguation
-    Stats::Vector<> lsq_blocked_loads;
+    /** Ready loads blocked due to partial store-forwarding. */
+    Stats::Scalar<> lsqBlockedLoads;
+
+    /** Number of loads that were rescheduled. */
+    Stats::Scalar<> lsqRescheduledLoads;
+
+    /** Number of times the LSQ is blocked due to the cache. */
+    Stats::Scalar<> lsqCacheBlocked;
 
-    Stats::Scalar<> lsqInversion;
-*/
   public:
     /** Executes the load at the given index. */
     template <class T>
@@ -441,8 +450,9 @@ LSQUnit<Impl>::read(MemReqPtr &req, T &data, int load_idx)
     // at the head of the LSQ and are ready to commit (at the head of the ROB
     // too).
     if (req->flags & UNCACHEABLE &&
-        (load_idx != loadHead || !loadQueue[load_idx]->reachedCommit)) {
+        (load_idx != loadHead || !loadQueue[load_idx]->isAtCommit())) {
         iewStage->rescheduleMemInst(loadQueue[load_idx]);
+        ++lsqRescheduledLoads;
         return TheISA::genMachineCheckFault();
     }
 
@@ -552,6 +562,8 @@ LSQUnit<Impl>::read(MemReqPtr &req, T &data, int load_idx)
             // Tell IQ/mem dep unit that this instruction will need to be
             // rescheduled eventually
             iewStage->rescheduleMemInst(loadQueue[load_idx]);
+            iewStage->decrWb(loadQueue[load_idx]->seqNum);
+            ++lsqRescheduledLoads;
 
             // Do not generate a writeback event as this instruction is not
             // complete.
@@ -559,6 +571,7 @@ LSQUnit<Impl>::read(MemReqPtr &req, T &data, int load_idx)
                     "Store idx %i to load addr %#x\n",
                     store_idx, req->vaddr);
 
+            ++lsqBlockedLoads;
             return NoFault;
         }
     }
@@ -579,6 +592,10 @@ LSQUnit<Impl>::read(MemReqPtr &req, T &data, int load_idx)
     // if we have a cache, do cache access too
     if (fault == NoFault && dcacheInterface) {
         if (dcacheInterface->isBlocked()) {
+            ++lsqCacheBlocked;
+
+            iewStage->decrWb(inst->seqNum);
+
             // There's an older load that's already going to squash.
             if (isLoadBlocked && blockedLoadSeqNum < inst->seqNum)
                 return NoFault;
diff --git a/cpu/o3/lsq_unit_impl.hh b/cpu/o3/lsq_unit_impl.hh
index 4ee8bb234..5cc3078f8 100644
--- a/cpu/o3/lsq_unit_impl.hh
+++ b/cpu/o3/lsq_unit_impl.hh
@@ -126,6 +126,47 @@ LSQUnit<Impl>::name() const
 
 template<class Impl>
 void
+LSQUnit<Impl>::regStats()
+{
+    // Loads whose data was forwarded from a store.
+    lsqForwLoads.name(name() + ".forwLoads")
+        .desc("Number of loads that had data forwarded from stores");
+
+    // Loads ignored because their address was invalid.
+    invAddrLoads.name(name() + ".invAddrLoads")
+        .desc("Number of loads ignored due to an invalid address");
+
+    // Loads removed by a squash.
+    lsqSquashedLoads.name(name() + ".squashedLoads")
+        .desc("Number of loads squashed");
+
+    // Memory responses dropped because their instruction was squashed.
+    lsqIgnoredResponses.name(name() + ".ignoredResponses")
+        .desc("Number of memory responses ignored because the instruction is squashed");
+
+    // Stores removed by a squash.
+    lsqSquashedStores.name(name() + ".squashedStores")
+        .desc("Number of stores squashed");
+
+    // Software prefetches ignored because their address was invalid.
+    invAddrSwpfs.name(name() + ".invAddrSwpfs")
+        .desc("Number of software prefetches ignored due to an invalid address");
+
+    // Loads blocked by partial store forwarding.
+    lsqBlockedLoads.name(name() + ".blockedLoads")
+        .desc("Number of blocked loads due to partial load-store forwarding");
+
+    // Loads that had to be rescheduled.
+    lsqRescheduledLoads.name(name() + ".rescheduledLoads")
+        .desc("Number of loads that were rescheduled");
+
+    // Memory accesses that failed because the cache was blocked.
+    lsqCacheBlocked.name(name() + ".cacheBlocked")
+        .desc("Number of times an access to memory failed due to the cache being blocked");
+}
+
+template<class Impl>
+void
 LSQUnit<Impl>::clearLQ()
 {
     loadQueue.clear();
@@ -548,6 +589,7 @@ LSQUnit<Impl>::writebackStores()
         if (dcacheInterface && dcacheInterface->isBlocked()) {
             DPRINTF(LSQUnit, "Unable to write back any more stores, cache"
                     " is blocked!\n");
+            ++lsqCacheBlocked;
             break;
         }
 
@@ -705,7 +747,9 @@ LSQUnit<Impl>::squash(const InstSeqNum &squashed_num)
         }
 
         // Clear the smart pointer to make sure it is decremented.
-        loadQueue[load_idx]->squashed = true;
+        loadQueue[load_idx]->setSquashed();
         loadQueue[load_idx] = NULL;
         --loads;
+        // Count the squashed load; the stat is registered in regStats().
+        ++lsqSquashedLoads;
 
@@ -748,7 +792,7 @@ LSQUnit<Impl>::squash(const InstSeqNum &squashed_num)
         }
 
         // Clear the smart pointer to make sure it is decremented.
-        storeQueue[store_idx].inst->squashed = true;
+        storeQueue[store_idx].inst->setSquashed();
         storeQueue[store_idx].inst = NULL;
         storeQueue[store_idx].canWB = 0;
 
@@ -765,6 +809,7 @@ LSQUnit<Impl>::squash(const InstSeqNum &squashed_num)
         storeTail = store_idx;
 
         decrStIdx(store_idx);
+        ++lsqSquashedStores;
     }
 }
 
diff --git a/cpu/o3/mem_dep_unit.cc b/cpu/o3/mem_dep_unit.cc
index ccdd1a515..b0f91d44f 100644
--- a/cpu/o3/mem_dep_unit.cc
+++ b/cpu/o3/mem_dep_unit.cc
@@ -35,6 +35,7 @@
 // AlphaSimpleImpl.
 template class MemDepUnit<StoreSet, AlphaSimpleImpl>;
 
+#ifdef DEBUG
 template <>
 int
 MemDepUnit<StoreSet, AlphaSimpleImpl>::MemDepEntry::memdep_count = 0;
@@ -44,3 +45,4 @@ MemDepUnit<StoreSet, AlphaSimpleImpl>::MemDepEntry::memdep_insert = 0;
 template <>
 int
 MemDepUnit<StoreSet, AlphaSimpleImpl>::MemDepEntry::memdep_erase = 0;
+#endif
diff --git a/cpu/o3/mem_dep_unit_impl.hh b/cpu/o3/mem_dep_unit_impl.hh
index 595e9293f..bfe694bd8 100644
--- a/cpu/o3/mem_dep_unit_impl.hh
+++ b/cpu/o3/mem_dep_unit_impl.hh
@@ -59,7 +59,9 @@ MemDepUnit<MemDepPred, Impl>::~MemDepUnit()
         }
     }
 
+#ifdef DEBUG
     assert(MemDepEntry::memdep_count == 0);
+#endif
 }
 
 template <class MemDepPred, class Impl>
@@ -141,7 +143,9 @@ MemDepUnit<MemDepPred, Impl>::insert(DynInstPtr &inst)
     // Add the MemDepEntry to the hash.
     memDepHash.insert(
         std::pair<InstSeqNum, MemDepEntryPtr>(inst->seqNum, inst_entry));
+#ifdef DEBUG
     MemDepEntry::memdep_insert++;
+#endif
 
     instList[tid].push_back(inst);
 
@@ -227,7 +231,9 @@ MemDepUnit<MemDepPred, Impl>::insertNonSpec(DynInstPtr &inst)
     // Insert the MemDepEntry into the hash.
     memDepHash.insert(
         std::pair<InstSeqNum, MemDepEntryPtr>(inst->seqNum, inst_entry));
+#ifdef DEBUG
     MemDepEntry::memdep_insert++;
+#endif
 
     // Add the instruction to the list.
     instList[tid].push_back(inst);
@@ -275,7 +281,9 @@ MemDepUnit<MemDepPred, Impl>::insertBarrier(DynInstPtr &barr_inst)
     // Add the MemDepEntry to the hash.
     memDepHash.insert(
         std::pair<InstSeqNum, MemDepEntryPtr>(barr_sn, inst_entry));
+#ifdef DEBUG
     MemDepEntry::memdep_insert++;
+#endif
 
     // Add the instruction to the instruction list.
     instList[tid].push_back(barr_inst);
@@ -375,7 +383,9 @@ MemDepUnit<MemDepPred, Impl>::completed(DynInstPtr &inst)
     (*hash_it).second = NULL;
 
     memDepHash.erase(hash_it);
+#ifdef DEBUG
     MemDepEntry::memdep_erase++;
+#endif
 }
 
 template <class MemDepPred, class Impl>
@@ -470,7 +480,9 @@ MemDepUnit<MemDepPred, Impl>::squash(const InstSeqNum &squashed_num,
         (*hash_it).second = NULL;
 
         memDepHash.erase(hash_it);
+#ifdef DEBUG
         MemDepEntry::memdep_erase++;
+#endif
 
         instList[tid].erase(squash_it--);
     }
@@ -551,5 +563,7 @@ MemDepUnit<MemDepPred, Impl>::dumpLists()
 
     cprintf("Memory dependence hash size: %i\n", memDepHash.size());
 
+#ifdef DEBUG
     cprintf("Memory dependence entries: %i\n", MemDepEntry::memdep_count);
+#endif
 }
diff --git a/cpu/o3/regfile.hh b/cpu/o3/regfile.hh
index ed1238d36..76c43d3a1 100644
--- a/cpu/o3/regfile.hh
+++ b/cpu/o3/regfile.hh
@@ -223,10 +223,10 @@ class PhysRegFile
 
   public:
     /** (signed) integer register file. */
-    std::vector<IntReg> intRegFile;
+    IntReg *intRegFile;
 
     /** Floating point register file. */
-    std::vector<FloatReg> floatRegFile;
+    FloatReg *floatRegFile;
 
     /** Miscellaneous register file. */
     MiscRegFile miscRegs[Impl::MaxThreads];
@@ -256,11 +256,17 @@ PhysRegFile<Impl>::PhysRegFile(unsigned _numPhysicalIntRegs,
     : numPhysicalIntRegs(_numPhysicalIntRegs),
       numPhysicalFloatRegs(_numPhysicalFloatRegs)
 {
-    intRegFile.resize(numPhysicalIntRegs);
-    floatRegFile.resize(numPhysicalFloatRegs);
+    intRegFile = new IntReg[numPhysicalIntRegs];
+    floatRegFile = new FloatReg[numPhysicalFloatRegs];
 
-    //memset(intRegFile, 0, sizeof(*intRegFile));
-    //memset(floatRegFile, 0, sizeof(*floatRegFile));
+    for (int i = 0; i < Impl::MaxThreads; ++i) {
+        miscRegs[i].clear();
+    }
+
+    // Zero every physical register: sizeof(*ptr) would cover only the
+    // first element of each heap-allocated array.
+    memset(intRegFile, 0, sizeof(IntReg) * numPhysicalIntRegs);
+    memset(floatRegFile, 0, sizeof(FloatReg) * numPhysicalFloatRegs);
 }
 
 #endif
diff --git a/cpu/o3/rename_impl.hh b/cpu/o3/rename_impl.hh
index 829c99584..93f5b3504 100644
--- a/cpu/o3/rename_impl.hh
+++ b/cpu/o3/rename_impl.hh
@@ -348,7 +348,7 @@ DefaultRename<Impl>::squash(unsigned tid)
 
     for (int i=0; i<fromDecode->size; i++) {
         if (fromDecode->insts[i]->threadNumber == tid) {
-            fromDecode->insts[i]->squashed = true;
+            fromDecode->insts[i]->setSquashed();
             wroteToTimeBuffer = true;
             squashCount++;
         }
@@ -1029,7 +1029,7 @@ DefaultRename<Impl>::validInsts()
     unsigned inst_count = 0;
 
     for (int i=0; i<fromDecode->size; i++) {
-        if (!fromDecode->insts[i]->squashed)
+        if (!fromDecode->insts[i]->isSquashed())
             inst_count++;
     }
 
diff --git a/cpu/o3/rob.hh b/cpu/o3/rob.hh
index bdbdde32f..2043e0b34 100644
--- a/cpu/o3/rob.hh
+++ b/cpu/o3/rob.hh
@@ -305,7 +305,7 @@ class ROB
 
   private:
     /** The sequence number of the squashed instruction. */
-    InstSeqNum squashedSeqNum;
+    InstSeqNum squashedSeqNum[Impl::MaxThreads];
 
     /** Is the ROB done squashing. */
     bool doneSquashing[Impl::MaxThreads];
diff --git a/cpu/o3/rob_impl.hh b/cpu/o3/rob_impl.hh
index 25e0c80fd..62c4d9cf7 100644
--- a/cpu/o3/rob_impl.hh
+++ b/cpu/o3/rob_impl.hh
@@ -38,10 +38,10 @@ ROB<Impl>::ROB(unsigned _numEntries, unsigned _squashWidth,
     : numEntries(_numEntries),
       squashWidth(_squashWidth),
       numInstsInROB(0),
-      squashedSeqNum(0),
       numThreads(_numThreads)
 {
     for (int tid=0; tid  < numThreads; tid++) {
+        squashedSeqNum[tid] = 0;
         doneSquashing[tid] = true;
         threadEntries[tid] = 0;
     }
@@ -274,7 +274,7 @@ ROB<Impl>::retireHead(unsigned tid)
     --numInstsInROB;
     --threadEntries[tid];
 
-    head_inst->removeInROB();
+    head_inst->clearInROB();
     head_inst->setCommitted();
 
     instList[tid].erase(head_it);
@@ -349,11 +349,11 @@ void
 ROB<Impl>::doSquash(unsigned tid)
 {
     DPRINTF(ROB, "[tid:%u]: Squashing instructions until [sn:%i].\n",
-            tid, squashedSeqNum);
+            tid, squashedSeqNum[tid]);
 
     assert(squashIt[tid] != instList[tid].end());
 
-    if ((*squashIt[tid])->seqNum < squashedSeqNum) {
+    if ((*squashIt[tid])->seqNum < squashedSeqNum[tid]) {
         DPRINTF(ROB, "[tid:%u]: Done squashing instructions.\n",
                 tid);
 
@@ -368,7 +368,7 @@ ROB<Impl>::doSquash(unsigned tid)
     for (int numSquashed = 0;
          numSquashed < squashWidth &&
          squashIt[tid] != instList[tid].end() &&
-         (*squashIt[tid])->seqNum > squashedSeqNum;
+         (*squashIt[tid])->seqNum > squashedSeqNum[tid];
          ++numSquashed)
     {
         DPRINTF(ROB, "[tid:%u]: Squashing instruction PC %#x, seq num %i.\n",
@@ -405,7 +405,7 @@ ROB<Impl>::doSquash(unsigned tid)
 
 
     // Check if ROB is done squashing.
-    if ((*squashIt[tid])->seqNum <= squashedSeqNum) {
+    if ((*squashIt[tid])->seqNum <= squashedSeqNum[tid]) {
         DPRINTF(ROB, "[tid:%u]: Done squashing instructions.\n",
                 tid);
 
@@ -517,7 +517,7 @@ ROB<Impl>::squash(InstSeqNum squash_num,unsigned tid)
 
     doneSquashing[tid] = false;
 
-    squashedSeqNum = squash_num;
+    squashedSeqNum[tid] = squash_num;
 
     if (!instList[tid].empty()) {
         InstIt tail_thread = instList[tid].end();