From a8b03e4d017b66d7b5502a101ea5b7115827a107 Mon Sep 17 00:00:00 2001
From: Kevin Lim <ktlim@umich.edu>
Date: Sat, 22 Apr 2006 18:26:48 -0400
Subject: Updates for O3 model.

arch/alpha/isa/decoder.isa:
    Make IPR accessing instructions serializing so they are not issued incorrectly in the O3 model.
arch/alpha/isa/pal.isa:
    Allow IPR instructions to have flags.
base/traceflags.py:
    Include new trace flags from the two new CPU models.
cpu/SConscript:
    Create the templates for the split mem accessor methods.  Also include the new files from the new models (the Ozone model will be checked in next).
cpu/base_dyn_inst.cc:
cpu/base_dyn_inst.hh:
    Update to the BaseDynInst for the new models.

--HG--
extra : convert_revision : cc82db9c72ec3e29cea4c3fdff74a3843e287a35
---
 cpu/o3/2bit_local_pred.cc     |   21 +-
 cpu/o3/2bit_local_pred.hh     |   21 +-
 cpu/o3/alpha_cpu.hh           |  302 ++++++---
 cpu/o3/alpha_cpu_builder.cc   |  280 +++++----
 cpu/o3/alpha_cpu_impl.hh      |  682 ++++++++++++++++-----
 cpu/o3/alpha_dyn_inst.hh      |   70 ++-
 cpu/o3/alpha_dyn_inst_impl.hh |  102 +++-
 cpu/o3/alpha_impl.hh          |   17 +-
 cpu/o3/alpha_params.hh        |   58 +-
 cpu/o3/bpred_unit.cc          |    4 +
 cpu/o3/bpred_unit.hh          |  150 ++++-
 cpu/o3/bpred_unit_impl.hh     |  179 +++---
 cpu/o3/btb.cc                 |   26 +-
 cpu/o3/btb.hh                 |   63 +-
 cpu/o3/comm.hh                |   94 ++-
 cpu/o3/commit.cc              |    2 +-
 cpu/o3/commit.hh              |  283 ++++++++-
 cpu/o3/commit_impl.hh         | 1118 ++++++++++++++++++++++++++++-----
 cpu/o3/cpu.cc                 |  955 ++++++++++++++++++++++++-----
 cpu/o3/cpu.hh                 |  360 ++++++++---
 cpu/o3/cpu_policy.hh          |   33 +-
 cpu/o3/decode.cc              |    2 +-
 cpu/o3/decode.hh              |  168 ++++-
 cpu/o3/decode_impl.hh         |  659 ++++++++++++++------
 cpu/o3/fetch.cc               |    2 +-
 cpu/o3/fetch.hh               |  236 ++++++-
 cpu/o3/fetch_impl.hh          | 1053 ++++++++++++++++++++++++-------
 cpu/o3/free_list.cc           |   36 +-
 cpu/o3/free_list.hh           |   82 ++-
 cpu/o3/fu_pool.cc             |  281 +++++++++
 cpu/o3/fu_pool.hh             |  159 +++++
 cpu/o3/iew.cc                 |    2 +-
 cpu/o3/iew.hh                 |  320 ++++++++--
 cpu/o3/iew_impl.hh            | 1360 ++++++++++++++++++++++++++++++-----------
 cpu/o3/inst_queue.hh          |  350 ++++++++---
 cpu/o3/inst_queue_impl.hh     | 1156 +++++++++++++++++++++--------------
 cpu/o3/lsq.cc                 |   36 ++
 cpu/o3/lsq.hh                 |  307 ++++++++++
 cpu/o3/lsq_impl.hh            |  645 +++++++++++++++++++
 cpu/o3/lsq_unit.cc            |   36 ++
 cpu/o3/lsq_unit.hh            |  703 +++++++++++++++++++++
 cpu/o3/lsq_unit_impl.hh       |  893 +++++++++++++++++++++++++++
 cpu/o3/mem_dep_unit.cc        |   10 +
 cpu/o3/mem_dep_unit.hh        |  226 ++++---
 cpu/o3/mem_dep_unit_impl.hh   |  536 +++++++++-------
 cpu/o3/ras.cc                 |   18 +-
 cpu/o3/ras.hh                 |   35 +-
 cpu/o3/regfile.hh             |  119 ++--
 cpu/o3/rename.cc              |    2 +-
 cpu/o3/rename.hh              |  340 +++++++++--
 cpu/o3/rename_impl.hh         | 1332 +++++++++++++++++++++++++++-------------
 cpu/o3/rename_map.cc          |  210 +++----
 cpu/o3/rename_map.hh          |   61 +-
 cpu/o3/rob.hh                 |  214 ++++++-
 cpu/o3/rob_impl.hh            |  588 ++++++++++++++----
 cpu/o3/sat_counter.cc         |    6 +-
 cpu/o3/sat_counter.hh         |   10 +-
 cpu/o3/scoreboard.cc          |  105 ++++
 cpu/o3/scoreboard.hh          |  114 ++++
 cpu/o3/store_set.cc           |  149 +++--
 cpu/o3/store_set.hh           |   47 +-
 cpu/o3/thread_state.hh        |  143 +++++
 cpu/o3/tournament_pred.cc     |   47 +-
 cpu/o3/tournament_pred.hh     |   33 +-
 64 files changed, 13830 insertions(+), 3821 deletions(-)
 create mode 100644 cpu/o3/fu_pool.cc
 create mode 100644 cpu/o3/fu_pool.hh
 create mode 100644 cpu/o3/lsq.cc
 create mode 100644 cpu/o3/lsq.hh
 create mode 100644 cpu/o3/lsq_impl.hh
 create mode 100644 cpu/o3/lsq_unit.cc
 create mode 100644 cpu/o3/lsq_unit.hh
 create mode 100644 cpu/o3/lsq_unit_impl.hh
 create mode 100644 cpu/o3/scoreboard.cc
 create mode 100644 cpu/o3/scoreboard.hh
 create mode 100644 cpu/o3/thread_state.hh

(limited to 'cpu/o3')

diff --git a/cpu/o3/2bit_local_pred.cc b/cpu/o3/2bit_local_pred.cc
index d9744eec7..458fbd663 100644
--- a/cpu/o3/2bit_local_pred.cc
+++ b/cpu/o3/2bit_local_pred.cc
@@ -26,6 +26,7 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+#include "base/intmath.hh"
 #include "base/trace.hh"
 #include "cpu/o3/2bit_local_pred.hh"
 
@@ -36,17 +37,25 @@ DefaultBP::DefaultBP(unsigned _localPredictorSize,
       localCtrBits(_localCtrBits),
       instShiftAmt(_instShiftAmt)
 {
-    // Should do checks here to make sure sizes are correct (powers of 2).
+    if (!isPowerOf2(localPredictorSize)) {
+        fatal("Invalid local predictor size!\n");
+    }
+
+    localPredictorSets = localPredictorSize / localCtrBits;
+
+    if (!isPowerOf2(localPredictorSets)) {
+        fatal("Invalid number of local predictor sets! Check localCtrBits.\n");
+    }
 
     // Setup the index mask.
-    indexMask = localPredictorSize - 1;
+    indexMask = localPredictorSets - 1;
 
     DPRINTF(Fetch, "Branch predictor: index mask: %#x\n", indexMask);
 
     // Setup the array of counters for the local predictor.
-    localCtrs = new SatCounter[localPredictorSize];
+    localCtrs.resize(localPredictorSets);
 
-    for (int i = 0; i < localPredictorSize; ++i)
+    for (int i = 0; i < localPredictorSets; ++i)
         localCtrs[i].setBits(_localCtrBits);
 
     DPRINTF(Fetch, "Branch predictor: local predictor size: %i\n",
@@ -68,8 +77,6 @@ DefaultBP::lookup(Addr &branch_addr)
     DPRINTF(Fetch, "Branch predictor: Looking up index %#x\n",
             local_predictor_idx);
 
-    assert(local_predictor_idx < localPredictorSize);
-
     local_prediction = localCtrs[local_predictor_idx].read();
 
     DPRINTF(Fetch, "Branch predictor: prediction is %i.\n",
@@ -102,8 +109,6 @@ DefaultBP::update(Addr &branch_addr, bool taken)
     DPRINTF(Fetch, "Branch predictor: Looking up index %#x\n",
             local_predictor_idx);
 
-    assert(local_predictor_idx < localPredictorSize);
-
     if (taken) {
         DPRINTF(Fetch, "Branch predictor: Branch updated as taken.\n");
         localCtrs[local_predictor_idx].increment();
diff --git a/cpu/o3/2bit_local_pred.hh b/cpu/o3/2bit_local_pred.hh
index 97433e542..38d3f4842 100644
--- a/cpu/o3/2bit_local_pred.hh
+++ b/cpu/o3/2bit_local_pred.hh
@@ -26,18 +26,23 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __CPU_O3_CPU_2BIT_LOCAL_PRED_HH__
-#define __CPU_O3_CPU_2BIT_LOCAL_PRED_HH__
+#ifndef __CPU_O3_2BIT_LOCAL_PRED_HH__
+#define __CPU_O3_2BIT_LOCAL_PRED_HH__
 
 // For Addr type.
 #include "arch/isa_traits.hh"
 #include "cpu/o3/sat_counter.hh"
 
+#include <vector>
+
 class DefaultBP
 {
   public:
     /**
      * Default branch predictor constructor.
+     * @param localPredictorSize Size of the local predictor.
+     * @param localCtrBits Number of bits per counter.
+     * @param instShiftAmt Offset amount for instructions to ignore alignment.
      */
     DefaultBP(unsigned localPredictorSize, unsigned localCtrBits,
               unsigned instShiftAmt);
@@ -59,8 +64,11 @@ class DefaultBP
 
   private:
 
-    /** Returns the taken/not taken prediction given the value of the
+    /**
+     *  Returns the taken/not taken prediction given the value of the
      *  counter.
+     *  @param count The value of the counter.
+     *  @return The prediction based on the counter value.
      */
     inline bool getPrediction(uint8_t &count);
 
@@ -68,11 +76,14 @@ class DefaultBP
     inline unsigned getLocalIndex(Addr &PC);
 
     /** Array of counters that make up the local predictor. */
-    SatCounter *localCtrs;
+    std::vector<SatCounter> localCtrs;
 
     /** Size of the local predictor. */
     unsigned localPredictorSize;
 
+    /** Number of sets. */
+    unsigned localPredictorSets;
+
     /** Number of bits of the local predictor's counters. */
     unsigned localCtrBits;
 
@@ -83,4 +94,4 @@ class DefaultBP
     unsigned indexMask;
 };
 
-#endif // __CPU_O3_CPU_2BIT_LOCAL_PRED_HH__
+#endif // __CPU_O3_2BIT_LOCAL_PRED_HH__
diff --git a/cpu/o3/alpha_cpu.hh b/cpu/o3/alpha_cpu.hh
index 0352e9972..68e149e77 100644
--- a/cpu/o3/alpha_cpu.hh
+++ b/cpu/o3/alpha_cpu.hh
@@ -26,14 +26,12 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-// Todo: Find all the stuff in ExecContext and ev5 that needs to be
-// specifically designed for this CPU.
+#ifndef __CPU_O3_ALPHA_FULL_CPU_HH__
+#define __CPU_O3_ALPHA_FULL_CPU_HH__
 
-#ifndef __CPU_O3_CPU_ALPHA_FULL_CPU_HH__
-#define __CPU_O3_CPU_ALPHA_FULL_CPU_HH__
-
-#include "cpu/o3/cpu.hh"
 #include "arch/isa_traits.hh"
+#include "cpu/exec_context.hh"
+#include "cpu/o3/cpu.hh"
 #include "sim/byteswap.hh"
 
 template <class Impl>
@@ -46,17 +44,175 @@ class AlphaFullCPU : public FullO3CPU<Impl>
     typedef TheISA::MiscRegFile MiscRegFile;
 
   public:
+    typedef O3ThreadState<Impl> ImplState;
+    typedef O3ThreadState<Impl> Thread;
     typedef typename Impl::Params Params;
 
-  public:
-    AlphaFullCPU(Params &params);
+    /** Constructs an AlphaFullCPU with the given parameters. */
+    AlphaFullCPU(Params *params);
+
+    class AlphaXC : public ExecContext
+    {
+      public:
+        AlphaFullCPU<Impl> *cpu;
+
+        O3ThreadState<Impl> *thread;
+
+        Tick lastActivate;
+        Tick lastSuspend;
+
+        Event *quiesceEvent;
+
+        virtual BaseCPU *getCpuPtr() { return cpu; }
+
+        virtual void setCpuId(int id) { cpu->cpu_id = id; }
+
+        virtual int readCpuId() { return cpu->cpu_id; }
+
+        virtual FunctionalMemory *getMemPtr() { return thread->mem; }
+
+#if FULL_SYSTEM
+        virtual System *getSystemPtr() { return cpu->system; }
+
+        virtual PhysicalMemory *getPhysMemPtr() { return cpu->physmem; }
+
+        virtual AlphaITB *getITBPtr() { return cpu->itb; }
+
+        virtual AlphaDTB * getDTBPtr() { return cpu->dtb; }
+#else
+        virtual Process *getProcessPtr() { return thread->process; }
+#endif
+
+        virtual Status status() const { return thread->status(); }
+
+        virtual void setStatus(Status new_status) { thread->setStatus(new_status); }
+
+        /// Set the status to Active.  Optional delay indicates number of
+        /// cycles to wait before beginning execution.
+        virtual void activate(int delay = 1);
+
+        /// Set the status to Suspended.
+        virtual void suspend();
+
+        /// Set the status to Unallocated.
+        virtual void deallocate();
+
+        /// Set the status to Halted.
+        virtual void halt();
 
 #if FULL_SYSTEM
+        virtual void dumpFuncProfile();
+#endif
+
+        virtual void takeOverFrom(ExecContext *old_context);
+
+        virtual void regStats(const std::string &name);
+
+        virtual void serialize(std::ostream &os);
+        virtual void unserialize(Checkpoint *cp, const std::string &section);
+
+#if FULL_SYSTEM
+        virtual Event *getQuiesceEvent();
+
+        // Not necessarily the best location for these...
+        // Having an extra function just to read these is obnoxious
+        virtual Tick readLastActivate();
+        virtual Tick readLastSuspend();
+
+        virtual void profileClear();
+        virtual void profileSample();
+#endif
+
+        virtual int getThreadNum() { return thread->tid; }
+
+        // Also somewhat obnoxious.  Really only used for the TLB fault.
+        // However, may be quite useful in SPARC.
+        virtual TheISA::MachInst getInst();
+
+        virtual void copyArchRegs(ExecContext *xc);
+
+        virtual void clearArchRegs();
+
+        //
+        // New accessors for new decoder.
+        //
+        virtual uint64_t readIntReg(int reg_idx);
+
+        virtual float readFloatRegSingle(int reg_idx);
+
+        virtual double readFloatRegDouble(int reg_idx);
+
+        virtual uint64_t readFloatRegInt(int reg_idx);
+
+        virtual void setIntReg(int reg_idx, uint64_t val);
+
+        virtual void setFloatRegSingle(int reg_idx, float val);
+
+        virtual void setFloatRegDouble(int reg_idx, double val);
+
+        virtual void setFloatRegInt(int reg_idx, uint64_t val);
+
+        virtual uint64_t readPC()
+        { return cpu->readPC(thread->tid); }
+
+        virtual void setPC(uint64_t val);
+
+        virtual uint64_t readNextPC()
+        { return cpu->readNextPC(thread->tid); }
+
+        virtual void setNextPC(uint64_t val);
+
+        virtual MiscReg readMiscReg(int misc_reg)
+        { return cpu->readMiscReg(misc_reg, thread->tid); }
+
+        virtual MiscReg readMiscRegWithEffect(int misc_reg, Fault &fault)
+        { return cpu->readMiscRegWithEffect(misc_reg, fault, thread->tid); }
+
+        virtual Fault setMiscReg(int misc_reg, const MiscReg &val);
+
+        virtual Fault setMiscRegWithEffect(int misc_reg, const MiscReg &val);
+
+        // Also not necessarily the best location for these two.
+        // Hopefully will go away once we decide upon where st cond
+        // failures go.
+        virtual unsigned readStCondFailures() { return thread->storeCondFailures; }
+
+        virtual void setStCondFailures(unsigned sc_failures) { thread->storeCondFailures = sc_failures; }
+
+#if FULL_SYSTEM
+        virtual bool inPalMode() { return TheISA::PcPAL(cpu->readPC(thread->tid)); }
+#endif
+
+        // Only really makes sense for old CPU model.  Still could be useful though.
+        virtual bool misspeculating() { return false; }
+
+#if !FULL_SYSTEM
+        virtual IntReg getSyscallArg(int i);
+
+        // used to shift args for indirect syscall
+        virtual void setSyscallArg(int i, IntReg val);
+
+        virtual void setSyscallReturn(SyscallReturn return_value);
+
+        virtual void syscall() { return cpu->syscall(thread->tid); }
+
+        // Same with st cond failures.
+        virtual Counter readFuncExeInst() { return thread->funcExeInst; }
+#endif
+    };
+
+    friend class AlphaXC;
+
+    std::vector<AlphaXC *> xcProxies;
+
+#if FULL_SYSTEM
+    /** ITB pointer. */
     AlphaITB *itb;
+    /** DTB pointer. */
     AlphaDTB *dtb;
 #endif
 
-  public:
+    /** Registers statistics. */
     void regStats();
 
 #if FULL_SYSTEM
@@ -67,16 +223,19 @@ class AlphaFullCPU : public FullO3CPU<Impl>
 //    void clear_interrupt(int int_num, int index);
 //    void clear_interrupts();
 
+    /** Translates instruction request. */
     Fault translateInstReq(MemReqPtr &req)
     {
         return itb->translate(req);
     }
 
+    /** Translates data read request. */
     Fault translateDataReadReq(MemReqPtr &req)
     {
         return dtb->translate(req, false);
     }
 
+    /** Translates data write request. */
     Fault translateDataWriteReq(MemReqPtr &req)
     {
         return dtb->translate(req, true);
@@ -95,16 +254,19 @@ class AlphaFullCPU : public FullO3CPU<Impl>
         return NoFault;
     }
 
+    /** Translates instruction request in syscall emulation mode. */
     Fault translateInstReq(MemReqPtr &req)
     {
         return dummyTranslation(req);
     }
 
+    /** Translates data read request in syscall emulation mode. */
     Fault translateDataReadReq(MemReqPtr &req)
     {
         return dummyTranslation(req);
     }
 
+    /** Translates data write request in syscall emulation mode. */
     Fault translateDataWriteReq(MemReqPtr &req)
     {
         return dummyTranslation(req);
@@ -113,36 +275,36 @@ class AlphaFullCPU : public FullO3CPU<Impl>
 #endif
 
     // Later on may want to remove this misc stuff from the regfile and
-    // have it handled at this level.  Might prove to be an issue when
+    // have it handled at this level.  This would be similar to moving certain
+    // IPRs into the devices themselves.  Might prove to be an issue when
     // trying to rename source/destination registers...
-    MiscReg readMiscReg(int misc_reg)
-    {
-        // Dummy function for now.
-        // @todo: Fix this once reg file gets fixed.
-        return 0;
-    }
+    MiscReg readMiscReg(int misc_reg, unsigned tid);
 
-    Fault setMiscReg(int misc_reg, const MiscReg &val)
-    {
-        // Dummy function for now.
-        // @todo: Fix this once reg file gets fixed.
-        return NoFault;
-    }
+    MiscReg readMiscRegWithEffect(int misc_reg, Fault &fault, unsigned tid);
+
+    Fault setMiscReg(int misc_reg, const MiscReg &val, unsigned tid);
+
+    Fault setMiscRegWithEffect(int misc_reg, const MiscReg &val, unsigned tid);
+
+    void squashFromXC(unsigned tid);
 
-    // Most of the full system code and syscall emulation is not yet
-    // implemented.  These functions do show what the final interface will
-    // look like.
 #if FULL_SYSTEM
+    void post_interrupt(int int_num, int index);
+
     int readIntrFlag();
+    /** Sets the interrupt flags. */
     void setIntrFlag(int val);
-    Fault hwrei();
-    bool inPalMode() { return AlphaISA::PcPAL(this->regFile.readPC()); }
+    /** HW return from exception or interrupt. */
+    Fault hwrei(unsigned tid);
+    /** Returns if a specific PC is a PAL mode PC. */
     bool inPalMode(uint64_t PC)
     { return AlphaISA::PcPAL(PC); }
 
-    void trap(Fault fault);
+    /** Traps to handle given fault. */
+    void trap(Fault fault, unsigned tid);
     bool simPalCheck(int palFunc);
 
+    /** Processes any interrupts. */
     void processInterrupts();
 #endif
 
@@ -152,84 +314,64 @@ class AlphaFullCPU : public FullO3CPU<Impl>
     // register.  Actually, these functions should handle most of this
     // functionality by themselves; should look up the rename and then
     // set the register.
-    IntReg getSyscallArg(int i)
-    {
-        return this->cpuXC->readIntReg(AlphaISA::ArgumentReg0 + i);
-    }
+    /** Gets a syscall argument. */
+    IntReg getSyscallArg(int i, int tid);
 
-    // used to shift args for indirect syscall
-    void setSyscallArg(int i, IntReg val)
-    {
-        this->cpuXC->setIntReg(AlphaISA::ArgumentReg0 + i, val);
-    }
+    /** Used to shift args for indirect syscall. */
+    void setSyscallArg(int i, IntReg val, int tid);
 
-    void setSyscallReturn(int64_t return_value)
-    {
-        // check for error condition.  Alpha syscall convention is to
-        // indicate success/failure in reg a3 (r19) and put the
-        // return value itself in the standard return value reg (v0).
-        const int RegA3 = 19;	// only place this is used
-        if (return_value >= 0) {
-            // no error
-            this->cpuXC->setIntReg(RegA3, 0);
-            this->cpuXC->setIntReg(AlphaISA::ReturnValueReg, return_value);
-        } else {
-            // got an error, return details
-            this->cpuXC->setIntReg(RegA3, (IntReg) -1);
-            this->cpuXC->setIntReg(AlphaISA::ReturnValueReg, -return_value);
-        }
-    }
+    /** Sets the return value of a syscall. */
+    void setSyscallReturn(SyscallReturn return_value, int tid);
 
-    void syscall(short thread_num);
-    void squashStages();
+    /** Executes a syscall.
+     * @todo: Determine if this needs to be virtual.
+     */
+    virtual void syscall(int thread_num);
 
 #endif
 
-    void copyToXC();
-    void copyFromXC();
-
   public:
 #if FULL_SYSTEM
-    bool palShadowEnabled;
-
-    // Not sure this is used anywhere.
-    void intr_post(RegFile *regs, Fault fault, Addr pc);
-    // Actually used within exec files.  Implement properly.
-    void swapPALShadow(bool use_shadow);
-    // Called by CPU constructor.  Can implement as I please.
-    void initCPU(RegFile *regs);
-    // Called by initCPU.  Implement as I please.
-    void initIPRs(RegFile *regs);
-
+    /** Halts the CPU. */
     void halt() { panic("Halt not implemented!\n"); }
 #endif
 
-
+    /** Old CPU read from memory function. No longer used. */
     template <class T>
     Fault read(MemReqPtr &req, T &data)
     {
+//	panic("CPU READ NOT IMPLEMENTED W/NEW MEMORY\n");
+#if 0
 #if FULL_SYSTEM && defined(TARGET_ALPHA)
         if (req->flags & LOCKED) {
             req->xc->setMiscReg(TheISA::Lock_Addr_DepTag, req->paddr);
             req->xc->setMiscReg(TheISA::Lock_Flag_DepTag, true);
         }
 #endif
-
+#endif
         Fault error;
+        if (req->flags & LOCKED) {
+            lockAddr = req->paddr;
+            lockFlag = true;
+        }
+
         error = this->mem->read(req, data);
         data = gtoh(data);
         return error;
     }
 
+    /** CPU read function, forwards read to LSQ. */
     template <class T>
     Fault read(MemReqPtr &req, T &data, int load_idx)
     {
         return this->iew.ldstQueue.read(req, data, load_idx);
     }
 
+    /** Old CPU write to memory function. No longer used. */
     template <class T>
     Fault write(MemReqPtr &req, T &data)
     {
+#if 0
 #if FULL_SYSTEM && defined(TARGET_ALPHA)
         ExecContext *xc;
 
@@ -276,16 +418,32 @@ class AlphaFullCPU : public FullO3CPU<Impl>
         }
 
 #endif
+#endif
+
+        if (req->flags & LOCKED) {
+            if (req->flags & UNCACHEABLE) {
+                req->result = 2;
+            } else {
+                if (this->lockFlag/* && this->lockAddr == req->paddr*/) {
+                    req->result=1;
+                } else {
+                    req->result = 0;
+                }
+            }
+        }
 
         return this->mem->write(req, (T)htog(data));
     }
 
+    /** CPU write function, forwards write to LSQ. */
     template <class T>
     Fault write(MemReqPtr &req, T &data, int store_idx)
     {
         return this->iew.ldstQueue.write(req, data, store_idx);
     }
 
+    Addr lockAddr;
+    bool lockFlag;
 };
 
-#endif // __CPU_O3_CPU_ALPHA_FULL_CPU_HH__
+#endif // __CPU_O3_ALPHA_FULL_CPU_HH__
diff --git a/cpu/o3/alpha_cpu_builder.cc b/cpu/o3/alpha_cpu_builder.cc
index 6025b8ef2..d676a69c1 100644
--- a/cpu/o3/alpha_cpu_builder.cc
+++ b/cpu/o3/alpha_cpu_builder.cc
@@ -26,39 +26,20 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#include "base/inifile.hh"
-#include "base/loader/symtab.hh"
-#include "base/misc.hh"
+#include <string>
+
 #include "cpu/base.hh"
-#include "cpu/exetrace.hh"
 #include "cpu/o3/alpha_cpu.hh"
 #include "cpu/o3/alpha_impl.hh"
-#include "mem/base_mem.hh"
+#include "cpu/o3/alpha_params.hh"
+#include "cpu/o3/fu_pool.hh"
 #include "mem/cache/base_cache.hh"
-#include "mem/mem_interface.hh"
 #include "sim/builder.hh"
-#include "sim/debug.hh"
-#include "sim/host.hh"
-#include "sim/process.hh"
-#include "sim/sim_events.hh"
-#include "sim/sim_object.hh"
-#include "sim/stats.hh"
-
-#if FULL_SYSTEM
-#include "base/remote_gdb.hh"
-#include "mem/functional/memory_control.hh"
-#include "mem/functional/physical.hh"
-#include "sim/system.hh"
-#include "arch/tlb.hh"
-#include "arch/vtophys.hh"
-#else // !FULL_SYSTEM
-#include "mem/functional/functional.hh"
-#endif // FULL_SYSTEM
 
 class DerivAlphaFullCPU : public AlphaFullCPU<AlphaSimpleImpl>
 {
   public:
-    DerivAlphaFullCPU(AlphaSimpleParams p)
+    DerivAlphaFullCPU(AlphaSimpleParams *p)
         : AlphaFullCPU<AlphaSimpleImpl>(p)
     { }
 };
@@ -75,7 +56,9 @@ SimObjectParam<AlphaITB *> itb;
 SimObjectParam<AlphaDTB *> dtb;
 #else
 SimObjectVectorParam<Process *> workload;
+//SimObjectParam<PageTable *> page_table;
 #endif // FULL_SYSTEM
+
 SimObjectParam<FunctionalMemory *> mem;
 
 Param<Counter> max_insts_any_thread;
@@ -86,6 +69,8 @@ Param<Counter> max_loads_all_threads;
 SimObjectParam<BaseCache *> icache;
 SimObjectParam<BaseCache *> dcache;
 
+Param<unsigned> cachePorts;
+
 Param<unsigned> decodeToFetchDelay;
 Param<unsigned> renameToFetchDelay;
 Param<unsigned> iewToFetchDelay;
@@ -112,25 +97,22 @@ Param<unsigned> executeIntWidth;
 Param<unsigned> executeFloatWidth;
 Param<unsigned> executeBranchWidth;
 Param<unsigned> executeMemoryWidth;
+SimObjectParam<FUPool *> fuPool;
 
 Param<unsigned> iewToCommitDelay;
 Param<unsigned> renameToROBDelay;
 Param<unsigned> commitWidth;
 Param<unsigned> squashWidth;
 
-#if 0
 Param<unsigned> localPredictorSize;
-Param<unsigned> localPredictorCtrBits;
-#endif
-Param<unsigned> local_predictor_size;
-Param<unsigned> local_ctr_bits;
-Param<unsigned> local_history_table_size;
-Param<unsigned> local_history_bits;
-Param<unsigned> global_predictor_size;
-Param<unsigned> global_ctr_bits;
-Param<unsigned> global_history_bits;
-Param<unsigned> choice_predictor_size;
-Param<unsigned> choice_ctr_bits;
+Param<unsigned> localCtrBits;
+Param<unsigned> localHistoryTableSize;
+Param<unsigned> localHistoryBits;
+Param<unsigned> globalPredictorSize;
+Param<unsigned> globalCtrBits;
+Param<unsigned> globalHistoryBits;
+Param<unsigned> choicePredictorSize;
+Param<unsigned> choiceCtrBits;
 
 Param<unsigned> BTBEntries;
 Param<unsigned> BTBTagSize;
@@ -147,6 +129,16 @@ Param<unsigned> numPhysFloatRegs;
 Param<unsigned> numIQEntries;
 Param<unsigned> numROBEntries;
 
+Param<unsigned> smtNumFetchingThreads;
+Param<std::string>   smtFetchPolicy;
+Param<std::string>   smtLSQPolicy;
+Param<unsigned> smtLSQThreshold;
+Param<std::string>   smtIQPolicy;
+Param<unsigned> smtIQThreshold;
+Param<std::string>   smtROBPolicy;
+Param<unsigned> smtROBThreshold;
+Param<std::string>   smtCommitPolicy;
+
 Param<unsigned> instShiftAmt;
 
 Param<bool> defer_registration;
@@ -168,6 +160,7 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivAlphaFullCPU)
     INIT_PARAM(dtb, "Data translation buffer"),
 #else
     INIT_PARAM(workload, "Processes to run"),
+//    INIT_PARAM(page_table, "Page table"),
 #endif // FULL_SYSTEM
 
     INIT_PARAM_DFLT(mem, "Memory", NULL),
@@ -190,13 +183,14 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivAlphaFullCPU)
     INIT_PARAM_DFLT(icache, "L1 instruction cache", NULL),
     INIT_PARAM_DFLT(dcache, "L1 data cache", NULL),
 
+    INIT_PARAM_DFLT(cachePorts, "Cache Ports", 200),
+
     INIT_PARAM(decodeToFetchDelay, "Decode to fetch delay"),
     INIT_PARAM(renameToFetchDelay, "Rename to fetch delay"),
     INIT_PARAM(iewToFetchDelay, "Issue/Execute/Writeback to fetch"
                "delay"),
     INIT_PARAM(commitToFetchDelay, "Commit to fetch delay"),
     INIT_PARAM(fetchWidth, "Fetch width"),
-
     INIT_PARAM(renameToDecodeDelay, "Rename to decode delay"),
     INIT_PARAM(iewToDecodeDelay, "Issue/Execute/Writeback to decode"
                "delay"),
@@ -222,6 +216,7 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivAlphaFullCPU)
     INIT_PARAM(executeFloatWidth, "Floating point execute width"),
     INIT_PARAM(executeBranchWidth, "Branch execute width"),
     INIT_PARAM(executeMemoryWidth, "Memory execute width"),
+    INIT_PARAM_DFLT(fuPool, "Functional unit pool", NULL),
 
     INIT_PARAM(iewToCommitDelay, "Issue/Execute/Writeback to commit "
                "delay"),
@@ -229,20 +224,15 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivAlphaFullCPU)
     INIT_PARAM(commitWidth, "Commit width"),
     INIT_PARAM(squashWidth, "Squash width"),
 
-#if 0
-    INIT_PARAM(localPredictorSize, "Size of the local predictor in entries. "
-               "Must be a power of 2."),
-    INIT_PARAM(localPredictorCtrBits, "Number of bits per counter for bpred"),
-#endif
-    INIT_PARAM(local_predictor_size, "Size of local predictor"),
-    INIT_PARAM(local_ctr_bits, "Bits per counter"),
-    INIT_PARAM(local_history_table_size, "Size of local history table"),
-    INIT_PARAM(local_history_bits, "Bits for the local history"),
-    INIT_PARAM(global_predictor_size, "Size of global predictor"),
-    INIT_PARAM(global_ctr_bits, "Bits per counter"),
-    INIT_PARAM(global_history_bits, "Bits of history"),
-    INIT_PARAM(choice_predictor_size, "Size of choice predictor"),
-    INIT_PARAM(choice_ctr_bits, "Bits of choice counters"),
+    INIT_PARAM(localPredictorSize, "Size of local predictor"),
+    INIT_PARAM(localCtrBits, "Bits per counter"),
+    INIT_PARAM(localHistoryTableSize, "Size of local history table"),
+    INIT_PARAM(localHistoryBits, "Bits for the local history"),
+    INIT_PARAM(globalPredictorSize, "Size of global predictor"),
+    INIT_PARAM(globalCtrBits, "Bits per counter"),
+    INIT_PARAM(globalHistoryBits, "Bits of history"),
+    INIT_PARAM(choicePredictorSize, "Size of choice predictor"),
+    INIT_PARAM(choiceCtrBits, "Bits of choice counters"),
 
     INIT_PARAM(BTBEntries, "Number of BTB entries"),
     INIT_PARAM(BTBTagSize, "Size of the BTB tags, in bits"),
@@ -260,6 +250,16 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivAlphaFullCPU)
     INIT_PARAM(numIQEntries, "Number of instruction queue entries"),
     INIT_PARAM(numROBEntries, "Number of reorder buffer entries"),
 
+    INIT_PARAM_DFLT(smtNumFetchingThreads, "SMT Number of Fetching Threads", 1),
+    INIT_PARAM_DFLT(smtFetchPolicy, "SMT Fetch Policy", "SingleThread"),
+    INIT_PARAM_DFLT(smtLSQPolicy,   "SMT LSQ Sharing Policy",    "Partitioned"),
+    INIT_PARAM_DFLT(smtLSQThreshold,"SMT LSQ Threshold", 100),
+    INIT_PARAM_DFLT(smtIQPolicy,    "SMT IQ Policy",    "Partitioned"),
+    INIT_PARAM_DFLT(smtIQThreshold, "SMT IQ Threshold", 100),
+    INIT_PARAM_DFLT(smtROBPolicy,   "SMT ROB Sharing Policy", "Partitioned"),
+    INIT_PARAM_DFLT(smtROBThreshold,"SMT ROB Threshold", 100),
+    INIT_PARAM_DFLT(smtCommitPolicy,"SMT Commit Fetch Policy", "RoundRobin"),
+
     INIT_PARAM(instShiftAmt, "Number of bits to shift instructions by"),
     INIT_PARAM(defer_registration, "defer system registration (for sampling)"),
 
@@ -287,101 +287,113 @@ CREATE_SIM_OBJECT(DerivAlphaFullCPU)
 
 #endif
 
-    AlphaSimpleParams params;
+    AlphaSimpleParams *params = new AlphaSimpleParams;
 
-    params.clock = clock;
+    params->clock = clock;
 
-    params.name = getInstanceName();
-    params.numberOfThreads = actual_num_threads;
+    params->name = getInstanceName();
+    params->numberOfThreads = actual_num_threads;
 
 #if FULL_SYSTEM
-    params.system = system;
-    params.cpu_id = cpu_id;
-    params.itb = itb;
-    params.dtb = dtb;
+    params->system = system;
+    params->cpu_id = cpu_id;
+    params->itb = itb;
+    params->dtb = dtb;
 #else
-    params.workload = workload;
+    params->workload = workload;
+    //@todo: change to pageTable
+//    params->pTable = page_table;
 #endif // FULL_SYSTEM
 
-    params.mem = mem;
+    params->mem = mem;
 
-    params.max_insts_any_thread = max_insts_any_thread;
-    params.max_insts_all_threads = max_insts_all_threads;
-    params.max_loads_any_thread = max_loads_any_thread;
-    params.max_loads_all_threads = max_loads_all_threads;
+    params->max_insts_any_thread = max_insts_any_thread;
+    params->max_insts_all_threads = max_insts_all_threads;
+    params->max_loads_any_thread = max_loads_any_thread;
+    params->max_loads_all_threads = max_loads_all_threads;
 
     //
     // Caches
     //
-    params.icacheInterface = icache ? icache->getInterface() : NULL;
-    params.dcacheInterface = dcache ? dcache->getInterface() : NULL;
-
-    params.decodeToFetchDelay = decodeToFetchDelay;
-    params.renameToFetchDelay = renameToFetchDelay;
-    params.iewToFetchDelay = iewToFetchDelay;
-    params.commitToFetchDelay = commitToFetchDelay;
-    params.fetchWidth = fetchWidth;
-
-    params.renameToDecodeDelay = renameToDecodeDelay;
-    params.iewToDecodeDelay = iewToDecodeDelay;
-    params.commitToDecodeDelay = commitToDecodeDelay;
-    params.fetchToDecodeDelay = fetchToDecodeDelay;
-    params.decodeWidth = decodeWidth;
-
-    params.iewToRenameDelay = iewToRenameDelay;
-    params.commitToRenameDelay = commitToRenameDelay;
-    params.decodeToRenameDelay = decodeToRenameDelay;
-    params.renameWidth = renameWidth;
-
-    params.commitToIEWDelay = commitToIEWDelay;
-    params.renameToIEWDelay = renameToIEWDelay;
-    params.issueToExecuteDelay = issueToExecuteDelay;
-    params.issueWidth = issueWidth;
-    params.executeWidth = executeWidth;
-    params.executeIntWidth = executeIntWidth;
-    params.executeFloatWidth = executeFloatWidth;
-    params.executeBranchWidth = executeBranchWidth;
-    params.executeMemoryWidth = executeMemoryWidth;
-
-    params.iewToCommitDelay = iewToCommitDelay;
-    params.renameToROBDelay = renameToROBDelay;
-    params.commitWidth = commitWidth;
-    params.squashWidth = squashWidth;
-#if 0
-    params.localPredictorSize = localPredictorSize;
-    params.localPredictorCtrBits = localPredictorCtrBits;
-#endif
-    params.local_predictor_size = local_predictor_size;
-    params.local_ctr_bits = local_ctr_bits;
-    params.local_history_table_size = local_history_table_size;
-    params.local_history_bits = local_history_bits;
-    params.global_predictor_size = global_predictor_size;
-    params.global_ctr_bits = global_ctr_bits;
-    params.global_history_bits = global_history_bits;
-    params.choice_predictor_size = choice_predictor_size;
-    params.choice_ctr_bits = choice_ctr_bits;
-
-    params.BTBEntries = BTBEntries;
-    params.BTBTagSize = BTBTagSize;
-
-    params.RASSize = RASSize;
-
-    params.LQEntries = LQEntries;
-    params.SQEntries = SQEntries;
-    params.SSITSize = SSITSize;
-    params.LFSTSize = LFSTSize;
-
-    params.numPhysIntRegs = numPhysIntRegs;
-    params.numPhysFloatRegs = numPhysFloatRegs;
-    params.numIQEntries = numIQEntries;
-    params.numROBEntries = numROBEntries;
-
-    params.instShiftAmt = 2;
-
-    params.defReg = defer_registration;
-
-    params.functionTrace = function_trace;
-    params.functionTraceStart = function_trace_start;
+    params->icacheInterface = icache ? icache->getInterface() : NULL;
+    params->dcacheInterface = dcache ? dcache->getInterface() : NULL;
+    params->cachePorts = cachePorts;
+
+    params->decodeToFetchDelay = decodeToFetchDelay;
+    params->renameToFetchDelay = renameToFetchDelay;
+    params->iewToFetchDelay = iewToFetchDelay;
+    params->commitToFetchDelay = commitToFetchDelay;
+    params->fetchWidth = fetchWidth;
+
+    params->renameToDecodeDelay = renameToDecodeDelay;
+    params->iewToDecodeDelay = iewToDecodeDelay;
+    params->commitToDecodeDelay = commitToDecodeDelay;
+    params->fetchToDecodeDelay = fetchToDecodeDelay;
+    params->decodeWidth = decodeWidth;
+
+    params->iewToRenameDelay = iewToRenameDelay;
+    params->commitToRenameDelay = commitToRenameDelay;
+    params->decodeToRenameDelay = decodeToRenameDelay;
+    params->renameWidth = renameWidth;
+
+    params->commitToIEWDelay = commitToIEWDelay;
+    params->renameToIEWDelay = renameToIEWDelay;
+    params->issueToExecuteDelay = issueToExecuteDelay;
+    params->issueWidth = issueWidth;
+    params->executeWidth = executeWidth;
+    params->executeIntWidth = executeIntWidth;
+    params->executeFloatWidth = executeFloatWidth;
+    params->executeBranchWidth = executeBranchWidth;
+    params->executeMemoryWidth = executeMemoryWidth;
+    params->fuPool = fuPool;
+
+    params->iewToCommitDelay = iewToCommitDelay;
+    params->renameToROBDelay = renameToROBDelay;
+    params->commitWidth = commitWidth;
+    params->squashWidth = squashWidth;
+
+
+    params->localPredictorSize = localPredictorSize;
+    params->localCtrBits = localCtrBits;
+    params->localHistoryTableSize = localHistoryTableSize;
+    params->localHistoryBits = localHistoryBits;
+    params->globalPredictorSize = globalPredictorSize;
+    params->globalCtrBits = globalCtrBits;
+    params->globalHistoryBits = globalHistoryBits;
+    params->choicePredictorSize = choicePredictorSize;
+    params->choiceCtrBits = choiceCtrBits;
+
+    params->BTBEntries = BTBEntries;
+    params->BTBTagSize = BTBTagSize;
+
+    params->RASSize = RASSize;
+
+    params->LQEntries = LQEntries;
+    params->SQEntries = SQEntries;
+
+    params->SSITSize = SSITSize;
+    params->LFSTSize = LFSTSize;
+
+    params->numPhysIntRegs = numPhysIntRegs;
+    params->numPhysFloatRegs = numPhysFloatRegs;
+    params->numIQEntries = numIQEntries;
+    params->numROBEntries = numROBEntries;
+
+    params->smtNumFetchingThreads = smtNumFetchingThreads;
+    params->smtFetchPolicy = smtFetchPolicy;
+    params->smtIQPolicy    = smtIQPolicy;
+    params->smtLSQPolicy    = smtLSQPolicy;
+    params->smtLSQThreshold = smtLSQThreshold;
+    params->smtROBPolicy   = smtROBPolicy;
+    params->smtROBThreshold = smtROBThreshold;
+    params->smtCommitPolicy = smtCommitPolicy;
+
+    params->instShiftAmt = 2;
+
+    params->deferRegistration = defer_registration;
+
+    params->functionTrace = function_trace;
+    params->functionTraceStart = function_trace_start;
 
     cpu = new DerivAlphaFullCPU(params);
 
diff --git a/cpu/o3/alpha_cpu_impl.hh b/cpu/o3/alpha_cpu_impl.hh
index 9f1fa24f6..86f7d9f28 100644
--- a/cpu/o3/alpha_cpu_impl.hh
+++ b/cpu/o3/alpha_cpu_impl.hh
@@ -30,6 +30,7 @@
 #include "base/cprintf.hh"
 #include "base/statistics.hh"
 #include "base/timebuf.hh"
+#include "cpu/quiesce_event.hh"
 #include "mem/cache/cache.hh" // for dynamic cast
 #include "mem/mem_interface.hh"
 #include "sim/builder.hh"
@@ -39,18 +40,79 @@
 #include "cpu/o3/alpha_cpu.hh"
 #include "cpu/o3/alpha_params.hh"
 #include "cpu/o3/comm.hh"
+#include "cpu/o3/thread_state.hh"
 
 #if FULL_SYSTEM
 #include "arch/alpha/osfpal.hh"
-#include "arch/alpha/isa_traits.hh"
+#include "arch/isa_traits.hh"
 #endif
 
+using namespace TheISA;
+
 template <class Impl>
-AlphaFullCPU<Impl>::AlphaFullCPU(Params &params)
+AlphaFullCPU<Impl>::AlphaFullCPU(Params *params)
+#if FULL_SYSTEM
+    : FullO3CPU<Impl>(params), itb(params->itb), dtb(params->dtb)
+#else
     : FullO3CPU<Impl>(params)
+#endif
 {
     DPRINTF(FullCPU, "AlphaFullCPU: Creating AlphaFullCPU object.\n");
 
+    this->thread.resize(this->numThreads);
+
+    for (int i = 0; i < this->numThreads; ++i) {
+#if FULL_SYSTEM
+        assert(i == 0);
+        this->thread[i] = new Thread(this, 0, params->mem);
+//        this->system->execContexts[i] = this->thread[i]->getXCProxy();
+        this->thread[i]->setStatus(ExecContext::Suspended);
+
+#else
+        if (i < params->workload.size()) {
+            DPRINTF(FullCPU, "FullCPU: Workload[%i]'s starting PC is %#x, "
+                    "process is %#x",
+                    i, params->workload[i]->prog_entry, this->thread[i]);
+            this->thread[i] = new Thread(this, i, params->workload[i], i);
+            assert(params->workload[i]->getMemory() != NULL);
+
+            this->thread[i]->setStatus(ExecContext::Suspended);
+            //usedTids[i] = true;
+            //threadMap[i] = i;
+        } else {
+            // Allocate an empty execution context so that M5 can use it
+            // later when scheduling threads onto this CPU.
+            Process* dummy_proc = NULL;
+
+            this->thread[i] = new Thread(this, i, dummy_proc, i);
+            //usedTids[i] = false;
+        }
+#endif // !FULL_SYSTEM
+
+        this->thread[i]->numInst = 0;
+
+        xcProxies.push_back(new AlphaXC);
+
+        xcProxies[i]->cpu = this;
+        xcProxies[i]->thread = this->thread[i];
+
+        xcProxies[i]->quiesceEvent = new EndQuiesceEvent(xcProxies[i]);
+        xcProxies[i]->lastActivate = 0;
+        xcProxies[i]->lastSuspend = 0;
+
+
+        this->thread[i]->xcProxy = xcProxies[i];
+
+        this->execContexts.push_back(this->thread[i]->getXCProxy());
+    }
+
+
+    for (int i=0; i < this->numThreads; i++) {
+        this->thread[i]->funcExeInst = 0;
+    }
+
+    // Sets CPU pointers. These must be set at this level because the CPU
+    // pointers are defined to be the highest level of CPU class.
     this->fetch.setCPU(this);
     this->decode.setCPU(this);
     this->rename.setCPU(this);
@@ -58,6 +120,10 @@ AlphaFullCPU<Impl>::AlphaFullCPU(Params &params)
     this->commit.setCPU(this);
 
     this->rob.setCPU(this);
+    this->regFile.setCPU(this);
+
+    lockAddr = 0;
+    lockFlag = false;
 }
 
 template <class Impl>
@@ -73,182 +139,436 @@ AlphaFullCPU<Impl>::regStats()
     this->commit.regStats();
 }
 
-#if !FULL_SYSTEM
+#if FULL_SYSTEM
+template <class Impl>
+void
+AlphaFullCPU<Impl>::AlphaXC::dumpFuncProfile()
+{
+}
+#endif
 
-// Will probably need to know which thread is calling syscall
-// Will need to pass that information in to the DynInst when it is constructed,
-// so that this call can be made with the proper thread number.
 template <class Impl>
 void
-AlphaFullCPU<Impl>::syscall(short thread_num)
+AlphaFullCPU<Impl>::AlphaXC::takeOverFrom(ExecContext *old_context)
 {
-    DPRINTF(FullCPU, "AlphaFullCPU: Syscall() called.\n\n");
+}
 
-    // Commit stage needs to run as well.
-    this->commit.tick();
+template <class Impl>
+void
+AlphaFullCPU<Impl>::AlphaXC::activate(int delay)
+{
+    DPRINTF(FullCPU, "Calling activate on AlphaXC\n");
+//    warn("Calling activate on AlphaXC");
+    if (thread->status() == ExecContext::Active)
+        return;
 
-    squashStages();
+    lastActivate = curTick;
 
-    // Temporarily increase this by one to account for the syscall
-    // instruction.
-    ++(this->funcExeInst);
+    if (thread->status() == ExecContext::Unallocated) {
+        cpu->activateWhenReady(thread->tid);
+        return;
+    }
 
-    // Copy over all important state to xc once all the unrolling is done.
-    copyToXC();
+    thread->setStatus(ExecContext::Active);
 
-    // This is hardcoded to thread 0 while the CPU is only single threaded.
-    this->thread[0]->syscall();
+    // status() == Suspended
+    cpu->activateContext(thread->tid, delay);
+}
 
-    // Copy over all important state back to CPU.
-    copyFromXC();
+template <class Impl>
+void
+AlphaFullCPU<Impl>::AlphaXC::suspend()
+{
+    DPRINTF(FullCPU, "Calling suspend on AlphaXC\n");
+//    warn("Calling suspend on AlphaXC");
+    if (thread->status() == ExecContext::Suspended)
+        return;
 
-    // Decrease funcExeInst by one as the normal commit will handle
-    // incrememnting it.
-    --(this->funcExeInst);
+    lastActivate = curTick;
+    lastSuspend = curTick;
+/*
+#if FULL_SYSTEM
+    // Don't change the status from active if there are pending interrupts
+    if (cpu->check_interrupts()) {
+        assert(status() == ExecContext::Active);
+        return;
+    }
+#endif
+*/
+    thread->setStatus(ExecContext::Suspended);
+    cpu->suspendContext(thread->tid);
 }
 
-// This is not a pretty function, and should only be used if it is necessary
-// to fake having everything squash all at once (ie for non-full system
-// syscalls).  Maybe put this at the FullCPU level?
 template <class Impl>
 void
-AlphaFullCPU<Impl>::squashStages()
+AlphaFullCPU<Impl>::AlphaXC::deallocate()
 {
-    InstSeqNum rob_head = this->rob.readHeadSeqNum();
+    DPRINTF(FullCPU, "Calling deallocate on AlphaXC\n");
+//    warn("Calling deallocate on AlphaXC");
+    if (thread->status() == ExecContext::Unallocated)
+        return;
 
-    // Now hack the time buffer to put this sequence number in the places
-    // where the stages might read it.
-    for (int i = 0; i < 5; ++i)
-    {
-        this->timeBuffer.access(-i)->commitInfo.doneSeqNum = rob_head;
-    }
+    thread->setStatus(ExecContext::Unallocated);
+    cpu->deallocateContext(thread->tid);
+}
 
-    this->fetch.squash(this->rob.readHeadNextPC());
-    this->fetchQueue.advance();
-
-    this->decode.squash();
-    this->decodeQueue.advance();
-
-    this->rename.squash();
-    this->renameQueue.advance();
-    this->renameQueue.advance();
-
-    // Be sure to advance the IEW queues so that the commit stage doesn't
-    // try to set an instruction as completed at the same time that it
-    // might be deleting it.
-    this->iew.squash();
-    this->iewQueue.advance();
-    this->iewQueue.advance();
-    // Needs to tell the LSQ to write back all of its data
-    this->iew.lsqWriteback();
-
-    this->rob.squash(rob_head);
-    this->commit.setSquashing();
-
-    // Now hack the time buffer to clear the sequence numbers in the places
-    // where the stages might read it.?
-    for (int i = 0; i < 5; ++i)
-    {
-        this->timeBuffer.access(-i)->commitInfo.doneSeqNum = 0;
-    }
+template <class Impl>
+void
+AlphaFullCPU<Impl>::AlphaXC::halt()
+{
+    DPRINTF(FullCPU, "Calling halt on AlphaXC\n");
+//    warn("Calling halt on AlphaXC");
+    if (thread->status() == ExecContext::Halted)
+        return;
 
+    thread->setStatus(ExecContext::Halted);
+    cpu->haltContext(thread->tid);
 }
 
-#endif // FULL_SYSTEM
+template <class Impl>
+void
+AlphaFullCPU<Impl>::AlphaXC::regStats(const std::string &name)
+{}
 
 template <class Impl>
 void
-AlphaFullCPU<Impl>::copyToXC()
-{
-    PhysRegIndex renamed_reg;
+AlphaFullCPU<Impl>::AlphaXC::serialize(std::ostream &os)
+{}
+template <class Impl>
+void
+AlphaFullCPU<Impl>::AlphaXC::unserialize(Checkpoint *cp, const std::string &section)
+{}
 
-    // First loop through the integer registers.
-    for (int i = 0; i < AlphaISA::NumIntRegs; ++i)
-    {
-        renamed_reg = this->renameMap.lookup(i);
-        this->cpuXC->setIntReg(i, this->regFile.readIntReg(renamed_reg));
-        DPRINTF(FullCPU, "FullCPU: Copying register %i, has data %lli.\n",
-                renamed_reg, this->regFile.intRegFile[renamed_reg]);
-    }
+#if FULL_SYSTEM
+template <class Impl>
+Event *
+AlphaFullCPU<Impl>::AlphaXC::getQuiesceEvent()
+{
+    return quiesceEvent;
+}
 
-    // Then loop through the floating point registers.
-    for (int i = 0; i < AlphaISA::NumFloatRegs; ++i)
-    {
-        renamed_reg = this->renameMap.lookup(i + AlphaISA::FP_Base_DepTag);
-        this->cpuXC->setFloatRegDouble(i,
-            this->regFile.readFloatRegDouble(renamed_reg));
-        this->cpuXC->setFloatRegInt(i,
-            this->regFile.readFloatRegInt(renamed_reg));
-    }
+template <class Impl>
+Tick
+AlphaFullCPU<Impl>::AlphaXC::readLastActivate()
+{
+    return lastActivate;
+}
 
-    this->cpuXC->setMiscReg(AlphaISA::Fpcr_DepTag,
-                            this->regFile.readMiscReg(AlphaISA::Fpcr_DepTag));
-    this->cpuXC->setMiscReg(AlphaISA::Uniq_DepTag,
-                            this->regFile.readMiscReg(AlphaISA::Uniq_DepTag));
-    this->cpuXC->setMiscReg(AlphaISA::Lock_Flag_DepTag,
-                            this->regFile.readMiscReg(AlphaISA::Lock_Flag_DepTag));
-    this->cpuXC->setMiscReg(AlphaISA::Lock_Addr_DepTag,
-                            this->regFile.readMiscReg(AlphaISA::Lock_Addr_DepTag));
+template <class Impl>
+Tick
+AlphaFullCPU<Impl>::AlphaXC::readLastSuspend()
+{
+    return lastSuspend;
+}
 
-    this->cpuXC->setPC(this->rob.readHeadPC());
-    this->cpuXC->setNextPC(this->cpuXC->readPC()+4);
+template <class Impl>
+void
+AlphaFullCPU<Impl>::AlphaXC::profileClear()
+{}
 
-#if !FULL_SYSTEM
-    this->cpuXC->setFuncExeInst(this->funcExeInst);
+template <class Impl>
+void
+AlphaFullCPU<Impl>::AlphaXC::profileSample()
+{}
 #endif
+
+template <class Impl>
+TheISA::MachInst
+AlphaFullCPU<Impl>::AlphaXC:: getInst()
+{
+    return thread->inst;
 }
 
-// This function will probably mess things up unless the ROB is empty and
-// there are no instructions in the pipeline.
 template <class Impl>
 void
-AlphaFullCPU<Impl>::copyFromXC()
+AlphaFullCPU<Impl>::AlphaXC::copyArchRegs(ExecContext *xc)
 {
+    // This function will mess things up unless the ROB is empty and
+    // there are no instructions in the pipeline.
+    unsigned tid = thread->tid;
     PhysRegIndex renamed_reg;
 
     // First loop through the integer registers.
-    for (int i = 0; i < AlphaISA::NumIntRegs; ++i)
-    {
-        renamed_reg = this->renameMap.lookup(i);
+    for (int i = 0; i < AlphaISA::NumIntRegs; ++i) {
+        renamed_reg = cpu->renameMap[tid].lookup(i);
 
         DPRINTF(FullCPU, "FullCPU: Copying over register %i, had data %lli, "
                 "now has data %lli.\n",
-                renamed_reg, this->regFile.intRegFile[renamed_reg],
-                this->cpuXC->readIntReg(i));
+                renamed_reg, cpu->readIntReg(renamed_reg),
+                xc->readIntReg(i));
 
-        this->regFile.setIntReg(renamed_reg, this->cpuXC->readIntReg(i));
+        cpu->setIntReg(renamed_reg, xc->readIntReg(i));
     }
 
     // Then loop through the floating point registers.
-    for (int i = 0; i < AlphaISA::NumFloatRegs; ++i)
-    {
-        renamed_reg = this->renameMap.lookup(i + AlphaISA::FP_Base_DepTag);
-        this->regFile.setFloatRegDouble(renamed_reg,
-                                        this->cpuXC->readFloatRegDouble(i));
-        this->regFile.setFloatRegInt(renamed_reg,
-                                     this->cpuXC->readFloatRegInt(i));
+    for (int i = 0; i < AlphaISA::NumFloatRegs; ++i) {
+        renamed_reg = cpu->renameMap[tid].lookup(i + AlphaISA::FP_Base_DepTag);
+        cpu->setFloatRegDouble(renamed_reg,
+                               xc->readFloatRegDouble(i));
+        cpu->setFloatRegInt(renamed_reg,
+                            xc->readFloatRegInt(i));
     }
 
-    // Then loop through the misc registers.
-    this->regFile.setMiscReg(AlphaISA::Fpcr_DepTag,
-                             this->cpuXC->readMiscReg(AlphaISA::Fpcr_DepTag));
-    this->regFile.setMiscReg(AlphaISA::Uniq_DepTag,
-                             this->cpuXC->readMiscReg(AlphaISA::Uniq_DepTag));
-    this->regFile.setMiscReg(AlphaISA::Lock_Flag_DepTag,
-                             this->cpuXC->readMiscReg(AlphaISA::Lock_Flag_DepTag));
-    this->regFile.setMiscReg(AlphaISA::Lock_Addr_DepTag,
-                             this->cpuXC->readMiscReg(AlphaISA::Lock_Addr_DepTag));
+    // Copy the misc regs.
+    cpu->regFile.miscRegs[tid].copyMiscRegs(xc);
 
     // Then finally set the PC and the next PC.
-//    regFile.pc = cpuXC->regs.pc;
-//    regFile.npc = cpuXC->regs.npc;
+    cpu->setPC(xc->readPC(), tid);
+    cpu->setNextPC(xc->readNextPC(), tid);
 #if !FULL_SYSTEM
-    this->funcExeInst = this->cpuXC->readFuncExeInst();
+    this->thread->funcExeInst = xc->readFuncExeInst();
 #endif
 }
 
+template <class Impl>
+void
+AlphaFullCPU<Impl>::AlphaXC::clearArchRegs()
+{}
+
+//
+// New accessors for new decoder.
+//
+template <class Impl>
+uint64_t
+AlphaFullCPU<Impl>::AlphaXC::readIntReg(int reg_idx)
+{
+    DPRINTF(Fault, "Reading int register through the XC!\n");
+    return cpu->readArchIntReg(reg_idx, thread->tid);
+}
+
+template <class Impl>
+float
+AlphaFullCPU<Impl>::AlphaXC::readFloatRegSingle(int reg_idx)
+{
+    DPRINTF(Fault, "Reading float register through the XC!\n");
+    return cpu->readArchFloatRegSingle(reg_idx, thread->tid);
+}
+
+template <class Impl>
+double
+AlphaFullCPU<Impl>::AlphaXC::readFloatRegDouble(int reg_idx)
+{
+    DPRINTF(Fault, "Reading float register through the XC!\n");
+    return cpu->readArchFloatRegDouble(reg_idx, thread->tid);
+}
+
+template <class Impl>
+uint64_t
+AlphaFullCPU<Impl>::AlphaXC::readFloatRegInt(int reg_idx)
+{
+    DPRINTF(Fault, "Reading floatint register through the XC!\n");
+    return cpu->readArchFloatRegInt(reg_idx, thread->tid);
+}
+
+template <class Impl>
+void
+AlphaFullCPU<Impl>::AlphaXC::setIntReg(int reg_idx, uint64_t val)
+{
+    DPRINTF(Fault, "Setting int register through the XC!\n");
+    cpu->setArchIntReg(reg_idx, val, thread->tid);
+
+    if (!thread->trapPending && !thread->inSyscall) {
+        cpu->squashFromXC(thread->tid);
+    }
+}
+
+template <class Impl>
+void
+AlphaFullCPU<Impl>::AlphaXC::setFloatRegSingle(int reg_idx, float val)
+{
+    DPRINTF(Fault, "Setting float register through the XC!\n");
+    cpu->setArchFloatRegSingle(reg_idx, val, thread->tid);
+
+    if (!thread->trapPending && !thread->inSyscall) {
+        cpu->squashFromXC(thread->tid);
+    }
+}
+
+template <class Impl>
+void
+AlphaFullCPU<Impl>::AlphaXC::setFloatRegDouble(int reg_idx, double val)
+{
+    DPRINTF(Fault, "Setting float register through the XC!\n");
+    cpu->setArchFloatRegDouble(reg_idx, val, thread->tid);
+
+    if (!thread->trapPending && !thread->inSyscall) {
+        cpu->squashFromXC(thread->tid);
+    }
+}
+
+template <class Impl>
+void
+AlphaFullCPU<Impl>::AlphaXC::setFloatRegInt(int reg_idx, uint64_t val)
+{
+    DPRINTF(Fault, "Setting floatint register through the XC!\n");
+    cpu->setArchFloatRegInt(reg_idx, val, thread->tid);
+
+    if (!thread->trapPending && !thread->inSyscall) {
+        cpu->squashFromXC(thread->tid);
+    }
+}
+
+template <class Impl>
+void
+AlphaFullCPU<Impl>::AlphaXC::setPC(uint64_t val)
+{
+    cpu->setPC(val, thread->tid);
+
+    if (!thread->trapPending && !thread->inSyscall) {
+        cpu->squashFromXC(thread->tid);
+    }
+}
+
+template <class Impl>
+void
+AlphaFullCPU<Impl>::AlphaXC::setNextPC(uint64_t val)
+{
+    cpu->setNextPC(val, thread->tid);
+
+    if (!thread->trapPending && !thread->inSyscall) {
+        cpu->squashFromXC(thread->tid);
+    }
+}
+
+template <class Impl>
+Fault
+AlphaFullCPU<Impl>::AlphaXC::setMiscReg(int misc_reg, const MiscReg &val)
+{
+    DPRINTF(Fault, "Setting misc register through the XC!\n");
+
+    Fault ret_fault = cpu->setMiscReg(misc_reg, val, thread->tid);
+
+    if (!thread->trapPending && !thread->inSyscall) {
+        cpu->squashFromXC(thread->tid);
+    }
+
+    return ret_fault;
+}
+
+template <class Impl>
+Fault
+AlphaFullCPU<Impl>::AlphaXC::setMiscRegWithEffect(int misc_reg, const MiscReg &val)
+{
+    DPRINTF(Fault, "Setting misc register through the XC!\n");
+
+    Fault ret_fault = cpu->setMiscRegWithEffect(misc_reg, val, thread->tid);
+
+    if (!thread->trapPending && !thread->inSyscall) {
+        cpu->squashFromXC(thread->tid);
+    }
+
+    return ret_fault;
+}
+
+#if !FULL_SYSTEM
+
+template <class Impl>
+TheISA::IntReg
+AlphaFullCPU<Impl>::AlphaXC::getSyscallArg(int i)
+{
+    return cpu->getSyscallArg(i, thread->tid);
+}
+
+template <class Impl>
+void
+AlphaFullCPU<Impl>::AlphaXC::setSyscallArg(int i, IntReg val)
+{
+    cpu->setSyscallArg(i, val, thread->tid);
+}
+
+template <class Impl>
+void
+AlphaFullCPU<Impl>::AlphaXC::setSyscallReturn(SyscallReturn return_value)
+{
+    cpu->setSyscallReturn(return_value, thread->tid);
+}
+
+template <class Impl>
+void
+AlphaFullCPU<Impl>::syscall(int tid)
+{
+    DPRINTF(FullCPU, "AlphaFullCPU: [tid:%i] Executing syscall().\n\n", tid);
+
+    DPRINTF(Activity,"Activity: syscall() called.\n");
+
+    // Temporarily increase this by one to account for the syscall
+    // instruction.
+    ++(this->thread[tid]->funcExeInst);
+
+    // Execute the actual syscall.
+    this->thread[tid]->syscall();
+
+    // Decrease funcExeInst by one as the normal commit will handle
+    // incrementing it.
+    --(this->thread[tid]->funcExeInst);
+}
+
+#endif // !FULL_SYSTEM
+
+template <class Impl>
+MiscReg
+AlphaFullCPU<Impl>::readMiscReg(int misc_reg, unsigned tid)
+{
+    return this->regFile.readMiscReg(misc_reg, tid);
+}
+
+template <class Impl>
+MiscReg
+AlphaFullCPU<Impl>::readMiscRegWithEffect(int misc_reg, Fault &fault,
+                                          unsigned tid)
+{
+    return this->regFile.readMiscRegWithEffect(misc_reg, fault, tid);
+}
+
+template <class Impl>
+Fault
+AlphaFullCPU<Impl>::setMiscReg(int misc_reg, const MiscReg &val, unsigned tid)
+{
+    // I think that these registers should always be set, regardless of what
+    // mode the thread is in.  The main difference is if the thread needs to
+    // squash as a result of the write, which is controlled by the AlphaXC.
+//    if (!this->thread[tid]->trapPending) {
+        return this->regFile.setMiscReg(misc_reg, val, tid);
+//    } else {
+//        return NoFault;
+//    }
+}
+
+template <class Impl>
+Fault
+AlphaFullCPU<Impl>::setMiscRegWithEffect(int misc_reg, const MiscReg &val,
+                                         unsigned tid)
+{
+//    if (!this->thread[tid]->trapPending) {
+        return this->regFile.setMiscRegWithEffect(misc_reg, val, tid);
+//    } else {
+//        return NoFault;
+//    }
+}
+
+template <class Impl>
+void
+AlphaFullCPU<Impl>::squashFromXC(unsigned tid)
+{
+//    this->thread[tid]->trapPending = true;
+    this->thread[tid]->inSyscall = true;
+    this->commit.generateXCEvent(tid);
+}
+
 #if FULL_SYSTEM
 
+template <class Impl>
+void
+AlphaFullCPU<Impl>::post_interrupt(int int_num, int index)
+{
+    BaseCPU::post_interrupt(int_num, index);
+
+    if (this->thread[0]->status() == ExecContext::Suspended) {
+        DPRINTF(IPI,"Suspended Processor awoke\n");
+        xcProxies[0]->activate();
+    }
+}
+
 template <class Impl>
 int
 AlphaFullCPU<Impl>::readIntrFlag()
@@ -263,23 +583,26 @@ AlphaFullCPU<Impl>::setIntrFlag(int val)
     this->regFile.setIntrFlag(val);
 }
 
-// Can force commit stage to squash and stuff.
 template <class Impl>
 Fault
-AlphaFullCPU<Impl>::hwrei()
+AlphaFullCPU<Impl>::hwrei(unsigned tid)
 {
-    if (!inPalMode())
+#if 0
+    if (!inPalMode(this->readPC(tid)))
         return new AlphaISA::UnimplementedOpcodeFault;
 
-    this->setNextPC(this->regFile.miscRegs.readReg(AlphaISA::IPR_EXC_ADDR));
+    setNextPC(cpu->readMiscReg(AlphaISA::IPR_EXC_ADDR, tid), tid);
 
-//    kernelStats.hwrei();
+    cpu->kernelStats->hwrei();
 
-    if ((this->regFile.miscRegs.readReg(AlphaISA::IPR_EXC_ADDR) & 1) == 0)
+//    if ((this->regFile.miscRegs[tid].readReg(AlphaISA::IPR_EXC_ADDR) & 1) == 0)
 //        AlphaISA::swap_palshadow(&regs, false);
 
-    this->checkInterrupts = true;
-
+    cpu->checkInterrupts = true;
+#endif
+//    panic("Do not call this function!");
+    // Need to clear the lock flag upon returning from an interrupt.
+    this->lockFlag = false;
     // FIXME: XXX check for interrupts? XXX
     return NoFault;
 }
@@ -312,8 +635,10 @@ AlphaFullCPU<Impl>::simPalCheck(int palFunc)
 // stage.
 template <class Impl>
 void
-AlphaFullCPU<Impl>::trap(Fault fault)
+AlphaFullCPU<Impl>::trap(Fault fault, unsigned tid)
 {
+
+    fault->invoke(this->xcProxies[tid]);
 /*    // Keep in mind that a trap may be initiated by fetch if there's a TLB
     // miss
     uint64_t PC = this->commit.readCommitPC();
@@ -344,32 +669,93 @@ AlphaFullCPU<Impl>::trap(Fault fault)
         swapPALShadow(true);
 
     this->regFile.setPC(this->regFile.miscRegs.readReg(AlphaISA::IPR_PAL_BASE) +
-                         (dynamic_cast<AlphaFault *>(fault.get()))->vect());
-    this->regFile.setNextPC(PC + sizeof(MachInst));*/
+                         (dynamic_cast<AlphaFault *>(fault.get()))->vect(), 0);
+    this->regFile.setNextPC(PC + sizeof(MachInst), 0);*/
 }
 
 template <class Impl>
 void
 AlphaFullCPU<Impl>::processInterrupts()
 {
-    // Check for interrupts here.  For now can copy the code that exists
-    // within isa_fullsys_traits.hh.
+    // Check for interrupts here.  For now can copy the code that
+    // exists within isa_fullsys_traits.hh.  Also assume that thread 0
+    // is the one that handles the interrupts.
+
+    // Check whether there are any outstanding interrupts and, if so,
+    // handle them.
+    int ipl = 0;
+    int summary = 0;
+
+    this->checkInterrupts = false;
+
+    if (this->readMiscReg(IPR_ASTRR, 0))
+        panic("asynchronous traps not implemented\n");
+
+    if (this->readMiscReg(IPR_SIRR, 0)) {
+        for (int i = INTLEVEL_SOFTWARE_MIN;
+             i < INTLEVEL_SOFTWARE_MAX; i++) {
+            if (this->readMiscReg(IPR_SIRR, 0) & (ULL(1) << i)) {
+                // See table 4-19 of the 21164 hardware reference
+                ipl = (i - INTLEVEL_SOFTWARE_MIN) + 1;
+                summary |= (ULL(1) << i);
+            }
+        }
+    }
+
+    uint64_t interrupts = this->intr_status();
+
+    if (interrupts) {
+        for (int i = INTLEVEL_EXTERNAL_MIN;
+             i < INTLEVEL_EXTERNAL_MAX; i++) {
+            if (interrupts & (ULL(1) << i)) {
+                // See table 4-19 of the 21164 hardware reference
+                ipl = i;
+                summary |= (ULL(1) << i);
+            }
+        }
+    }
+
+    if (ipl && ipl > this->readMiscReg(IPR_IPLR, 0)) {
+        this->setMiscReg(IPR_ISR, summary, 0);
+        this->setMiscReg(IPR_INTID, ipl, 0);
+        this->trap(Fault(new InterruptFault), 0);
+        DPRINTF(Flow, "Interrupt! IPLR=%d ipl=%d summary=%x\n",
+                this->readMiscReg(IPR_IPLR, 0), ipl, summary);
+    }
+}
+
+#endif // FULL_SYSTEM
+
+#if !FULL_SYSTEM
+template <class Impl>
+TheISA::IntReg
+AlphaFullCPU<Impl>::getSyscallArg(int i, int tid)
+{
+    return this->readArchIntReg(AlphaISA::ArgumentReg0 + i, tid);
 }
 
-// swap_palshadow swaps in the values of the shadow registers and
-// swaps them with the values of the physical registers that map to the
-// same logical index.
 template <class Impl>
 void
-AlphaFullCPU<Impl>::swapPALShadow(bool use_shadow)
+AlphaFullCPU<Impl>::setSyscallArg(int i, IntReg val, int tid)
 {
-    if (palShadowEnabled == use_shadow)
-        panic("swap_palshadow: wrong PAL shadow state");
-
-    palShadowEnabled = use_shadow;
-
-    // Will have to lookup in rename map to get physical registers, then
-    // swap.
+    this->setArchIntReg(AlphaISA::ArgumentReg0 + i, val, tid);
 }
 
-#endif // FULL_SYSTEM
+template <class Impl>
+void
+AlphaFullCPU<Impl>::setSyscallReturn(SyscallReturn return_value, int tid)
+{
+    // check for error condition.  Alpha syscall convention is to
+    // indicate success/failure in reg a3 (r19) and put the
+    // return value itself in the standard return value reg (v0).
+    if (return_value.successful()) {
+        // no error
+        this->setArchIntReg(SyscallSuccessReg, 0, tid);
+        this->setArchIntReg(ReturnValueReg, return_value.value(), tid);
+    } else {
+        // got an error, return details
+        this->setArchIntReg(SyscallSuccessReg, (IntReg) -1, tid);
+        this->setArchIntReg(ReturnValueReg, -return_value.value(), tid);
+    }
+}
+#endif
diff --git a/cpu/o3/alpha_dyn_inst.hh b/cpu/o3/alpha_dyn_inst.hh
index e7f7d3a57..e0b73f17e 100644
--- a/cpu/o3/alpha_dyn_inst.hh
+++ b/cpu/o3/alpha_dyn_inst.hh
@@ -26,21 +26,24 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __CPU_O3_CPU_ALPHA_DYN_INST_HH__
-#define __CPU_O3_CPU_ALPHA_DYN_INST_HH__
+#ifndef __CPU_O3_ALPHA_DYN_INST_HH__
+#define __CPU_O3_ALPHA_DYN_INST_HH__
 
 #include "cpu/base_dyn_inst.hh"
+#include "cpu/inst_seq.hh"
 #include "cpu/o3/alpha_cpu.hh"
 #include "cpu/o3/alpha_impl.hh"
-#include "cpu/inst_seq.hh"
 
 /**
- * Mostly implementation specific AlphaDynInst.  It is templated in case there
- * are other implementations that are similar enough to be able to use this
- * class without changes.  This is mainly useful if there are multiple similar
- * CPU implementations of the same ISA.
+ * Mostly implementation & ISA specific AlphaDynInst. As with most other classes
+ * in the new CPU model, it is templated on the Impl to allow for passing in of
+ * all types, such as the CPU type and the ISA type. The AlphaDynInst serves
+ * as the primary interface to the CPU; it plays the role that the ExecContext
+ * does for the old CPU and the SimpleCPU. The goal is to abstract ExecContext
+ * purely into an interface, and have it forward calls to the appropriate
+ * CPU interface, which in the new CPU model's case would be this AlphaDynInst,
+ * or any other high level implementation specific DynInst.
  */
-
 template <class Impl>
 class AlphaDynInst : public BaseDynInst<Impl>
 {
@@ -50,6 +53,8 @@ class AlphaDynInst : public BaseDynInst<Impl>
 
     /** Binary machine instruction type. */
     typedef TheISA::MachInst MachInst;
+    /** Extended machine instruction type. */
+    typedef TheISA::ExtMachInst ExtMachInst;
     /** Logical register index type. */
     typedef TheISA::RegIndex RegIndex;
     /** Integer register index type. */
@@ -64,55 +69,60 @@ class AlphaDynInst : public BaseDynInst<Impl>
 
   public:
     /** BaseDynInst constructor given a binary instruction. */
-    AlphaDynInst(MachInst inst, Addr PC, Addr Pred_PC, InstSeqNum seq_num,
+    AlphaDynInst(ExtMachInst inst, Addr PC, Addr Pred_PC, InstSeqNum seq_num,
                  FullCPU *cpu);
 
     /** BaseDynInst constructor given a static inst pointer. */
     AlphaDynInst(StaticInstPtr &_staticInst);
 
     /** Executes the instruction.*/
-    Fault execute()
-    {
-        return this->fault = this->staticInst->execute(this, this->traceData);
-    }
+    Fault execute();
+
+    Fault initiateAcc();
+
+    Fault completeAcc();
+
+  private:
+    /** Initializes variables. */
+    void initVars();
 
   public:
     MiscReg readMiscReg(int misc_reg)
     {
-        // Dummy function for now.
-        // @todo: Fix this once reg file gets fixed.
-        return 0;
+        return this->cpu->readMiscReg(misc_reg, this->threadNumber);
     }
 
     MiscReg readMiscRegWithEffect(int misc_reg, Fault &fault)
     {
-        // Dummy function for now.
-        // @todo: Fix this once reg file gets fixed.
-        return 0;
+        return this->cpu->readMiscRegWithEffect(misc_reg, fault,
+                                                this->threadNumber);
     }
 
     Fault setMiscReg(int misc_reg, const MiscReg &val)
     {
-        // Dummy function for now.
-        // @todo: Fix this once reg file gets fixed.
-        return NoFault;
+        return this->cpu->setMiscReg(misc_reg, val, this->threadNumber);
     }
 
     Fault setMiscRegWithEffect(int misc_reg, const MiscReg &val)
     {
-        // Dummy function for now.
-        // @todo: Fix this once reg file gets fixed.
-        return NoFault;
+        return this->cpu->setMiscRegWithEffect(misc_reg, val,
+                                               this->threadNumber);
     }
 
 #if FULL_SYSTEM
+    /** Calls hardware return from exception or interrupt (hw_rei). */
     Fault hwrei();
+    /** Reads interrupt flag. */
     int readIntrFlag();
+    /** Sets interrupt flag. */
     void setIntrFlag(int val);
+    /** Checks if system is in PAL mode. */
     bool inPalMode();
+    /** Traps to handle specified fault. */
     void trap(Fault fault);
     bool simPalCheck(int palFunc);
 #else
+    /** Calls a syscall. */
     void syscall();
 #endif
 
@@ -237,16 +247,24 @@ class AlphaDynInst : public BaseDynInst<Impl>
     }
 
   public:
+    /** Calculates EA part of a memory instruction. Currently unused, though
+     * it may be useful in the future when memory instructions aren't
+     * executed with the EA calculation and the memory access being atomic.
+     */
     Fault calcEA()
     {
         return this->staticInst->eaCompInst()->execute(this, this->traceData);
     }
 
+    /** Does the memory access part of a memory instruction. Currently unused,
+     * though it may be useful in the future when memory instructions aren't
+     * executed with the EA calculation and the memory access being atomic.
+     */
     Fault memAccess()
     {
         return this->staticInst->memAccInst()->execute(this, this->traceData);
     }
 };
 
-#endif // __CPU_O3_CPU_ALPHA_DYN_INST_HH__
+#endif // __CPU_O3_ALPHA_DYN_INST_HH__
 
diff --git a/cpu/o3/alpha_dyn_inst_impl.hh b/cpu/o3/alpha_dyn_inst_impl.hh
index 96b7d3430..b5999f8d1 100644
--- a/cpu/o3/alpha_dyn_inst_impl.hh
+++ b/cpu/o3/alpha_dyn_inst_impl.hh
@@ -29,42 +29,88 @@
 #include "cpu/o3/alpha_dyn_inst.hh"
 
 template <class Impl>
-AlphaDynInst<Impl>::AlphaDynInst(MachInst inst, Addr PC, Addr Pred_PC,
+AlphaDynInst<Impl>::AlphaDynInst(ExtMachInst inst, Addr PC, Addr Pred_PC,
                                  InstSeqNum seq_num, FullCPU *cpu)
     : BaseDynInst<Impl>(inst, PC, Pred_PC, seq_num, cpu)
+{
+    initVars();
+}
+
+template <class Impl>
+AlphaDynInst<Impl>::AlphaDynInst(StaticInstPtr &_staticInst)
+    : BaseDynInst<Impl>(_staticInst)
+{
+    initVars();
+}
+
+template <class Impl>
+void
+AlphaDynInst<Impl>::initVars()
 {
     // Make sure to have the renamed register entries set to the same
     // as the normal register entries.  It will allow the IQ to work
     // without any modifications.
-    for (int i = 0; i < this->staticInst->numDestRegs(); i++)
-    {
+    for (int i = 0; i < this->staticInst->numDestRegs(); i++) {
         _destRegIdx[i] = this->staticInst->destRegIdx(i);
     }
 
-    for (int i = 0; i < this->staticInst->numSrcRegs(); i++)
-    {
+    for (int i = 0; i < this->staticInst->numSrcRegs(); i++) {
         _srcRegIdx[i] = this->staticInst->srcRegIdx(i);
         this->_readySrcRegIdx[i] = 0;
     }
+}
 
+template <class Impl>
+Fault
+AlphaDynInst<Impl>::execute()
+{
+    // @todo: Convoluted way to keep squashes from happening when the XC is
+    // used during execution (i.e. by instructions with side effects that go
+    // through the XC): force inSyscall on, then restore the saved value.  Fix.
+    bool in_syscall = this->thread->inSyscall;
+    this->thread->inSyscall = true;
+
+    this->fault = this->staticInst->execute(this, this->traceData);
+
+    this->thread->inSyscall = in_syscall;
+
+    return this->fault;
 }
 
 template <class Impl>
-AlphaDynInst<Impl>::AlphaDynInst(StaticInstPtr &_staticInst)
-    : BaseDynInst<Impl>(_staticInst)
+Fault
+AlphaDynInst<Impl>::initiateAcc()
 {
-    // Make sure to have the renamed register entries set to the same
-    // as the normal register entries.  It will allow the IQ to work
-    // without any modifications.
-    for (int i = 0; i < _staticInst->numDestRegs(); i++)
-    {
-        _destRegIdx[i] = _staticInst->destRegIdx(i);
-    }
+    // @todo: Convoluted way to keep squashes from happening when the XC is
+    // used while the access is initiated (i.e. by instructions with side
+    // effects that go through the XC): force inSyscall on, then restore it.
+    bool in_syscall = this->thread->inSyscall;
+    this->thread->inSyscall = true;
+
+    this->fault = this->staticInst->initiateAcc(this, this->traceData);
+
+    this->thread->inSyscall = in_syscall;
+
+    return this->fault;
+}
 
-    for (int i = 0; i < _staticInst->numSrcRegs(); i++)
-    {
-        _srcRegIdx[i] = _staticInst->srcRegIdx(i);
+template <class Impl>
+Fault
+AlphaDynInst<Impl>::completeAcc()
+{
+    if (this->isLoad()) {
+        this->fault = this->staticInst->completeAcc(this->req->data,
+                                                    this,
+                                                    this->traceData);
+    } else if (this->isStore()) {
+        this->fault = this->staticInst->completeAcc((uint8_t*)&this->req->result,
+                                                    this,
+                                                    this->traceData);
+    } else {
+        panic("Unknown type!");
     }
+
+    return this->fault;
 }
 
 #if FULL_SYSTEM
@@ -72,14 +118,28 @@ template <class Impl>
 Fault
 AlphaDynInst<Impl>::hwrei()
 {
-    return this->cpu->hwrei();
+    if (!this->cpu->inPalMode(this->readPC()))
+        return new AlphaISA::UnimplementedOpcodeFault;
+
+    this->setNextPC(this->cpu->readMiscReg(AlphaISA::IPR_EXC_ADDR,
+                                           this->threadNumber));
+
+    this->cpu->kernelStats->hwrei();
+
+    // Tell CPU to clear any state it needs to if a hwrei is taken.
+    this->cpu->hwrei(this->threadNumber);
+
+    this->cpu->checkInterrupts = true;
+
+    // FIXME: XXX check for interrupts? XXX
+    return NoFault;
 }
 
 template <class Impl>
 int
 AlphaDynInst<Impl>::readIntrFlag()
 {
-return this->cpu->readIntrFlag();
+    return this->cpu->readIntrFlag();
 }
 
 template <class Impl>
@@ -93,14 +153,14 @@ template <class Impl>
 bool
 AlphaDynInst<Impl>::inPalMode()
 {
-    return this->cpu->inPalMode();
+    return this->cpu->inPalMode(this->PC);
 }
 
 template <class Impl>
 void
 AlphaDynInst<Impl>::trap(Fault fault)
 {
-    this->cpu->trap(fault);
+    this->cpu->trap(fault, this->threadNumber);
 }
 
 template <class Impl>
diff --git a/cpu/o3/alpha_impl.hh b/cpu/o3/alpha_impl.hh
index 5e39fcb37..f404bd3ec 100644
--- a/cpu/o3/alpha_impl.hh
+++ b/cpu/o3/alpha_impl.hh
@@ -26,8 +26,8 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __CPU_O3_CPU_ALPHA_IMPL_HH__
-#define __CPU_O3_CPU_ALPHA_IMPL_HH__
+#ifndef __CPU_O3_ALPHA_IMPL_HH__
+#define __CPU_O3_ALPHA_IMPL_HH__
 
 #include "arch/alpha/isa_traits.hh"
 
@@ -41,7 +41,7 @@ class AlphaDynInst;
 template <class Impl>
 class AlphaFullCPU;
 
-/** Implementation specific struct that defines several key things to the
+/** Implementation specific struct that defines several key types to the
  *  CPU, the stages within the CPU, the time buffers, and the DynInst.
  *  The struct defines the ISA, the CPU policy, the specific DynInst, the
  *  specific FullCPU, and all of the structs from the time buffers to do
@@ -54,10 +54,10 @@ struct AlphaSimpleImpl
     /** The type of MachInst. */
     typedef TheISA::MachInst MachInst;
 
-    /** The CPU policy to be used (ie fetch, decode, etc.). */
+    /** The CPU policy to be used, which defines all of the CPU stages. */
     typedef SimpleCPUPolicy<AlphaSimpleImpl> CPUPol;
 
-    /** The DynInst to be used. */
+    /** The DynInst type to be used. */
     typedef AlphaDynInst<AlphaSimpleImpl> DynInst;
 
     /** The refcounted DynInst pointer to be used.  In most cases this is
@@ -65,15 +65,16 @@ struct AlphaSimpleImpl
      */
     typedef RefCountingPtr<DynInst> DynInstPtr;
 
-    /** The FullCPU to be used. */
+    /** The FullCPU type to be used. */
     typedef AlphaFullCPU<AlphaSimpleImpl> FullCPU;
 
     /** The Params to be passed to each stage. */
     typedef AlphaSimpleParams Params;
 
     enum {
-        MaxWidth = 8
+      MaxWidth = 8,
+      MaxThreads = 4
     };
 };
 
-#endif // __CPU_O3_CPU_ALPHA_IMPL_HH__
+#endif // __CPU_O3_ALPHA_IMPL_HH__
diff --git a/cpu/o3/alpha_params.hh b/cpu/o3/alpha_params.hh
index 79b0937e3..04b790815 100644
--- a/cpu/o3/alpha_params.hh
+++ b/cpu/o3/alpha_params.hh
@@ -26,18 +26,19 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __CPU_O3_CPU_ALPHA_SIMPLE_PARAMS_HH__
-#define __CPU_O3_CPU_ALPHA_SIMPLE_PARAMS_HH__
+#ifndef __CPU_O3_ALPHA_PARAMS_HH__
+#define __CPU_O3_ALPHA_PARAMS_HH__
 
 #include "cpu/o3/cpu.hh"
 
 //Forward declarations
-class System;
-class AlphaITB;
 class AlphaDTB;
+class AlphaITB;
+class FUPool;
 class FunctionalMemory;
-class Process;
 class MemInterface;
+class Process;
+class System;
 
 /**
  * This file defines the parameters that will be used for the AlphaFullCPU.
@@ -56,6 +57,9 @@ class AlphaSimpleParams : public BaseFullCPU::Params
     Process *process;
 #endif // FULL_SYSTEM
 
+    //Page Table
+//    PageTable *pTable;
+
     FunctionalMemory *mem;
 
     //
@@ -64,6 +68,8 @@ class AlphaSimpleParams : public BaseFullCPU::Params
     MemInterface *icacheInterface;
     MemInterface *dcacheInterface;
 
+    unsigned cachePorts;
+
     //
     // Fetch
     //
@@ -102,6 +108,7 @@ class AlphaSimpleParams : public BaseFullCPU::Params
     unsigned executeFloatWidth;
     unsigned executeBranchWidth;
     unsigned executeMemoryWidth;
+    FUPool *fuPool;
 
     //
     // Commit
@@ -114,20 +121,15 @@ class AlphaSimpleParams : public BaseFullCPU::Params
     //
     // Branch predictor (BP & BTB)
     //
-/*
     unsigned localPredictorSize;
-    unsigned localPredictorCtrBits;
-*/
-
-    unsigned local_predictor_size;
-    unsigned local_ctr_bits;
-    unsigned local_history_table_size;
-    unsigned local_history_bits;
-    unsigned global_predictor_size;
-    unsigned global_ctr_bits;
-    unsigned global_history_bits;
-    unsigned choice_predictor_size;
-    unsigned choice_ctr_bits;
+    unsigned localCtrBits;
+    unsigned localHistoryTableSize;
+    unsigned localHistoryBits;
+    unsigned globalPredictorSize;
+    unsigned globalCtrBits;
+    unsigned globalHistoryBits;
+    unsigned choicePredictorSize;
+    unsigned choiceCtrBits;
 
     unsigned BTBEntries;
     unsigned BTBTagSize;
@@ -154,10 +156,24 @@ class AlphaSimpleParams : public BaseFullCPU::Params
     unsigned numIQEntries;
     unsigned numROBEntries;
 
+    //SMT Parameters
+    unsigned smtNumFetchingThreads;
+
+    std::string   smtFetchPolicy;
+
+    std::string   smtIQPolicy;
+    unsigned smtIQThreshold;
+
+    std::string   smtLSQPolicy;
+    unsigned smtLSQThreshold;
+
+    std::string   smtCommitPolicy;
+
+    std::string   smtROBPolicy;
+    unsigned smtROBThreshold;
+
     // Probably can get this from somewhere.
     unsigned instShiftAmt;
-
-    bool defReg;
 };
 
-#endif // __CPU_O3_CPU_ALPHA_PARAMS_HH__
+#endif // __CPU_O3_ALPHA_PARAMS_HH__
diff --git a/cpu/o3/bpred_unit.cc b/cpu/o3/bpred_unit.cc
index 85bd6f0a6..a78dcf463 100644
--- a/cpu/o3/bpred_unit.cc
+++ b/cpu/o3/bpred_unit.cc
@@ -29,5 +29,9 @@
 #include "cpu/o3/bpred_unit_impl.hh"
 #include "cpu/o3/alpha_impl.hh"
 #include "cpu/o3/alpha_dyn_inst.hh"
+#include "cpu/ozone/ozone_impl.hh"
+#include "cpu/ozone/simple_impl.hh"
 
 template class TwobitBPredUnit<AlphaSimpleImpl>;
+template class TwobitBPredUnit<OzoneImpl>;
+template class TwobitBPredUnit<SimpleImpl>;
diff --git a/cpu/o3/bpred_unit.hh b/cpu/o3/bpred_unit.hh
index 2725684f7..67c300989 100644
--- a/cpu/o3/bpred_unit.hh
+++ b/cpu/o3/bpred_unit.hh
@@ -26,8 +26,8 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __BPRED_UNIT_HH__
-#define __BPRED_UNIT_HH__
+#ifndef __CPU_O3_BPRED_UNIT_HH__
+#define __CPU_O3_BPRED_UNIT_HH__
 
 // For Addr type.
 #include "arch/isa_traits.hh"
@@ -35,9 +35,9 @@
 #include "cpu/inst_seq.hh"
 
 #include "cpu/o3/2bit_local_pred.hh"
-#include "cpu/o3/tournament_pred.hh"
 #include "cpu/o3/btb.hh"
 #include "cpu/o3/ras.hh"
+#include "cpu/o3/tournament_pred.hh"
 
 #include <list>
 
@@ -57,77 +57,171 @@ class TwobitBPredUnit
     typedef typename Impl::Params Params;
     typedef typename Impl::DynInstPtr DynInstPtr;
 
-    TwobitBPredUnit(Params &params);
+    /**
+     * @param params The params object, that has the size of the BP and BTB.
+     */
+    TwobitBPredUnit(Params *params);
 
+    /**
+     * Registers statistics.
+     */
     void regStats();
 
-    bool predict(DynInstPtr &inst, Addr &PC);
-
-    void update(const InstSeqNum &done_sn);
-
-    void squash(const InstSeqNum &squashed_sn);
-
+    /**
+     * Predicts whether or not the instruction is a taken branch, and the
+     * target of the branch if it is taken.
+     * @param inst The branch instruction.
+     * @param PC The predicted PC is passed back through this parameter.
+     * @param tid The thread id.
+     * @return Returns if the branch is taken or not.
+     */
+    bool predict(DynInstPtr &inst, Addr &PC, unsigned tid);
+
+    /**
+     * Tells the branch predictor to commit any updates until the given
+     * sequence number.
+     * @param done_sn The sequence number to commit any older updates up until.
+     * @param tid The thread id.
+     */
+    void update(const InstSeqNum &done_sn, unsigned tid);
+
+    /**
+     * Squashes all outstanding updates until a given sequence number.
+     * @param squashed_sn The sequence number to squash any younger updates up
+     * until.
+     * @param tid The thread id.
+     */
+    void squash(const InstSeqNum &squashed_sn, unsigned tid);
+
+    /**
+     * Squashes all outstanding updates until a given sequence number, and
+     * corrects that sn's update with the proper address and taken/not taken.
+     * @param squashed_sn The sequence number to squash any younger updates up
+     * until.
+     * @param corr_target The correct branch target.
+     * @param actually_taken The correct branch direction.
+     * @param tid The thread id.
+     */
     void squash(const InstSeqNum &squashed_sn, const Addr &corr_target,
-                bool actually_taken);
+                bool actually_taken, unsigned tid);
 
+    /**
+     * Looks up a given PC in the BP to see if it is taken or not taken.
+     * @param inst_PC The PC to look up.
+     * @return Whether the branch is taken or not taken.
+     */
     bool BPLookup(Addr &inst_PC)
     { return BP.lookup(inst_PC); }
 
+    /**
+     * Looks up a given PC in the BTB to see if a matching entry exists.
+     * @param inst_PC The PC to look up.
+     * @return Whether the BTB contains the given PC.
+     */
     bool BTBValid(Addr &inst_PC)
-    { return BTB.valid(inst_PC); }
+    { return BTB.valid(inst_PC, 0); }
 
+    /**
+     * Looks up a given PC in the BTB to get the predicted target.
+     * @param inst_PC The PC to look up.
+     * @return The address of the target of the branch.
+     */
     Addr BTBLookup(Addr &inst_PC)
-    { return BTB.lookup(inst_PC); }
-
-    // Will want to include global history.
+    { return BTB.lookup(inst_PC, 0); }
+
+    /**
+     * Updates the BP with taken/not taken information.
+     * @param inst_PC The branch's PC that will be updated.
+     * @param taken Whether the branch was taken or not taken.
+     * @todo Make this update flexible enough to handle a global predictor.
+     */
     void BPUpdate(Addr &inst_PC, bool taken)
     { BP.update(inst_PC, taken); }
 
+    /**
+     * Updates the BTB with the target of a branch.
+     * @param inst_PC The branch's PC that will be updated.
+     * @param target_PC The branch's target that will be added to the BTB.
+     */
     void BTBUpdate(Addr &inst_PC, Addr &target_PC)
-    { BTB.update(inst_PC, target_PC); }
+    { BTB.update(inst_PC, target_PC,0); }
 
   private:
     struct PredictorHistory {
+        /**
+         * Makes a predictor history struct that contains a sequence number,
+         * the PC of its instruction, and whether or not it was predicted
+         * taken.
+         */
         PredictorHistory(const InstSeqNum &seq_num, const Addr &inst_PC,
-                         const bool pred_taken)
-            : seqNum(seq_num), PC(inst_PC), predTaken(pred_taken),
-              globalHistory(0), usedRAS(0), wasCall(0), RASIndex(0),
-              RASTarget(0)
+                         const bool pred_taken, const unsigned _tid)
+            : seqNum(seq_num), PC(inst_PC), RASTarget(0), globalHistory(0),
+              RASIndex(0), tid(_tid), predTaken(pred_taken), usedRAS(0),
+              wasCall(0)
         { }
 
+        /** The sequence number for the predictor history entry. */
         InstSeqNum seqNum;
 
+        /** The PC associated with the sequence number. */
         Addr PC;
 
-        bool predTaken;
+        /** The RAS target (only valid if a return). */
+        Addr RASTarget;
 
+        /** The global history at the time this entry was created. */
         unsigned globalHistory;
 
-        bool usedRAS;
+        /** The RAS index of the instruction (only valid if a call). */
+        unsigned RASIndex;
 
-        bool wasCall;
+        /** The thread id. */
+        unsigned tid;
 
-        unsigned RASIndex;
+        /** Whether or not it was predicted taken. */
+        bool predTaken;
 
-        Addr RASTarget;
+        /** Whether or not the RAS was used. */
+        bool usedRAS;
+
+        /** Whether or not the instruction was a call. */
+        bool wasCall;
     };
 
-    std::list<PredictorHistory> predHist;
+    typedef std::list<PredictorHistory> History;
+
+    /**
+     * The per-thread predictor history. This is used to update the predictor
+     * as instructions are committed, or restore it to the proper state after
+     * a squash.
+     */
+    History predHist[Impl::MaxThreads];
 
+    /** The branch predictor. */
     DefaultBP BP;
 
+    /** The BTB. */
     DefaultBTB BTB;
 
-    ReturnAddrStack RAS;
+    /** The per-thread return address stack. */
+    ReturnAddrStack RAS[Impl::MaxThreads];
 
+    /** Stat for number of BP lookups. */
     Stats::Scalar<> lookups;
+    /** Stat for number of conditional branches predicted. */
     Stats::Scalar<> condPredicted;
+    /** Stat for number of conditional branches predicted incorrectly. */
     Stats::Scalar<> condIncorrect;
+    /** Stat for number of BTB lookups. */
     Stats::Scalar<> BTBLookups;
+    /** Stat for number of BTB hits. */
     Stats::Scalar<> BTBHits;
+    /** Stat for number of times the BTB is correct. */
     Stats::Scalar<> BTBCorrect;
+    /** Stat for number of times the RAS is used to get a target. */
     Stats::Scalar<> usedRAS;
+    /** Stat for number of times the RAS is incorrect. */
     Stats::Scalar<> RASIncorrect;
 };
 
-#endif // __BPRED_UNIT_HH__
+#endif // __CPU_O3_BPRED_UNIT_HH__
diff --git a/cpu/o3/bpred_unit_impl.hh b/cpu/o3/bpred_unit_impl.hh
index 8d16a0cdf..f79b67b6c 100644
--- a/cpu/o3/bpred_unit_impl.hh
+++ b/cpu/o3/bpred_unit_impl.hh
@@ -30,16 +30,22 @@
 #include "base/traceflags.hh"
 #include "cpu/o3/bpred_unit.hh"
 
+#include <vector>
+#include <list>
+
+using namespace std;
+
 template<class Impl>
-TwobitBPredUnit<Impl>::TwobitBPredUnit(Params &params)
-  : BP(params.local_predictor_size,
-       params.local_ctr_bits,
-       params.instShiftAmt),
-    BTB(params.BTBEntries,
-        params.BTBTagSize,
-        params.instShiftAmt),
-    RAS(params.RASSize)
+TwobitBPredUnit<Impl>::TwobitBPredUnit(Params *params)
+  : BP(params->localPredictorSize,
+       params->localCtrBits,
+       params->instShiftAmt),
+    BTB(params->BTBEntries,
+        params->BTBTagSize,
+        params->instShiftAmt)
 {
+    for (int i=0; i < Impl::MaxThreads; i++)
+        RAS[i].init(params->RASSize);
 }
 
 template <class Impl>
@@ -79,7 +85,7 @@ TwobitBPredUnit<Impl>::regStats()
 
     usedRAS
         .name(name() + ".BPredUnit.usedRAS")
-        .desc("Number of times the RAS was used.")
+        .desc("Number of times the RAS was used to get a target.")
         ;
 
     RASIncorrect
@@ -90,7 +96,7 @@ TwobitBPredUnit<Impl>::regStats()
 
 template <class Impl>
 bool
-TwobitBPredUnit<Impl>::predict(DynInstPtr &inst, Addr &PC)
+TwobitBPredUnit<Impl>::predict(DynInstPtr &inst, Addr &PC, unsigned tid)
 {
     // See if branch predictor predicts taken.
     // If so, get its target addr either from the BTB or the RAS.
@@ -106,18 +112,19 @@ TwobitBPredUnit<Impl>::predict(DynInstPtr &inst, Addr &PC)
     ++lookups;
 
     if (inst->isUncondCtrl()) {
-        DPRINTF(Fetch, "BranchPred: Unconditional control.\n");
+        DPRINTF(Fetch, "BranchPred: [tid:%i] Unconditional control.\n", tid);
         pred_taken = true;
     } else {
         ++condPredicted;
 
         pred_taken = BPLookup(PC);
 
-        DPRINTF(Fetch, "BranchPred: Branch predictor predicted %i for PC %#x"
-                "\n", pred_taken, inst->readPC());
+        DPRINTF(Fetch, "BranchPred: [tid:%i]: Branch predictor predicted %i "
+                "for PC %#x\n",
+                tid, pred_taken, inst->readPC());
     }
 
-    PredictorHistory predict_record(inst->seqNum, PC, pred_taken);
+    PredictorHistory predict_record(inst->seqNum, PC, pred_taken, tid);
 
     // Now lookup in the BTB or RAS.
     if (pred_taken) {
@@ -126,45 +133,48 @@ TwobitBPredUnit<Impl>::predict(DynInstPtr &inst, Addr &PC)
 
             // If it's a function return call, then look up the address
             // in the RAS.
-            target = RAS.top();
+            target = RAS[tid].top();
 
             // Record the top entry of the RAS, and its index.
             predict_record.usedRAS = true;
-            predict_record.RASIndex = RAS.topIdx();
+            predict_record.RASIndex = RAS[tid].topIdx();
             predict_record.RASTarget = target;
 
-            RAS.pop();
+            assert(predict_record.RASIndex < 16);
 
-            DPRINTF(Fetch, "BranchPred: Instruction %#x is a return, RAS "
-                    "predicted target: %#x, RAS index: %i.\n",
-                    inst->readPC(), target, predict_record.RASIndex);
+            RAS[tid].pop();
+
+            DPRINTF(Fetch, "BranchPred: [tid:%i]: Instruction %#x is a return, "
+                    "RAS predicted target: %#x, RAS index: %i.\n",
+                    tid, inst->readPC(), target, predict_record.RASIndex);
         } else {
             ++BTBLookups;
 
             if (inst->isCall()) {
-                RAS.push(PC+sizeof(MachInst));
+                RAS[tid].push(PC + sizeof(MachInst));
 
                 // Record that it was a call so that the top RAS entry can
                 // be popped off if the speculation is incorrect.
                 predict_record.wasCall = true;
 
-                DPRINTF(Fetch, "BranchPred: Instruction %#x was a call, "
-                        "adding %#x to the RAS.\n",
-                        inst->readPC(), PC+sizeof(MachInst));
+                DPRINTF(Fetch, "BranchPred: [tid:%i] Instruction %#x was a call"
+                        ", adding %#x to the RAS.\n",
+                        tid, inst->readPC(), PC + sizeof(MachInst));
             }
 
-            if (BTB.valid(PC)) {
+            if (BTB.valid(PC, tid)) {
                 ++BTBHits;
 
                 //If it's anything else, use the BTB to get the target addr.
-                target = BTB.lookup(PC);
+                target = BTB.lookup(PC, tid);
 
-                DPRINTF(Fetch, "BranchPred: Instruction %#x predicted target "
-                        "is %#x.\n", inst->readPC(), target);
+                DPRINTF(Fetch, "BranchPred: [tid:%i]: Instruction %#x predicted"
+                        " target is %#x.\n",
+                        tid, inst->readPC(), target);
 
             } else {
-                DPRINTF(Fetch, "BranchPred: BTB doesn't have a valid entry."
-                        "\n");
+                DPRINTF(Fetch, "BranchPred: [tid:%i]: BTB doesn't have a "
+                        "valid entry.\n",tid);
                 pred_taken = false;
             }
 
@@ -180,97 +190,112 @@ TwobitBPredUnit<Impl>::predict(DynInstPtr &inst, Addr &PC)
         inst->setPredTarg(PC);
     }
 
-    predHist.push_front(predict_record);
+    predHist[tid].push_front(predict_record);
 
-    assert(!predHist.empty());
+    DPRINTF(Fetch, "[tid:%i] predHist.size(): %i\n", tid, predHist[tid].size());
 
     return pred_taken;
 }
 
 template <class Impl>
 void
-TwobitBPredUnit<Impl>::update(const InstSeqNum &done_sn)
+TwobitBPredUnit<Impl>::update(const InstSeqNum &done_sn, unsigned tid)
 {
-    DPRINTF(Fetch, "BranchPred: Commiting branches until sequence number "
-            "%i.\n", done_sn);
-
-    while (!predHist.empty() && predHist.back().seqNum <= done_sn) {
-        assert(!predHist.empty());
+    DPRINTF(Fetch, "BranchPred: [tid:%i]: Commiting branches until sequence"
+            " number %lli.\n", tid, done_sn);
 
-        // Update the branch predictor with the correct results of branches.
-        BP.update(predHist.back().PC, predHist.back().predTaken);
+    while (!predHist[tid].empty() &&
+           predHist[tid].back().seqNum <= done_sn) {
+        // Retire the oldest entry (back of this thread's list) into the BP.
+        BP.update(predHist[tid].back().PC,
+                  predHist[tid].back().predTaken);
 
-        predHist.pop_back();
+        predHist[tid].pop_back();
     }
 }
 
 template <class Impl>
 void
-TwobitBPredUnit<Impl>::squash(const InstSeqNum &squashed_sn)
+TwobitBPredUnit<Impl>::squash(const InstSeqNum &squashed_sn, unsigned tid)
 {
-    while (!predHist.empty() && predHist.front().seqNum > squashed_sn) {
-        if (predHist.front().usedRAS) {
-            DPRINTF(Fetch, "BranchPred: Restoring top of RAS to: %i, "
-                    "target: %#x.\n",
-                    predHist.front().RASIndex,
-                    predHist.front().RASTarget);
+    History &pred_hist = predHist[tid];
+    // Undo, front entry first, every prediction younger than squashed_sn.
+    while (!pred_hist.empty() &&
+           pred_hist.front().seqNum > squashed_sn) {
+        if (pred_hist.front().usedRAS) {
+            DPRINTF(Fetch, "BranchPred: [tid:%i]: Restoring top of RAS to: %i,"
+                    " target: %#x.\n",
+                    tid,
+                    pred_hist.front().RASIndex,
+                    pred_hist.front().RASTarget);
+
+            RAS[tid].restore(pred_hist.front().RASIndex,
+                             pred_hist.front().RASTarget);
 
-            RAS.restore(predHist.front().RASIndex,
-                        predHist.front().RASTarget);
-        } else if (predHist.front().wasCall) {
-            DPRINTF(Fetch, "BranchPred: Removing speculative entry added "
-                    "to the RAS.\n");
+        } else if (pred_hist.front().wasCall) {
+            DPRINTF(Fetch, "BranchPred: [tid:%i]: Removing speculative entry"
+                    " added to the RAS.\n",tid);
 
-            RAS.pop();
+            RAS[tid].pop();
         }
 
-        predHist.pop_front();
+        pred_hist.pop_front();
     }
+
 }
 
 template <class Impl>
 void
 TwobitBPredUnit<Impl>::squash(const InstSeqNum &squashed_sn,
                               const Addr &corr_target,
-                              const bool actually_taken)
+                              const bool actually_taken,
+                              unsigned tid)
 {
     // Now that we know that a branch was mispredicted, we need to undo
     // all the branches that have been seen up until this branch and
     // fix up everything.
 
+    History &pred_hist = predHist[tid];
+
     ++condIncorrect;
 
-    DPRINTF(Fetch, "BranchPred: Squashing from sequence number %i, "
+    DPRINTF(Fetch, "BranchPred: [tid:%i]: Squashing from sequence number %i, "
             "setting target to %#x.\n",
-            squashed_sn, corr_target);
-
-    while (!predHist.empty() && predHist.front().seqNum > squashed_sn) {
+            tid, squashed_sn, corr_target);
 
-        if (predHist.front().usedRAS) {
-            DPRINTF(Fetch, "BranchPred: Restoring top of RAS to: %i, "
+    while (!pred_hist.empty() &&
+           pred_hist.front().seqNum > squashed_sn) {
+        if (pred_hist.front().usedRAS) {
+            DPRINTF(Fetch, "BranchPred: [tid:%i]: Restoring top of RAS to: %i, "
                     "target: %#x.\n",
-                    predHist.front().RASIndex,
-                    predHist.front().RASTarget);
+                    tid,
+                    pred_hist.front().RASIndex,
+                    pred_hist.front().RASTarget);
 
-            RAS.restore(predHist.front().RASIndex,
-                        predHist.front().RASTarget);
-        } else if (predHist.front().wasCall) {
-            DPRINTF(Fetch, "BranchPred: Removing speculative entry added "
-                    "to the RAS.\n");
+            RAS[tid].restore(pred_hist.front().RASIndex,
+                             pred_hist.front().RASTarget);
+        } else if (pred_hist.front().wasCall) {
+            DPRINTF(Fetch, "BranchPred: [tid:%i]: Removing speculative entry"
+                    " added to the RAS.\n", tid);
 
-            RAS.pop();
+            RAS[tid].pop();
         }
 
-        predHist.pop_front();
+        pred_hist.pop_front();
     }
 
-    predHist.front().predTaken = actually_taken;
+    // A squash (e.g. one caused by a syscall) may have no history entry of
+    // its own.  Only fix up the front entry when it really belongs to
+    // squashed_sn; otherwise an older branch would be retrained by mistake.
+    if (!pred_hist.empty() && pred_hist.front().seqNum == squashed_sn) {
+        pred_hist.front().predTaken = actually_taken;
 
-    if (predHist.front().usedRAS) {
-        ++RASIncorrect;
-    }
+        if (pred_hist.front().usedRAS) {
+            ++RASIncorrect;
+        }
 
-    BP.update(predHist.front().PC, actually_taken);
+        BP.update(pred_hist.front().PC, actually_taken);
 
-    BTB.update(predHist.front().PC, corr_target);
+        BTB.update(pred_hist.front().PC, corr_target, tid);
+    }
 }
diff --git a/cpu/o3/btb.cc b/cpu/o3/btb.cc
index 2d39c3856..e084142d7 100644
--- a/cpu/o3/btb.cc
+++ b/cpu/o3/btb.cc
@@ -39,14 +39,15 @@ DefaultBTB::DefaultBTB(unsigned _numEntries,
       tagBits(_tagBits),
       instShiftAmt(_instShiftAmt)
 {
-    // @todo Check to make sure num_entries is valid (a power of 2)
-
     DPRINTF(Fetch, "BTB: Creating BTB object.\n");
 
-    btb = new BTBEntry[numEntries];
+    if (!isPowerOf2(numEntries)) {
+        fatal("BTB entries is not a power of 2!");
+    }
+
+    btb.resize(numEntries);
 
-    for (int i = 0; i < numEntries; ++i)
-    {
+    for (int i = 0; i < numEntries; ++i) {
         btb[i].valid = false;
     }
 
@@ -73,7 +74,7 @@ DefaultBTB::getTag(const Addr &inst_PC)
 }
 
 bool
-DefaultBTB::valid(const Addr &inst_PC)
+DefaultBTB::valid(const Addr &inst_PC, unsigned tid)
 {
     unsigned btb_idx = getIndex(inst_PC);
 
@@ -81,7 +82,9 @@ DefaultBTB::valid(const Addr &inst_PC)
 
     assert(btb_idx < numEntries);
 
-    if (btb[btb_idx].valid && inst_tag == btb[btb_idx].tag) {
+    if (btb[btb_idx].valid
+        && inst_tag == btb[btb_idx].tag
+        && btb[btb_idx].tid == tid) {
         return true;
     } else {
         return false;
@@ -92,7 +95,7 @@ DefaultBTB::valid(const Addr &inst_PC)
 // address is valid, and also the address.  For now will just use addr = 0 to
 // represent invalid entry.
 Addr
-DefaultBTB::lookup(const Addr &inst_PC)
+DefaultBTB::lookup(const Addr &inst_PC, unsigned tid)
 {
     unsigned btb_idx = getIndex(inst_PC);
 
@@ -100,7 +103,9 @@ DefaultBTB::lookup(const Addr &inst_PC)
 
     assert(btb_idx < numEntries);
 
-    if (btb[btb_idx].valid && inst_tag == btb[btb_idx].tag) {
+    if (btb[btb_idx].valid
+        && inst_tag == btb[btb_idx].tag
+        && btb[btb_idx].tid == tid) {
         return btb[btb_idx].target;
     } else {
         return 0;
@@ -108,12 +113,13 @@ DefaultBTB::lookup(const Addr &inst_PC)
 }
 
 void
-DefaultBTB::update(const Addr &inst_PC, const Addr &target)
+DefaultBTB::update(const Addr &inst_PC, const Addr &target, unsigned tid)
 {
     unsigned btb_idx = getIndex(inst_PC);
 
     assert(btb_idx < numEntries);
 
+    btb[btb_idx].tid = tid;
     btb[btb_idx].valid = true;
     btb[btb_idx].target = target;
     btb[btb_idx].tag = getTag(inst_PC);
diff --git a/cpu/o3/btb.hh b/cpu/o3/btb.hh
index 77bdc32ea..aaa9945f7 100644
--- a/cpu/o3/btb.hh
+++ b/cpu/o3/btb.hh
@@ -26,8 +26,8 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __CPU_O3_CPU_BTB_HH__
-#define __CPU_O3_CPU_BTB_HH__
+#ifndef __CPU_O3_BTB_HH__
+#define __CPU_O3_BTB_HH__
 
 // For Addr type.
 #include "arch/isa_traits.hh"
@@ -42,39 +42,84 @@ class DefaultBTB
         {
         }
 
+        /** The entry's tag. */
         Addr tag;
+
+        /** The entry's target. */
         Addr target;
+
+        /** The entry's thread id. */
+        unsigned tid;
+
+        /** Whether or not the entry is valid. */
         bool valid;
     };
 
   public:
+    /** Creates a BTB with the given number of entries, number of bits per
+     *  tag, and instruction offset amount.
+     *  @param numEntries Number of entries for the BTB.
+     *  @param tagBits Number of bits for each tag in the BTB.
+     *  @param instShiftAmt Offset amount for instructions to ignore alignment.
+     */
     DefaultBTB(unsigned numEntries, unsigned tagBits,
                unsigned instShiftAmt);
 
-    Addr lookup(const Addr &inst_PC);
-
-    bool valid(const Addr &inst_PC);
-
-    void update(const Addr &inst_PC, const Addr &target_PC);
+    /** Looks up an address in the BTB. Must call valid() first on the address.
+     *  @param inst_PC The address of the branch to look up.
+     *  @param tid The thread id.
+     *  @return Returns the target of the branch.
+     */
+    Addr lookup(const Addr &inst_PC, unsigned tid);
+
+    /** Checks if a branch is in the BTB.
+     *  @param inst_PC The address of the branch to look up.
+     *  @param tid The thread id.
+     *  @return Whether or not the branch exists in the BTB.
+     */
+    bool valid(const Addr &inst_PC, unsigned tid);
+
+    /** Updates the BTB with the target of a branch.
+     *  @param inst_PC The address of the branch being updated.
+     *  @param target_PC The target address of the branch.
+     *  @param tid The thread id.
+     */
+    void update(const Addr &inst_PC, const Addr &target_PC,
+                unsigned tid);
 
   private:
+    /** Returns the index into the BTB, based on the branch's PC.
+     *  @param inst_PC The branch to look up.
+     *  @return Returns the index into the BTB.
+     */
     inline unsigned getIndex(const Addr &inst_PC);
 
+    /** Returns the tag bits of a given address.
+     *  @param inst_PC The branch's address.
+     *  @return Returns the tag bits.
+     */
     inline Addr getTag(const Addr &inst_PC);
 
-    BTBEntry *btb;
+    /** The actual BTB. */
+    std::vector<BTBEntry> btb;
 
+    /** The number of entries in the BTB. */
     unsigned numEntries;
 
+    /** The index mask. */
     unsigned idxMask;
 
+    /** The number of tag bits per entry. */
     unsigned tagBits;
 
+    /** The tag mask. */
     unsigned tagMask;
 
+    /** Number of bits to shift PC when calculating index. */
     unsigned instShiftAmt;
 
+    /** Number of bits to shift PC when calculating tag. */
     unsigned tagShiftAmt;
 };
 
-#endif // __CPU_O3_CPU_BTB_HH__
+#endif // __CPU_O3_BTB_HH__
diff --git a/cpu/o3/comm.hh b/cpu/o3/comm.hh
index c74c77ddf..1a8f394ca 100644
--- a/cpu/o3/comm.hh
+++ b/cpu/o3/comm.hh
@@ -26,30 +26,35 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __CPU_O3_CPU_COMM_HH__
-#define __CPU_O3_CPU_COMM_HH__
+#ifndef __CPU_O3_COMM_HH__
+#define __CPU_O3_COMM_HH__
 
 #include <vector>
 
+#include "arch/faults.hh"
 #include "arch/isa_traits.hh"
 #include "cpu/inst_seq.hh"
 #include "sim/host.hh"
 
-// Find better place to put this typedef.
-// The impl might be the best place for this.
+// Typedef for physical register index type. Although the Impl would be the
+// most likely location for this, there are a few classes that need this
+// typedef yet are not templated on the Impl. For now it will be defined here.
 typedef short int PhysRegIndex;
 
 template<class Impl>
-struct SimpleFetchSimpleDecode {
+struct DefaultFetchDefaultDecode {
     typedef typename Impl::DynInstPtr DynInstPtr;
 
     int size;
 
     DynInstPtr insts[Impl::MaxWidth];
+    Fault fetchFault;
+    InstSeqNum fetchFaultSN;
+    bool clearFetchFault;
 };
 
 template<class Impl>
-struct SimpleDecodeSimpleRename {
+struct DefaultDecodeDefaultRename {
     typedef typename Impl::DynInstPtr DynInstPtr;
 
     int size;
@@ -58,7 +63,7 @@ struct SimpleDecodeSimpleRename {
 };
 
 template<class Impl>
-struct SimpleRenameSimpleIEW {
+struct DefaultRenameDefaultIEW {
     typedef typename Impl::DynInstPtr DynInstPtr;
 
     int size;
@@ -67,19 +72,21 @@ struct SimpleRenameSimpleIEW {
 };
 
 template<class Impl>
-struct SimpleIEWSimpleCommit {
+struct DefaultIEWDefaultCommit {
     typedef typename Impl::DynInstPtr DynInstPtr;
 
     int size;
 
     DynInstPtr insts[Impl::MaxWidth];
 
-    bool squash;
-    bool branchMispredict;
-    bool branchTaken;
-    uint64_t mispredPC;
-    uint64_t nextPC;
-    InstSeqNum squashedSeqNum;
+    bool squash[Impl::MaxThreads];
+    bool branchMispredict[Impl::MaxThreads];
+    bool branchTaken[Impl::MaxThreads];
+    uint64_t mispredPC[Impl::MaxThreads];
+    uint64_t nextPC[Impl::MaxThreads];
+    InstSeqNum squashedSeqNum[Impl::MaxThreads];
+
+    bool includeSquashInst[Impl::MaxThreads];
 };
 
 template<class Impl>
@@ -91,63 +98,77 @@ struct IssueStruct {
     DynInstPtr insts[Impl::MaxWidth];
 };
 
+template<class Impl>
 struct TimeBufStruct {
     struct decodeComm {
         bool squash;
-        bool stall;
         bool predIncorrect;
         uint64_t branchAddr;
 
         InstSeqNum doneSeqNum;
 
-        // Might want to package this kind of branch stuff into a single
+        // @todo: Might want to package this kind of branch stuff into a single
         // struct as it is used pretty frequently.
         bool branchMispredict;
         bool branchTaken;
         uint64_t mispredPC;
         uint64_t nextPC;
+
+        unsigned branchCount;
     };
 
-    decodeComm decodeInfo;
+    decodeComm decodeInfo[Impl::MaxThreads];
 
     // Rename can't actually tell anything to squash or send a new PC back
     // because it doesn't do anything along those lines.  But maybe leave
     // these fields in here to keep the stages mostly orthagonal.
     struct renameComm {
         bool squash;
-        bool stall;
 
         uint64_t nextPC;
     };
 
-    renameComm renameInfo;
+    renameComm renameInfo[Impl::MaxThreads];
 
     struct iewComm {
-        bool stall;
-
         // Also eventually include skid buffer space.
+        bool usedIQ;
         unsigned freeIQEntries;
+        bool usedLSQ;
+        unsigned freeLSQEntries;
+
+        unsigned iqCount;
+        unsigned ldstqCount;
+
+        unsigned dispatched;
+        unsigned dispatchedToLSQ;
     };
 
-    iewComm iewInfo;
+    iewComm iewInfo[Impl::MaxThreads];
 
     struct commitComm {
-        bool squash;
-        bool stall;
+        bool usedROB;
         unsigned freeROBEntries;
+        bool emptyROB;
+
+        bool squash;
+        bool robSquashing;
 
         bool branchMispredict;
         bool branchTaken;
         uint64_t mispredPC;
         uint64_t nextPC;
 
-        bool robSquashing;
-
         // Represents the instruction that has either been retired or
         // squashed.  Similar to having a single bus that broadcasts the
         // retired or squashed sequence number.
         InstSeqNum doneSeqNum;
 
+        //Just in case we want to do a commit/squash on a cycle
+        //(necessary for multiple ROBs?)
+        bool commitInsts;
+        InstSeqNum squashSeqNum;
+
         // Extra bit of information so that the LDSTQ only updates when it
         // needs to.
         bool commitIsLoad;
@@ -155,9 +176,26 @@ struct TimeBufStruct {
         // Communication specifically to the IQ to tell the IQ that it can
         // schedule a non-speculative instruction.
         InstSeqNum nonSpecSeqNum;
+
+        // Hack for now to send back an uncached access to the IEW stage.
+        typedef typename Impl::DynInstPtr DynInstPtr;
+        bool uncached;
+        DynInstPtr uncachedLoad;
+
+        bool interruptPending;
+        bool clearInterrupt;
     };
 
-    commitComm commitInfo;
+    commitComm commitInfo[Impl::MaxThreads];
+
+    bool decodeBlock[Impl::MaxThreads];
+    bool decodeUnblock[Impl::MaxThreads];
+    bool renameBlock[Impl::MaxThreads];
+    bool renameUnblock[Impl::MaxThreads];
+    bool iewBlock[Impl::MaxThreads];
+    bool iewUnblock[Impl::MaxThreads];
+    bool commitBlock[Impl::MaxThreads];
+    bool commitUnblock[Impl::MaxThreads];
 };
 
-#endif //__CPU_O3_CPU_COMM_HH__
+#endif //__CPU_O3_COMM_HH__
diff --git a/cpu/o3/commit.cc b/cpu/o3/commit.cc
index cf33d7f8b..fe5e9c1de 100644
--- a/cpu/o3/commit.cc
+++ b/cpu/o3/commit.cc
@@ -30,4 +30,4 @@
 #include "cpu/o3/alpha_impl.hh"
 #include "cpu/o3/commit_impl.hh"
 
-template class SimpleCommit<AlphaSimpleImpl>;
+template class DefaultCommit<AlphaSimpleImpl>;
diff --git a/cpu/o3/commit.hh b/cpu/o3/commit.hh
index 580c1a316..93b74ebb0 100644
--- a/cpu/o3/commit.hh
+++ b/cpu/o3/commit.hh
@@ -26,29 +26,42 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-// Todo: Maybe have a special method for handling interrupts/traps.
-//
-// Traps:  Have IEW send a signal to commit saying that there's a trap to
-// be handled.  Have commit send the PC back to the fetch stage, along
-// with the current commit PC.  Fetch will directly access the IPR and save
-// off all the proper stuff.  Commit can send out a squash, or something
-// close to it.
-// Do the same for hwrei().  However, requires that commit be specifically
-// built to support that kind of stuff.  Probably not horrible to have
-// commit support having the CPU tell it to squash the other stages and
-// restart at a given address.  The IPR register does become an issue.
-// Probably not a big deal if the IPR stuff isn't cycle accurate.  Can just
-// have the original function handle writing to the IPR register.
-
-#ifndef __CPU_O3_CPU_SIMPLE_COMMIT_HH__
-#define __CPU_O3_CPU_SIMPLE_COMMIT_HH__
+#ifndef __CPU_O3_COMMIT_HH__
+#define __CPU_O3_COMMIT_HH__
 
+#include "arch/faults.hh"
+#include "cpu/inst_seq.hh"
 #include "base/statistics.hh"
 #include "base/timebuf.hh"
+#include "cpu/exetrace.hh"
 #include "mem/memory_interface.hh"
 
+template <class>
+class O3ThreadState;
+
+/**
+ * DefaultCommit handles single threaded and SMT commit. Its width is specified
+ * by the parameters; each cycle it tries to commit that many instructions. The
+ * SMT policy decides which thread it tries to commit instructions from. Non-
+ * speculative instructions must reach the head of the ROB before they are
+ * ready to execute; once they reach the head, commit will broadcast the
+ * instruction's sequence number to the previous stages so that they can issue/
+ * execute the instruction. Only one non-speculative instruction is handled per
+ * cycle. Commit is responsible for handling all back-end initiated redirects.
+ * It receives the redirect, and then broadcasts it to all stages, indicating
+ * the sequence number they should squash until, and any necessary branch mis-
+ * prediction information as well. It prioritizes redirects by instruction's age,
+ * only broadcasting a redirect if it corresponds to an instruction that should
+ * currently be in the ROB. This is done by tracking the sequence number of the
+ * youngest instruction in the ROB, which gets updated to any squashing
+ * instruction's sequence number, and only broadcasting a redirect if it
+ * corresponds to an older instruction. Commit also supports multiple cycle
+ * squashing, to model a ROB that can only remove a certain number of
+ * instructions per cycle. Eventually traps and interrupts will most likely
+ * be handled here as well.
+ */
 template<class Impl>
-class SimpleCommit
+class DefaultCommit
 {
   public:
     // Typedefs from the Impl.
@@ -57,62 +70,191 @@ class SimpleCommit
     typedef typename Impl::Params Params;
     typedef typename Impl::CPUPol CPUPol;
 
+    typedef typename CPUPol::RenameMap RenameMap;
     typedef typename CPUPol::ROB ROB;
 
     typedef typename CPUPol::TimeStruct TimeStruct;
+    typedef typename CPUPol::FetchStruct FetchStruct;
     typedef typename CPUPol::IEWStruct IEWStruct;
     typedef typename CPUPol::RenameStruct RenameStruct;
 
-  public:
-    // I don't believe commit can block, so it will only have two
-    // statuses for now.
-    // Actually if there's a cache access that needs to block (ie
-    // uncachable load or just a mem access in commit) then the stage
-    // may have to wait.
-    enum Status {
+    typedef typename CPUPol::IEW IEW;
+
+    typedef O3ThreadState<Impl> Thread;
+
+    class TrapEvent : public Event {
+      private:
+        DefaultCommit<Impl> *commit;
+        unsigned tid;
+
+      public:
+        TrapEvent(DefaultCommit<Impl> *_commit, unsigned _tid);
+
+        void process();
+        const char *description();
+    };
+
+    /** Overall commit status. Used to determine if the CPU can deschedule
+     * itself due to a lack of activity.
+     */
+    enum CommitStatus{
+        Active,
+        Inactive
+    };
+
+    /** Individual thread status. */
+    enum ThreadStatus {
         Running,
         Idle,
         ROBSquashing,
-        DcacheMissStall,
-        DcacheMissComplete
+        TrapPending,
+        FetchTrapPending
+    };
+
+    /** Commit policy for SMT mode. */
+    enum CommitPolicy {
+        Aggressive,
+        RoundRobin,
+        OldestReady
     };
 
   private:
-    Status _status;
+    /** Overall commit status. */
+    CommitStatus _status;
+    /** Next commit status, to be set at the end of the cycle. */
+    CommitStatus _nextStatus;
+    /** Per-thread status. */
+    ThreadStatus commitStatus[Impl::MaxThreads];
+    /** Commit policy used in SMT mode. */
+    CommitPolicy commitPolicy;
 
   public:
-    SimpleCommit(Params &params);
+    /** Construct a DefaultCommit with the given parameters. */
+    DefaultCommit(Params *params);
+
+    /** Returns the name of the DefaultCommit. */
+    std::string name() const;
 
+    /** Registers statistics. */
     void regStats();
 
+    /** Sets the CPU pointer. */
     void setCPU(FullCPU *cpu_ptr);
 
+    /** Sets the list of threads. */
+    void setThreads(std::vector<Thread *> &threads);
+
+    /** Sets the main time buffer pointer, used for backwards communication. */
     void setTimeBuffer(TimeBuffer<TimeStruct> *tb_ptr);
 
+    void setFetchQueue(TimeBuffer<FetchStruct> *fq_ptr);
+
+    /** Sets the pointer to the queue coming from rename. */
     void setRenameQueue(TimeBuffer<RenameStruct> *rq_ptr);
 
+    /** Sets the pointer to the queue coming from IEW. */
     void setIEWQueue(TimeBuffer<IEWStruct> *iq_ptr);
 
+    /** Sets the pointer to the IEW stage. */
+    void setIEWStage(IEW *iew_stage);
+
+    /** The pointer to the IEW stage. Used solely to ensure that syscalls do
+     * not execute until all stores have written back.
+     */
+    IEW *iewStage;
+
+    /** Sets pointer to list of active threads. */
+    void setActiveThreads(std::list<unsigned> *at_ptr);
+
+    /** Sets pointer to the committed state rename map. */
+    void setRenameMap(RenameMap rm_ptr[Impl::MaxThreads]);
+
+    /** Sets pointer to the ROB. */
     void setROB(ROB *rob_ptr);
 
+    /** Initializes stage by sending back the number of free entries. */
+    void initStage();
+
+    /** Ticks the commit stage, which tries to commit instructions. */
     void tick();
 
+    /** Handles any squashes that are sent from IEW, and adds instructions
+     * to the ROB and tries to commit instructions.
+     */
     void commit();
 
+    /** Returns the number of free ROB entries for a specific thread. */
+    unsigned numROBFreeEntries(unsigned tid);
+
+    void generateXCEvent(unsigned tid);
+
   private:
+    /** Updates the overall status of commit with the nextStatus, and
+     * tells the CPU if commit is active/inactive. */
+    void updateStatus();
+
+    /** Sets the next status based on threads' statuses, which becomes the
+     * current status at the end of the cycle.
+     */
+    void setNextStatus();
 
+    /** Checks if the ROB is completed with squashing. This is for the case
+     * where the ROB can take multiple cycles to complete squashing.
+     */
+    bool robDoneSquashing();
+
+    /** Returns if any of the threads have the number of ROB entries changed
+     * on this cycle. Used to determine if the number of free ROB entries needs
+     * to be sent back to previous stages.
+     */
+    bool changedROBEntries();
+
+    void squashFromTrap(unsigned tid);
+
+    void squashFromXC(unsigned tid);
+
+    void squashInFlightInsts(unsigned tid);
+
+  private:
+    /** Commits as many instructions as possible. */
     void commitInsts();
 
+    /** Tries to commit the head ROB instruction passed in.
+     * @param head_inst The instruction to be committed.
+     */
     bool commitHead(DynInstPtr &head_inst, unsigned inst_num);
 
+    void generateTrapEvent(unsigned tid);
+
+    /** Gets instructions from rename and inserts them into the ROB. */
     void getInsts();
 
+    /** Marks completed instructions using information sent from IEW. */
     void markCompletedInsts();
 
+    /** Gets the thread to commit, based on the SMT policy. */
+    int getCommittingThread();
+
+    /** Returns the thread ID to use based on a round robin policy. */
+    int roundRobin();
+
+    /** Returns the thread ID to use based on an oldest instruction policy. */
+    int oldestReady();
+
   public:
-    uint64_t readCommitPC();
+    /** Returns the PC of the head instruction of the ROB. */
+    uint64_t readPC();
+
+    uint64_t readPC(unsigned tid) { return PC[tid]; }
 
-    void setSquashing() { _status = ROBSquashing; }
+    void setPC(uint64_t val, unsigned tid) { PC[tid] = val; }
+
+    uint64_t readNextPC(unsigned tid) { return nextPC[tid]; }
+
+    void setNextPC(uint64_t val, unsigned tid) { nextPC[tid] = val; }
+
+    /** Sets that the ROB is currently squashing. */
+    void setSquashing(unsigned tid);
 
   private:
     /** Time buffer interface. */
@@ -124,6 +266,10 @@ class SimpleCommit
     /** Wire to read information from IEW (for ROB). */
     typename TimeBuffer<TimeStruct>::wire robInfoFromIEW;
 
+    TimeBuffer<FetchStruct> *fetchQueue;
+
+    typename TimeBuffer<FetchStruct>::wire fromFetch;
+
     /** IEW instruction queue interface. */
     TimeBuffer<IEWStruct> *iewQueue;
 
@@ -136,22 +282,56 @@ class SimpleCommit
     /** Wire to read information from rename queue. */
     typename TimeBuffer<RenameStruct>::wire fromRename;
 
+  public:
     /** ROB interface. */
     ROB *rob;
 
+  private:
     /** Pointer to FullCPU. */
     FullCPU *cpu;
 
     /** Memory interface.  Used for d-cache accesses. */
     MemInterface *dcacheInterface;
 
+    std::vector<Thread *> thread;
+
   private:
+    Fault fetchFault;
+    InstSeqNum fetchFaultSN;
+    int fetchTrapWait;
+    /** Records that commit has written to the time buffer this cycle. Used for
+     * the CPU to determine if it can deschedule itself if there is no activity.
+     */
+    bool wroteToTimeBuffer;
+
+    /** Records if the number of ROB entries has changed this cycle. If it has,
+     * then the number of free entries must be re-broadcast.
+     */
+    bool changedROBNumEntries[Impl::MaxThreads];
+
+    /** A counter of how many threads are currently squashing. */
+    int squashCounter;
+
+    /** Records if a thread has to squash this cycle due to a trap. */
+    bool trapSquash[Impl::MaxThreads];
+
+    /** Records if a thread has to squash this cycle due to an XC write. */
+    bool xcSquash[Impl::MaxThreads];
+
+    /** Priority List used for Commit Policy */
+    std::list<unsigned> priority_list;
+
     /** IEW to Commit delay, in ticks. */
     unsigned iewToCommitDelay;
 
+    /** Commit to IEW delay, in ticks. */
+    unsigned commitToIEWDelay;
+
     /** Rename to ROB delay, in ticks. */
     unsigned renameToROBDelay;
 
+    unsigned fetchToCommitDelay;
+
     /** Rename width, in instructions.  Used so ROB knows how many
      *  instructions to get from the rename instruction queue.
      */
@@ -165,16 +345,53 @@ class SimpleCommit
     /** Commit width, in instructions. */
     unsigned commitWidth;
 
+    /** Number of Reorder Buffers */
+    unsigned numRobs;
+
+    /** Number of Active Threads */
+    unsigned numThreads;
+
+    Tick trapLatency;
+
+    Tick fetchTrapLatency;
+    Tick fetchFaultTick;
+
+    Addr PC[Impl::MaxThreads];
+
+    Addr nextPC[Impl::MaxThreads];
+
+    /** The sequence number of the youngest valid instruction in the ROB. */
+    InstSeqNum youngestSeqNum[Impl::MaxThreads];
+
+    /** Pointer to the list of active threads. */
+    std::list<unsigned> *activeThreads;
+
+    /** Rename map interface. */
+    RenameMap *renameMap[Impl::MaxThreads];
+
+    /** Stat for the total number of committed instructions. */
     Stats::Scalar<> commitCommittedInsts;
+    /** Stat for the total number of squashed instructions discarded by commit.
+     */
     Stats::Scalar<> commitSquashedInsts;
+    /** Stat for the total number of times commit is told to squash.
+     * @todo: Actually increment this stat.
+     */
     Stats::Scalar<> commitSquashEvents;
+    /** Stat for the total number of times commit has had to stall due to a non-
+     * speculative instruction reaching the head of the ROB.
+     */
     Stats::Scalar<> commitNonSpecStalls;
+    /** Stat for the total number of committed branches. */
     Stats::Scalar<> commitCommittedBranches;
+    /** Stat for the total number of committed loads. */
     Stats::Scalar<> commitCommittedLoads;
+    /** Stat for the total number of committed memory references. */
     Stats::Scalar<> commitCommittedMemRefs;
+    /** Stat for the total number of branch mispredicts that caused a squash. */
     Stats::Scalar<> branchMispredicts;
-
-    Stats::Distribution<> n_committed_dist;
+    /** Distribution of the number of committed instructions each cycle. */
+    Stats::Distribution<> numCommittedDist;
 };
 
-#endif // __CPU_O3_CPU_SIMPLE_COMMIT_HH__
+#endif // __CPU_O3_COMMIT_HH__
diff --git a/cpu/o3/commit_impl.hh b/cpu/o3/commit_impl.hh
index e289bc0c0..ef1ba9282 100644
--- a/cpu/o3/commit_impl.hh
+++ b/cpu/o3/commit_impl.hh
@@ -26,25 +26,112 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+#include <algorithm>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <iomanip>
+#include <stdio.h>
+#include <string.h>
+
+#include "base/loader/symtab.hh"
 #include "base/timebuf.hh"
-#include "cpu/o3/commit.hh"
 #include "cpu/exetrace.hh"
+#include "cpu/o3/commit.hh"
+#include "cpu/o3/thread_state.hh"
+
+using namespace std;
+
+template <class Impl>
+DefaultCommit<Impl>::TrapEvent::TrapEvent(DefaultCommit<Impl> *_commit,
+                                          unsigned _tid)
+    : Event(&mainEventQueue, CPU_Tick_Pri), commit(_commit), tid(_tid)
+{
+    this->setFlags(Event::AutoDelete);
+}
+
+template <class Impl>
+void
+DefaultCommit<Impl>::TrapEvent::process()
+{
+    commit->trapSquash[tid] = true;
+}
+
+template <class Impl>
+const char *
+DefaultCommit<Impl>::TrapEvent::description()
+{
+    return "Trap event";
+}
+
+template <class Impl>
+DefaultCommit<Impl>::DefaultCommit(Params *params)
+    : dcacheInterface(params->dcacheInterface),
+      squashCounter(0),
+      iewToCommitDelay(params->iewToCommitDelay),
+      commitToIEWDelay(params->commitToIEWDelay),
+      renameToROBDelay(params->renameToROBDelay),
+      fetchToCommitDelay(params->commitToFetchDelay),
+      renameWidth(params->renameWidth),
+      iewWidth(params->executeWidth),
+      commitWidth(params->commitWidth),
+      numThreads(params->numberOfThreads)
+{
+    _status = Active;
+    _nextStatus = Inactive;
+    string policy = params->smtCommitPolicy;
+
+    //Convert string to lowercase
+    std::transform(policy.begin(), policy.end(), policy.begin(),
+                   (int(*)(int)) tolower);
+
+    //Assign commit policy
+    if (policy == "aggressive"){
+        commitPolicy = Aggressive;
+
+        DPRINTF(Commit,"Commit Policy set to Aggressive.");
+    } else if (policy == "roundrobin"){
+        commitPolicy = RoundRobin;
+
+        //Set-Up Priority List
+        for (int tid=0; tid < numThreads; tid++) {
+            priority_list.push_back(tid);
+        }
+
+        DPRINTF(Commit,"Commit Policy set to Round Robin.");
+    } else if (policy == "oldestready"){
+        commitPolicy = OldestReady;
+
+        DPRINTF(Commit,"Commit Policy set to Oldest Ready.");
+    } else {
+        assert(0 && "Invalid SMT Commit Policy. Options Are: {Aggressive,"
+               "RoundRobin,OldestReady}");
+    }
+
+    for (int i=0; i < numThreads; i++) {
+        commitStatus[i] = Idle;
+        changedROBNumEntries[i] = false;
+        trapSquash[i] = false;
+        xcSquash[i] = false;
+    }
+
+    // Hardcoded trap latency.
+    trapLatency = 6;
+    fetchTrapLatency = 12;
+    fetchFaultTick = 0;
+    fetchTrapWait = 0;
+}
 
 template <class Impl>
-SimpleCommit<Impl>::SimpleCommit(Params &params)
-    : dcacheInterface(params.dcacheInterface),
-      iewToCommitDelay(params.iewToCommitDelay),
-      renameToROBDelay(params.renameToROBDelay),
-      renameWidth(params.renameWidth),
-      iewWidth(params.executeWidth),
-      commitWidth(params.commitWidth)
+std::string
+DefaultCommit<Impl>::name() const
 {
-    _status = Idle;
+    return cpu->name() + ".commit";
 }
 
 template <class Impl>
 void
-SimpleCommit<Impl>::regStats()
+DefaultCommit<Impl>::regStats()
 {
     commitCommittedInsts
         .name(name() + ".commitCommittedInsts")
@@ -79,7 +166,7 @@ SimpleCommit<Impl>::regStats()
         .name(name() + ".branchMispredicts")
         .desc("The number of times a branch was mispredicted")
         .prereq(branchMispredicts);
-    n_committed_dist
+    numCommittedDist
         .init(0,commitWidth,1)
         .name(name() + ".COM:committed_per_cycle")
         .desc("Number of insts commited each cycle")
@@ -89,15 +176,26 @@ SimpleCommit<Impl>::regStats()
 
 template <class Impl>
 void
-SimpleCommit<Impl>::setCPU(FullCPU *cpu_ptr)
+DefaultCommit<Impl>::setCPU(FullCPU *cpu_ptr)
 {
     DPRINTF(Commit, "Commit: Setting CPU pointer.\n");
     cpu = cpu_ptr;
+
+    // Commit must broadcast the number of free entries it has at the start of
+    // the simulation, so it starts as active.
+    cpu->activateStage(FullCPU::CommitIdx);
+}
+
+template <class Impl>
+void
+DefaultCommit<Impl>::setThreads(vector<Thread *> &threads)
+{
+    thread = threads;
 }
 
 template <class Impl>
 void
-SimpleCommit<Impl>::setTimeBuffer(TimeBuffer<TimeStruct> *tb_ptr)
+DefaultCommit<Impl>::setTimeBuffer(TimeBuffer<TimeStruct> *tb_ptr)
 {
     DPRINTF(Commit, "Commit: Setting time buffer pointer.\n");
     timeBuffer = tb_ptr;
@@ -111,7 +209,18 @@ SimpleCommit<Impl>::setTimeBuffer(TimeBuffer<TimeStruct> *tb_ptr)
 
 template <class Impl>
 void
-SimpleCommit<Impl>::setRenameQueue(TimeBuffer<RenameStruct> *rq_ptr)
+DefaultCommit<Impl>::setFetchQueue(TimeBuffer<FetchStruct> *fq_ptr)
+{
+    DPRINTF(Commit, "Commit: Setting fetch queue pointer.\n");
+    fetchQueue = fq_ptr;
+
+    // Setup wire to read information from the fetch queue.
+    fromFetch = fetchQueue->getWire(-fetchToCommitDelay);
+}
+
+template <class Impl>
+void
+DefaultCommit<Impl>::setRenameQueue(TimeBuffer<RenameStruct> *rq_ptr)
 {
     DPRINTF(Commit, "Commit: Setting rename queue pointer.\n");
     renameQueue = rq_ptr;
@@ -122,7 +231,7 @@ SimpleCommit<Impl>::setRenameQueue(TimeBuffer<RenameStruct> *rq_ptr)
 
 template <class Impl>
 void
-SimpleCommit<Impl>::setIEWQueue(TimeBuffer<IEWStruct> *iq_ptr)
+DefaultCommit<Impl>::setIEWQueue(TimeBuffer<IEWStruct> *iq_ptr)
 {
     DPRINTF(Commit, "Commit: Setting IEW queue pointer.\n");
     iewQueue = iq_ptr;
@@ -133,7 +242,33 @@ SimpleCommit<Impl>::setIEWQueue(TimeBuffer<IEWStruct> *iq_ptr)
 
 template <class Impl>
 void
-SimpleCommit<Impl>::setROB(ROB *rob_ptr)
+DefaultCommit<Impl>::setIEWStage(IEW *iew_stage)
+{
+    iewStage = iew_stage;
+}
+
+template<class Impl>
+void
+DefaultCommit<Impl>::setActiveThreads(list<unsigned> *at_ptr)
+{
+    DPRINTF(Commit, "Commit: Setting active threads list pointer.\n");
+    activeThreads = at_ptr;
+}
+
+template <class Impl>
+void
+DefaultCommit<Impl>::setRenameMap(RenameMap rm_ptr[])
+{
+    DPRINTF(Commit, "Setting rename map pointers.\n");
+
+    for (int i=0; i < numThreads; i++) {
+        renameMap[i] = &rm_ptr[i];
+    }
+}
+
+template <class Impl>
+void
+DefaultCommit<Impl>::setROB(ROB *rob_ptr)
 {
     DPRINTF(Commit, "Commit: Setting ROB pointer.\n");
     rob = rob_ptr;
@@ -141,41 +276,317 @@ SimpleCommit<Impl>::setROB(ROB *rob_ptr)
 
 template <class Impl>
 void
-SimpleCommit<Impl>::tick()
+DefaultCommit<Impl>::initStage()
+{
+    rob->setActiveThreads(activeThreads);
+    rob->resetEntries();
+
+    // Broadcast the number of free entries.
+    for (int i=0; i < numThreads; i++) {
+        toIEW->commitInfo[i].usedROB = true;
+        toIEW->commitInfo[i].freeROBEntries = rob->numFreeEntries(i);
+    }
+
+    cpu->activityThisCycle();
+}
+
+template <class Impl>
+void
+DefaultCommit<Impl>::updateStatus()
+{
+    if (commitStatus[0] == TrapPending ||
+        commitStatus[0] == FetchTrapPending) {
+        _nextStatus = Active;
+    }
+
+    if (_nextStatus == Inactive && _status == Active) {
+        DPRINTF(Activity, "Deactivating stage.\n");
+        cpu->deactivateStage(FullCPU::CommitIdx);
+    } else if (_nextStatus == Active && _status == Inactive) {
+        DPRINTF(Activity, "Activating stage.\n");
+        cpu->activateStage(FullCPU::CommitIdx);
+    }
+
+    _status = _nextStatus;
+
+    // reset ROB changed variable
+    list<unsigned>::iterator threads = (*activeThreads).begin();
+    while (threads != (*activeThreads).end()) {
+        unsigned tid = *threads++;
+        changedROBNumEntries[tid] = false;
+    }
+}
+
+template <class Impl>
+void
+DefaultCommit<Impl>::setNextStatus()
+{
+    int squashes = 0;
+
+    list<unsigned>::iterator threads = (*activeThreads).begin();
+
+    while (threads != (*activeThreads).end()) {
+        unsigned tid = *threads++;
+
+        if (commitStatus[tid] == ROBSquashing) {
+            squashes++;
+        }
+    }
+
+    assert(squashes == squashCounter);
+
+    // If commit is currently squashing, then it will have activity for the
+    // next cycle. Set its next status as active.
+    if (squashCounter) {
+        _nextStatus = Active;
+    }
+}
+
+template <class Impl>
+bool
+DefaultCommit<Impl>::changedROBEntries()
+{
+    list<unsigned>::iterator threads = (*activeThreads).begin();
+
+    while (threads != (*activeThreads).end()) {
+        unsigned tid = *threads++;
+
+        if (changedROBNumEntries[tid]) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+template <class Impl>
+unsigned
+DefaultCommit<Impl>::numROBFreeEntries(unsigned tid)
+{
+    return rob->numFreeEntries(tid);
+}
+
+template <class Impl>
+void
+DefaultCommit<Impl>::generateTrapEvent(unsigned tid)
+{
+    DPRINTF(Commit, "Generating trap event for [tid:%i]\n", tid);
+
+    TrapEvent *trap = new TrapEvent(this, tid);
+
+    trap->schedule(curTick + trapLatency);
+
+    thread[tid]->trapPending = true;
+}
+
+template <class Impl>
+void
+DefaultCommit<Impl>::generateXCEvent(unsigned tid)
 {
+    DPRINTF(Commit, "Generating XC squash event for [tid:%i]\n", tid);
+
+    xcSquash[tid] = true;
+}
+
+template <class Impl>
+void
+DefaultCommit<Impl>::squashFromTrap(unsigned tid)
+{
+    // Squash all instructions of this thread: use one sequence number
+    // older than the thread's ROB head so that the head itself is included.
+    // Emptiness is checked for this thread only (not the whole ROB), so the
+    // head is read only when this thread actually has one.
+    InstSeqNum squashed_inst = rob->isEmpty(tid) ?
+        0 : rob->readHeadInst(tid)->seqNum - 1;
+
+    // All younger instructions will be squashed. Set the sequence
+    // number as the youngest instruction in the ROB (0 in this case.
+    // Hopefully nothing breaks.)
+    youngestSeqNum[tid] = 0;
+
+    rob->squash(squashed_inst, tid);
+    changedROBNumEntries[tid] = true;
+
+    // Send back the sequence number of the squashed instruction.
+    toIEW->commitInfo[tid].doneSeqNum = squashed_inst;
+
+    // Send back the squash signal to tell stages that they should
+    // squash.
+    toIEW->commitInfo[tid].squash = true;
+
+    // Send back the rob squashing signal so other stages know that
+    // the ROB is in the process of squashing.
+    toIEW->commitInfo[tid].robSquashing = true;
+
+    toIEW->commitInfo[tid].branchMispredict = false;
+
+//    toIEW->commitInfo[tid].branchTaken = fromIEW->branchTaken[tid];
+
+    toIEW->commitInfo[tid].nextPC = PC[tid];
+
+    DPRINTF(Commit, "Squashing from trap, restarting at PC %#x\n", PC[tid]);
+    // Hopefully nobody tries to use the mispredPC because I said there
+    // wasn't a branch mispredict.
+//    toIEW->commitInfo[tid].mispredPC = fromIEW->mispredPC[tid];
+
+    thread[tid]->trapPending = false;
+    thread[tid]->inSyscall = false;
+
+    trapSquash[tid] = false;
+
+    // Not sure what to set this to...
+    commitStatus[tid] = ROBSquashing;
+    cpu->activityThisCycle();
+
+    ++squashCounter;
+}
+
+template <class Impl>
+void
+DefaultCommit<Impl>::squashFromXC(unsigned tid)
+{
+    // For now these are identical.  In the future, the squash from trap
+    // might execute the trap prior to the squash.
+
+    // Squash all instructions of this thread: use one sequence number
+    // older than the thread's ROB head so that the head itself is included.
+    // Emptiness is checked for this thread only (not the whole ROB), so the
+    // head is read only when this thread actually has one.
+    InstSeqNum squashed_inst = rob->isEmpty(tid) ?
+        0 : rob->readHeadInst(tid)->seqNum - 1;
+
+    // All younger instructions will be squashed. Set the sequence
+    // number as the youngest instruction in the ROB (0 in this case.
+    // Hopefully nothing breaks.)
+    youngestSeqNum[tid] = 0;
+
+    rob->squash(squashed_inst, tid);
+    changedROBNumEntries[tid] = true;
+
+    // Send back the sequence number of the squashed instruction.
+    toIEW->commitInfo[tid].doneSeqNum = squashed_inst;
+
+    // Send back the squash signal to tell stages that they should
+    // squash.
+    toIEW->commitInfo[tid].squash = true;
+
+    // Send back the rob squashing signal so other stages know that
+    // the ROB is in the process of squashing.
+    toIEW->commitInfo[tid].robSquashing = true;
+
+    toIEW->commitInfo[tid].branchMispredict = false;
+
+//    toIEW->commitInfo[tid].branchTaken = fromIEW->branchTaken[tid];
+
+    toIEW->commitInfo[tid].nextPC = PC[tid];
+
+    DPRINTF(Commit, "Squashing from XC, restarting at PC %#x\n", PC[tid]);
+    // Hopefully nobody tries to use the mispredPC because I said there
+    // wasn't a branch mispredict.
+//    toIEW->commitInfo[tid].mispredPC = fromIEW->mispredPC[tid];
+
+    thread[tid]->inSyscall = false;
+    assert(!thread[tid]->trapPending);
+    // Not sure what to set this to...
+    commitStatus[tid] = ROBSquashing;
+    cpu->activityThisCycle();
+
+    xcSquash[tid] = false;
+
+    ++squashCounter;
+}
+
+template <class Impl>
+void
+DefaultCommit<Impl>::squashInFlightInsts(unsigned tid)
+{
+    // @todo: Fix this hardcoded number; 0 < -5 is false, so this never loops.
+    for (int i = 0; i < -5; ++i) {
+        for (int j = 0; j < (*iewQueue)[i].size; ++j) {
+            DynInstPtr inst = (*iewQueue)[i].insts[j];
+            if (inst->threadNumber == tid &&
+                !inst->isSquashed()) {
+                inst->setSquashed();
+            }
+        }
+    }
+}
+
+template <class Impl>
+void
+DefaultCommit<Impl>::tick()
+{
+    wroteToTimeBuffer = false;
+    _nextStatus = Inactive;
+
     // If the ROB is currently in its squash sequence, then continue
     // to squash.  In this case, commit does not do anything.  Otherwise
     // run commit.
-    if (_status == ROBSquashing) {
-        if (rob->isDoneSquashing()) {
-            _status = Running;
-        } else {
-            rob->doSquash();
-
-            // Send back sequence number of tail of ROB, so other stages
-            // can squash younger instructions.  Note that really the only
-            // stage that this is important for is the IEW stage; other
-            // stages can just clear all their state as long as selective
-            // replay isn't used.
-            toIEW->commitInfo.doneSeqNum = rob->readTailSeqNum();
-            toIEW->commitInfo.robSquashing = true;
+    list<unsigned>::iterator threads = (*activeThreads).begin();
+
+    // Maybe this should be dependent upon any of the commits actually
+    // squashing.
+    while (threads != (*activeThreads).end()) {
+        unsigned tid = *threads++;
+
+        if (commitStatus[tid] == ROBSquashing) {
+
+            if (rob->isDoneSquashing(tid)) {
+                commitStatus[tid] = Running;
+                --squashCounter;
+            } else {
+                DPRINTF(Commit,"[tid:%u]: Still Squashing, cannot commit any"
+                        " insts this cycle.\n", tid);
+            }
         }
-    } else {
-        commit();
     }
 
+    commit();
+
     markCompletedInsts();
 
-    // Writeback number of free ROB entries here.
-    DPRINTF(Commit, "Commit: ROB has %d free entries.\n",
-            rob->numFreeEntries());
-    toIEW->commitInfo.freeROBEntries = rob->numFreeEntries();
+    threads = (*activeThreads).begin();
+
+    while (threads != (*activeThreads).end()) {
+        unsigned tid = *threads++;
+
+        if (!rob->isEmpty(tid) && rob->readHeadInst(tid)->readyToCommit()) {
+            // The ROB has more instructions it can commit. Its next status
+            // will be active.
+            _nextStatus = Active;
+
+            DynInstPtr inst = rob->readHeadInst(tid);
+
+            DPRINTF(Commit,"[tid:%i]: Instruction [sn:%lli] PC %#x is head of"
+                    " ROB and ready to commit\n",
+                    tid, inst->seqNum, inst->readPC());
+
+        } else if (!rob->isEmpty(tid)) {
+            DynInstPtr inst = rob->readHeadInst(tid);
+
+            DPRINTF(Commit,"[tid:%i]: Can't commit, Instruction [sn:%lli] PC "
+                    "%#x is head of ROB and not ready\n",
+                    tid, inst->seqNum, inst->readPC());
+        }
+
+        DPRINTF(Commit, "[tid:%i]: ROB has %d insts & %d free entries.\n",
+                tid, rob->countInsts(tid), rob->numFreeEntries(tid));
+    }
+
+
+    if (wroteToTimeBuffer) {
+        DPRINTF(Activity,"Activity This Cycle.\n");
+        cpu->activityThisCycle();
+    }
+
+    updateStatus();
 }
 
 template <class Impl>
 void
-SimpleCommit<Impl>::commit()
+DefaultCommit<Impl>::commit()
 {
+
     //////////////////////////////////////
     // Check for interrupts
     //////////////////////////////////////
@@ -187,17 +598,44 @@ SimpleCommit<Impl>::commit()
     // hwrei() is what resets the PC to the place where instruction execution
     // beings again.
 #if FULL_SYSTEM
-    if (//checkInterrupts &&
+//#if 0
+    if (cpu->checkInterrupts &&
         cpu->check_interrupts() &&
-        !cpu->inPalMode(readCommitPC())) {
-        // Will need to squash all instructions currently in flight and have
-        // the interrupt handler restart at the last non-committed inst.
-        // Most of that can be handled through the trap() function.  The
-        // processInterrupts() function really just checks for interrupts
-        // and then calls trap() if there is an interrupt present.
-
-        // CPU will handle implementation of the interrupt.
-        cpu->processInterrupts();
+        !cpu->inPalMode(readPC()) &&
+        !trapSquash[0] &&
+        !xcSquash[0]) {
+//        commitStatus[0] = TrapPending;
+        toIEW->commitInfo[0].interruptPending = true;
+        if (rob->isEmpty() && !iewStage->hasStoresToWB()) {
+            // Will need to squash all instructions currently in flight and have
+            // the interrupt handler restart at the last non-committed inst.
+            // Most of that can be handled through the trap() function.  The
+            // processInterrupts() function really just checks for interrupts
+            // and then calls trap() if there is an interrupt present.
+
+            // Not sure which thread should be the one to interrupt.  For now
+            // always do thread 0.
+            assert(!thread[0]->inSyscall);
+            thread[0]->inSyscall = true;
+
+            // CPU will handle implementation of the interrupt.
+            cpu->processInterrupts();
+
+            // Now squash or record that I need to squash this cycle.
+            commitStatus[0] = TrapPending;
+
+            // Exit state update mode to avoid accidental updating.
+            thread[0]->inSyscall = false;
+
+            // Generate trap squash event.
+            generateTrapEvent(0);
+
+            toIEW->commitInfo[0].clearInterrupt = true;
+
+            DPRINTF(Commit, "Interrupt detected.\n");
+        } else {
+            DPRINTF(Commit, "Interrupt pending, waiting for ROB to empty.\n");
+        }
     }
 #endif // FULL_SYSTEM
 
@@ -205,43 +643,113 @@ SimpleCommit<Impl>::commit()
     // Check for squash signal, handle that first
     ////////////////////////////////////
 
-    // Want to mainly check if the IEW stage is telling the ROB to squash.
-    // Should I also check if the commit stage is telling the ROB to squah?
-    // This might be necessary to keep the same timing between the IQ and
-    // the ROB...
-    if (fromIEW->squash) {
-        DPRINTF(Commit, "Commit: Squashing instructions in the ROB.\n");
+    // Check if the IEW stage is telling the ROB to squash.
+    list<unsigned>::iterator threads = (*activeThreads).begin();
+
+    while (threads != (*activeThreads).end()) {
+        unsigned tid = *threads++;
+
+        if (fromFetch->fetchFault) {
+            // Record the fault.  Wait until it's empty in the ROB.  Then handle the trap.
+            fetchFault = fromFetch->fetchFault;
+            fetchFaultSN = fromFetch->fetchFaultSN;
+            fetchFaultTick = curTick + fetchTrapLatency;
+            commitStatus[0] = FetchTrapPending;
+            DPRINTF(Commit, "Fault from fetch recorded.  Will trap if the "
+                    "ROB empties without squashing the fault.\n");
+            fetchTrapWait = 0;
+        }
+        if (fromFetch->clearFetchFault) {
+            DPRINTF(Commit, "Received clear fetch fault signal\n");
+            fetchTrapWait = 0;
+            if (commitStatus[0] == FetchTrapPending) {
+                DPRINTF(Commit, "Clearing fault from fetch\n");
+                commitStatus[0] = Running;
+            }
+        }
+
+        // Not sure which one takes priority.  I think if we have
+        // both, that's a bad sign.
+        if (trapSquash[tid] == true) {
+            assert(!xcSquash[tid]);
+            squashFromTrap(tid);
+        } else if (xcSquash[tid] == true) {
+            squashFromXC(tid);
+        }
+
+        // Squashed sequence number must be older than youngest valid
+        // instruction in the ROB. This prevents squashes from younger
+        // instructions overriding squashes from older instructions.
+        if (fromIEW->squash[tid] &&
+            commitStatus[tid] != TrapPending &&
+            fromIEW->squashedSeqNum[tid] <= youngestSeqNum[tid]) {
+
+            DPRINTF(Commit, "[tid:%u]: Squashing instructions in the "
+                    "ROB.\n",
+                    tid);
 
-        _status = ROBSquashing;
+            DPRINTF(Commit, "[tid:%i]: Squashing due to PC %#x [sn:%i]\n",
+                    tid,
+                    fromIEW->mispredPC[tid],
+                    fromIEW->squashedSeqNum[tid]);
 
-        InstSeqNum squashed_inst = fromIEW->squashedSeqNum;
+            DPRINTF(Commit, "[tid:%i]: Redirecting to PC %#x\n",
+                    tid,
+                    fromIEW->nextPC[tid]);
 
-        rob->squash(squashed_inst);
+            commitStatus[tid] = ROBSquashing;
 
-        // Send back the sequence number of the squashed instruction.
-        toIEW->commitInfo.doneSeqNum = squashed_inst;
+            ++squashCounter;
 
-        // Send back the squash signal to tell stages that they should squash.
-        toIEW->commitInfo.squash = true;
+            // If we want to include the squashing instruction in the squash,
+            // then use one older sequence number.
+            InstSeqNum squashed_inst = fromIEW->squashedSeqNum[tid];
 
-        // Send back the rob squashing signal so other stages know that the
-        // ROB is in the process of squashing.
-        toIEW->commitInfo.robSquashing = true;
+            if (fromIEW->includeSquashInst[tid] == true)
+                squashed_inst--;
 
-        toIEW->commitInfo.branchMispredict = fromIEW->branchMispredict;
+            // All younger instructions will be squashed. Set the sequence
+            // number as the youngest instruction in the ROB.
+            youngestSeqNum[tid] = squashed_inst;
 
-        toIEW->commitInfo.branchTaken = fromIEW->branchTaken;
+            rob->squash(squashed_inst, tid);
+            changedROBNumEntries[tid] = true;
 
-        toIEW->commitInfo.nextPC = fromIEW->nextPC;
+            // Send back the sequence number of the squashed instruction.
+            toIEW->commitInfo[tid].doneSeqNum = squashed_inst;
 
-        toIEW->commitInfo.mispredPC = fromIEW->mispredPC;
+            // Send back the squash signal to tell stages that they should
+            // squash.
+            toIEW->commitInfo[tid].squash = true;
 
-        if (toIEW->commitInfo.branchMispredict) {
-            ++branchMispredicts;
+            // Send back the rob squashing signal so other stages know that
+            // the ROB is in the process of squashing.
+            toIEW->commitInfo[tid].robSquashing = true;
+
+            toIEW->commitInfo[tid].branchMispredict =
+                fromIEW->branchMispredict[tid];
+
+            toIEW->commitInfo[tid].branchTaken =
+                fromIEW->branchTaken[tid];
+
+            toIEW->commitInfo[tid].nextPC = fromIEW->nextPC[tid];
+
+            DPRINTF(Commit, "Squashing from IEW, restarting at PC %#x\n",
+                    fromIEW->nextPC[tid]);
+
+            toIEW->commitInfo[tid].mispredPC =
+                fromIEW->mispredPC[tid];
+
+            if (toIEW->commitInfo[tid].branchMispredict) {
+                ++branchMispredicts;
+            }
         }
+
     }
 
-    if (_status != ROBSquashing) {
+    setNextStatus();
+
+    if (squashCounter != numThreads) {
         // If we're not currently squashing, then get instructions.
         getInsts();
 
@@ -249,24 +757,29 @@ SimpleCommit<Impl>::commit()
         commitInsts();
     }
 
-    // If the ROB is empty, we can set this stage to idle.  Use this
-    // in the future when the Idle status will actually be utilized.
-#if 0
-    if (rob->isEmpty()) {
-        DPRINTF(Commit, "Commit: ROB is empty.  Status changed to idle.\n");
-        _status = Idle;
-        // Schedule an event so that commit will actually wake up
-        // once something gets put in the ROB.
+    //Check for any activity
+    threads = (*activeThreads).begin();
+
+    while (threads != (*activeThreads).end()) {
+        unsigned tid = *threads++;
+
+        if (changedROBNumEntries[tid]) {
+            toIEW->commitInfo[tid].usedROB = true;
+            toIEW->commitInfo[tid].freeROBEntries = rob->numFreeEntries(tid);
+
+            if (rob->isEmpty(tid)) {
+                toIEW->commitInfo[tid].emptyROB = true;
+            }
+
+            wroteToTimeBuffer = true;
+            changedROBNumEntries[tid] = false;
+        }
     }
-#endif
 }
 
-// Loop that goes through as many instructions in the ROB as possible and
-// tries to commit them.  The actual work for committing is done by the
-// commitHead() function.
 template <class Impl>
 void
-SimpleCommit<Impl>::commitInsts()
+DefaultCommit<Impl>::commitInsts()
 {
     ////////////////////////////////////
     // Handle commit
@@ -276,94 +789,193 @@ SimpleCommit<Impl>::commitInsts()
     // Can't commit and squash things at the same time...
     ////////////////////////////////////
 
-    if (rob->isEmpty())
-        return;
-
-    DynInstPtr head_inst = rob->readHeadInst();
+    DPRINTF(Commit, "Trying to commit instructions in the ROB.\n");
 
     unsigned num_committed = 0;
 
+    DynInstPtr head_inst;
+#if FULL_SYSTEM
+    if (commitStatus[0] == FetchTrapPending) {
+        DPRINTF(Commit, "Fault from fetch is pending.\n");
+        if (rob->isEmpty()) {
+            fetchTrapWait++;
+            if (fetchTrapWait > 10000000) {
+                panic("Fetch trap has been pending for a long time!");
+            }
+            if (fetchFaultTick > curTick) {
+                DPRINTF(Commit, "Not enough cycles since fault, fault will "
+                        "happen on %lli\n",
+                        fetchFaultTick);
+                cpu->activityThisCycle();
+                return;
+            } else if (iewStage->hasStoresToWB()) {
+                DPRINTF(Commit, "IEW still has stores to WB.  Waiting until "
+                        "they are completed. fetchTrapWait:%i\n",
+                        fetchTrapWait);
+                cpu->activityThisCycle();
+                return;
+            } else if (cpu->inPalMode(readPC())) {
+                DPRINTF(Commit, "In pal mode right now. fetchTrapWait:%i\n",
+                        fetchTrapWait);
+                return;
+            }
+            fetchTrapWait = 0;
+            DPRINTF(Commit, "ROB is empty, handling fetch trap.\n");
+
+            assert(!thread[0]->inSyscall);
+
+            thread[0]->inSyscall = true;
+
+            // Consider holding onto the trap and waiting until the trap event
+            // happens for this to be executed.
+            cpu->trap(fetchFault, 0);
+
+            // Exit state update mode to avoid accidental updating.
+            thread[0]->inSyscall = false;
+
+            commitStatus[0] = TrapPending;
+            // Set it up so that we squash next cycle
+            trapSquash[0] = true;
+            return;
+        }
+    }
+#endif
     // Commit as many instructions as possible until the commit bandwidth
     // limit is reached, or it becomes impossible to commit any more.
-    while (!rob->isEmpty() &&
-           head_inst->readyToCommit() &&
-           num_committed < commitWidth)
-    {
-        DPRINTF(Commit, "Commit: Trying to commit head instruction.\n");
+    while (num_committed < commitWidth) {
+        int commit_thread = getCommittingThread();
+
+        if (commit_thread == -1 || !rob->isHeadReady(commit_thread))
+            break;
+
+        head_inst = rob->readHeadInst(commit_thread);
+
+        int tid = head_inst->threadNumber;
+
+        assert(tid == commit_thread);
+
+        DPRINTF(Commit, "Trying to commit head instruction, [sn:%i] [tid:%i]\n",
+                head_inst->seqNum, tid);
 
         // If the head instruction is squashed, it is ready to retire at any
         // time.  However, we need to avoid updating any other state
         // incorrectly if it's already been squashed.
         if (head_inst->isSquashed()) {
 
-            DPRINTF(Commit, "Commit: Retiring squashed instruction from "
+            DPRINTF(Commit, "Retiring squashed instruction from "
                     "ROB.\n");
 
             // Tell ROB to retire head instruction.  This retires the head
             // inst in the ROB without affecting any other stages.
-            rob->retireHead();
+            rob->retireHead(commit_thread);
 
             ++commitSquashedInsts;
 
+            // Record that the number of ROB entries has changed.
+            changedROBNumEntries[tid] = true;
         } else {
+            PC[tid] = head_inst->readPC();
+            nextPC[tid] = head_inst->readNextPC();
+
             // Increment the total number of non-speculative instructions
             // executed.
             // Hack for now: it really shouldn't happen until after the
             // commit is deemed to be successful, but this count is needed
             // for syscalls.
-            cpu->funcExeInst++;
+            thread[tid]->funcExeInst++;
 
             // Try to commit the head instruction.
             bool commit_success = commitHead(head_inst, num_committed);
 
-            // Update what instruction we are looking at if the commit worked.
             if (commit_success) {
                 ++num_committed;
 
-                // Send back which instruction has been committed.
-                // @todo: Update this later when a wider pipeline is used.
-                // Hmm, can't really give a pointer here...perhaps the
-                // sequence number instead (copy).
-                toIEW->commitInfo.doneSeqNum = head_inst->seqNum;
+                // Record that the number of ROB entries has changed.
+                changedROBNumEntries[tid] = true;
+
+                // Set the doneSeqNum to the youngest committed instruction.
+                toIEW->commitInfo[tid].doneSeqNum = head_inst->seqNum;
 
                 ++commitCommittedInsts;
 
-                if (!head_inst->isNop()) {
-                    cpu->instDone();
+                // To match the old model, don't count nops and instruction
+                // prefetches towards the total commit count.
+                if (!head_inst->isNop() && !head_inst->isInstPrefetch()) {
+                    cpu->instDone(tid);
                 }
+
+                PC[tid] = nextPC[tid];
+#if FULL_SYSTEM
+                int count = 0;
+                Addr oldpc;
+                do {
+                    if (count == 0)
+                        assert(!thread[tid]->inSyscall && !thread[tid]->trapPending);
+                    oldpc = PC[tid];
+                    cpu->system->pcEventQueue.service(
+                        thread[tid]->getXCProxy());
+                    count++;
+                } while (oldpc != PC[tid]);
+                if (count > 1) {
+                    DPRINTF(Commit, "PC skip function event, stopping commit\n");
+                    break;
+                }
+#endif
             } else {
+                DPRINTF(Commit, "Unable to commit head instruction PC:%#x "
+                        "[tid:%i] [sn:%i].\n",
+                        head_inst->readPC(), tid ,head_inst->seqNum);
                 break;
             }
         }
-
-        // Update the pointer to read the next instruction in the ROB.
-        head_inst = rob->readHeadInst();
     }
 
     DPRINTF(CommitRate, "%i\n", num_committed);
-    n_committed_dist.sample(num_committed);
+    numCommittedDist.sample(num_committed);
 }
 
 template <class Impl>
 bool
-SimpleCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num)
+DefaultCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num)
 {
     // Make sure instruction is valid
     assert(head_inst);
 
+    int tid = head_inst->threadNumber;
+
     // If the instruction is not executed yet, then it is a non-speculative
     // or store inst.  Signal backwards that it should be executed.
     if (!head_inst->isExecuted()) {
         // Keep this number correct.  We have not yet actually executed
         // and committed this instruction.
-        cpu->funcExeInst--;
+        thread[tid]->funcExeInst--;
+
+        head_inst->reachedCommit = true;
+
+        if (head_inst->isNonSpeculative() ||
+            head_inst->isMemBarrier() ||
+            head_inst->isWriteBarrier()) {
+#if !FULL_SYSTEM
+            // Hack to make sure syscalls aren't executed until all stores
+            // write back their data.  This direct communication shouldn't
+            // be used for anything other than this.
+            if (inst_num > 0 || iewStage->hasStoresToWB())
+#else
+            if ((head_inst->isMemBarrier() || head_inst->isWriteBarrier() ||
+                    head_inst->isQuiesce()) &&
+                iewStage->hasStoresToWB())
+#endif
+            {
+                DPRINTF(Commit, "Waiting for all stores to writeback.\n");
+                return false;
+            }
 
-        if (head_inst->isNonSpeculative()) {
-            DPRINTF(Commit, "Commit: Encountered a store or non-speculative "
-                    "instruction at the head of the ROB, PC %#x.\n",
-                    head_inst->readPC());
+            DPRINTF(Commit, "Encountered a barrier or non-speculative "
+                    "instruction [sn:%lli] at the head of the ROB, PC %#x.\n",
+                    head_inst->seqNum, head_inst->readPC());
 
-            toIEW->commitInfo.nonSpecSeqNum = head_inst->seqNum;
+            // Send back the non-speculative instruction's sequence number.
+            toIEW->commitInfo[tid].nonSpecSeqNum = head_inst->seqNum;
 
             // Change the instruction so it won't try to commit again until
             // it is executed.
@@ -371,25 +983,34 @@ SimpleCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num)
 
             ++commitNonSpecStalls;
 
+            return false;
+        } else if (head_inst->isLoad()) {
+            DPRINTF(Commit, "[sn:%lli]: Uncached load, PC %#x.\n",
+                    head_inst->seqNum, head_inst->readPC());
+
+            // Send back the non-speculative instruction's sequence
+            // number.  Maybe just tell the lsq to re-execute the load.
+            toIEW->commitInfo[tid].nonSpecSeqNum = head_inst->seqNum;
+            toIEW->commitInfo[tid].uncached = true;
+            toIEW->commitInfo[tid].uncachedLoad = head_inst;
+
+            head_inst->clearCanCommit();
+
             return false;
         } else {
-            panic("Commit: Trying to commit un-executed instruction "
+            panic("Trying to commit un-executed instruction "
                   "of unknown type!\n");
         }
     }
 
     // Now check if it's one of the special trap or barrier or
     // serializing instructions.
-    if (head_inst->isThreadSync()  ||
-        head_inst->isSerializing() ||
-        head_inst->isMemBarrier()  ||
-        head_inst->isWriteBarrier() )
+    if (head_inst->isThreadSync())/*  ||
+//        head_inst->isMemBarrier()  ||
+head_inst->isWriteBarrier())*/
     {
-        // Not handled for now.  Mem barriers and write barriers are safe
-        // to simply let commit as memory accesses only happen once they
-        // reach the head of commit.  Not sure about the other two.
-        panic("Serializing or barrier instructions"
-              " are not handled yet.\n");
+        // Not handled for now.
+        panic("Barrier instructions are not handled yet.\n");
     }
 
     // Check if the instruction caused a fault.  If so, trap.
@@ -398,7 +1019,32 @@ SimpleCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num)
     if (inst_fault != NoFault) {
         if (!head_inst->isNop()) {
 #if FULL_SYSTEM
-            cpu->trap(inst_fault);
+            DPRINTF(Commit, "Inst [sn:%lli] PC %#x has a fault\n",
+                    head_inst->seqNum, head_inst->readPC());
+
+            assert(!thread[tid]->inSyscall);
+
+            thread[tid]->inSyscall = true;
+
+            // Hack for now; DTB will sometimes need the machine instruction
+            // for when faults happen.  So we will set it here, prior to the
+            // DTB possibly needing it for this translation.
+            thread[tid]->setInst(
+                static_cast<TheISA::MachInst>(head_inst->staticInst->machInst));
+
+            // Consider holding onto the trap and waiting until the trap event
+            // happens for this to be executed.
+            cpu->trap(inst_fault, tid);
+
+            // Exit state update mode to avoid accidental updating.
+            thread[tid]->inSyscall = false;
+
+            commitStatus[tid] = TrapPending;
+
+            // Generate trap squash event.
+            generateTrapEvent(tid);
+
+            return false;
 #else // !FULL_SYSTEM
             panic("fault (%d) detected @ PC %08p", inst_fault,
                   head_inst->PC);
@@ -409,37 +1055,32 @@ SimpleCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num)
     // Check if we're really ready to commit.  If not then return false.
     // I'm pretty sure all instructions should be able to commit if they've
     // reached this far.  For now leave this in as a check.
-    if (!rob->isHeadReady()) {
-        panic("Commit: Unable to commit head instruction!\n");
+    if (!rob->isHeadReady(tid)) {
+        panic("Unable to commit head instruction!\n");
         return false;
     }
 
-    // If it's a branch, then send back branch prediction update info
-    // to the fetch stage.
-    // This should be handled in the iew stage if a mispredict happens...
-
     if (head_inst->isControl()) {
-
-#if 0
-        toIEW->nextPC = head_inst->readPC();
-        //Maybe switch over to BTB incorrect.
-        toIEW->btbMissed = head_inst->btbMiss();
-        toIEW->target = head_inst->nextPC;
-        //Maybe also include global history information.
-        //This simple version will have no branch prediction however.
-#endif
-
         ++commitCommittedBranches;
     }
 
     // Now that the instruction is going to be committed, finalize its
     // trace data.
     if (head_inst->traceData) {
+        head_inst->traceData->setFetchSeq(head_inst->seqNum);
+        head_inst->traceData->setCPSeq(thread[tid]->numInst);
         head_inst->traceData->finalize();
+        head_inst->traceData = NULL;
     }
 
-    //Finally clear the head ROB entry.
-    rob->retireHead();
+    // Update the commit rename map
+    for (int i = 0; i < head_inst->numDestRegs(); i++) {
+        renameMap[tid]->setEntry(head_inst->destRegIdx(i),
+                                 head_inst->renamedDestRegIdx(i));
+    }
+
+    // Finally clear the head ROB entry.
+    rob->retireHead(tid);
 
     // Return true to indicate that we have committed an instruction.
     return true;
@@ -447,37 +1088,45 @@ SimpleCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num)
 
 template <class Impl>
 void
-SimpleCommit<Impl>::getInsts()
+DefaultCommit<Impl>::getInsts()
 {
     //////////////////////////////////////
     // Handle ROB functions
     //////////////////////////////////////
 
-    // Read any issued instructions and place them into the ROB.  Do this
+    // Read any renamed instructions and place them into the ROB.  Do this
     // prior to squashing to avoid having instructions in the ROB that
     // don't get squashed properly.
     int insts_to_process = min((int)renameWidth, fromRename->size);
 
-    for (int inst_num = 0;
-         inst_num < insts_to_process;
-         ++inst_num)
+    for (int inst_num = 0; inst_num < insts_to_process; ++inst_num)
     {
-        if (!fromRename->insts[inst_num]->isSquashed()) {
-            DPRINTF(Commit, "Commit: Inserting PC %#x into ROB.\n",
-                    fromRename->insts[inst_num]->readPC());
-            rob->insertInst(fromRename->insts[inst_num]);
+        DynInstPtr inst = fromRename->insts[inst_num];
+        int tid = inst->threadNumber;
+
+        if (!inst->isSquashed() &&
+            commitStatus[tid] != ROBSquashing) {
+            changedROBNumEntries[tid] = true;
+
+            DPRINTF(Commit, "Inserting PC %#x [sn:%i] [tid:%i] into ROB.\n",
+                    inst->readPC(), inst->seqNum, tid);
+
+            rob->insertInst(inst);
+
+            assert(rob->getThreadEntries(tid) <= rob->getMaxEntries(tid));
+
+            youngestSeqNum[tid] = inst->seqNum;
         } else {
-            DPRINTF(Commit, "Commit: Instruction %i PC %#x was "
+            DPRINTF(Commit, "Instruction PC %#x [sn:%i] [tid:%i] was "
                     "squashed, skipping.\n",
-                    fromRename->insts[inst_num]->seqNum,
-                    fromRename->insts[inst_num]->readPC());
+                    inst->readPC(), inst->seqNum, tid);
         }
     }
 }
 
 template <class Impl>
 void
-SimpleCommit<Impl>::markCompletedInsts()
+DefaultCommit<Impl>::markCompletedInsts()
 {
     // Grab completed insts out of the IEW instruction queue, and mark
     // instructions completed within the ROB.
@@ -485,18 +1134,159 @@ SimpleCommit<Impl>::markCompletedInsts()
          inst_num < fromIEW->size && fromIEW->insts[inst_num];
          ++inst_num)
     {
-        DPRINTF(Commit, "Commit: Marking PC %#x, SN %i ready within ROB.\n",
-                fromIEW->insts[inst_num]->readPC(),
-                fromIEW->insts[inst_num]->seqNum);
-
-        // Mark the instruction as ready to commit.
-        fromIEW->insts[inst_num]->setCanCommit();
+        if (!fromIEW->insts[inst_num]->isSquashed()) {
+            DPRINTF(Commit, "[tid:%i]: Marking PC %#x, SN %i ready within ROB.\n",
+                    fromIEW->insts[inst_num]->threadNumber,
+                    fromIEW->insts[inst_num]->readPC(),
+                    fromIEW->insts[inst_num]->seqNum);
+
+            // Mark the instruction as ready to commit.
+            fromIEW->insts[inst_num]->setCanCommit();
+        }
     }
 }
 
 template <class Impl>
 uint64_t
-SimpleCommit<Impl>::readCommitPC()
+DefaultCommit<Impl>::readPC()
+{
+    // @todo: Fix this single thread hack.
+    return PC[0];
+}
+
+template <class Impl>
+void
+DefaultCommit<Impl>::setSquashing(unsigned tid)
 {
-    return rob->readHeadPC();
+    if (_status == Inactive) {
+        DPRINTF(Activity, "Activating stage.\n");
+        _status = Active;
+        cpu->activateStage(FullCPU::CommitIdx);
+    }
+
+    if (commitStatus[tid] != ROBSquashing) {
+        commitStatus[tid] = ROBSquashing;
+        ++squashCounter;
+    }
+}
+
+template <class Impl>
+bool
+DefaultCommit<Impl>::robDoneSquashing()
+{
+    // True only when the ROB reports that every active thread has
+    // finished squashing; stops at the first thread that has not.
+    for (list<unsigned>::iterator it = activeThreads->begin();
+         it != activeThreads->end(); ++it) {
+        unsigned thread_id = *it;
+        if (!rob->isDoneSquashing(thread_id))
+            return false;
+    }
+
+    return true;
+}
+
+////////////////////////////////////////
+//                                    //
+//  SMT COMMIT POLICY MAINTAINED HERE //
+//                                    //
+////////////////////////////////////////
+template <class Impl>
+int
+DefaultCommit<Impl>::getCommittingThread()
+{
+    // With a single thread there is no policy decision to make: the
+    // front of the active list commits whenever its status permits.
+    if (numThreads <= 1) {
+        const int only_tid = activeThreads->front();
+
+        const bool can_commit =
+            commitStatus[only_tid] == Running ||
+            commitStatus[only_tid] == Idle ||
+            commitStatus[only_tid] == FetchTrapPending;
+
+        return can_commit ? only_tid : -1;
+    }
+
+    // With several threads, defer to the configured SMT commit policy.
+    switch (commitPolicy) {
+
+      case Aggressive:
+        // Under the Aggressive policy, commit calls this function
+        // multiple times per cycle; each call picks the same way as
+        // OldestReady, so the two cases share one body.
+      case OldestReady:
+        return oldestReady();
+
+      case RoundRobin:
+        return roundRobin();
+
+      default:
+        return -1;
+    }
+}
+
+template<class Impl>
+int
+DefaultCommit<Impl>::roundRobin()
+{
+    // Walk the priority list front to back and pick the first thread
+    // that is allowed to commit and has a ready ROB head.
+    for (list<unsigned>::iterator it = priority_list.begin(),
+             last = priority_list.end(); it != last; ++it) {
+        const unsigned candidate = *it;
+
+        const bool may_commit = commitStatus[candidate] == Running ||
+                                commitStatus[candidate] == Idle;
+
+        if (!may_commit || !rob->isHeadReady(candidate))
+            continue;
+
+        // Move the chosen thread to the back of the list before
+        // returning it, so the threads ahead of it are tried first next.
+        priority_list.erase(it);
+        priority_list.push_back(candidate);
+
+        return candidate;
+    }
+
+    return -1;
+}
+
+template<class Impl>
+int
+DefaultCommit<Impl>::oldestReady()
+{
+    // Returns the active thread whose ready ROB head has the lowest
+    // sequence number, or -1 if no thread currently qualifies.
+    unsigned oldest = 0;
+    bool first = true;
+
+    list<unsigned>::iterator threads = (*activeThreads).begin();
+
+    while (threads != (*activeThreads).end()) {
+        unsigned tid = *threads++;
+
+        if (!rob->isEmpty(tid) &&
+            (commitStatus[tid] == Running ||
+             commitStatus[tid] == Idle ||
+             commitStatus[tid] == FetchTrapPending)) {
+            if (rob->isHeadReady(tid)) {
+                DynInstPtr head_inst = rob->readHeadInst(tid);
+                // 'oldest' is a tid; compare against that thread's head.
+                if (first) {
+                    oldest = tid;
+                    first = false;
+                } else if (head_inst->seqNum <
+                           rob->readHeadInst(oldest)->seqNum) {
+                    oldest = tid;
+                }
+            }
+        }
+    }
+
+    if (first)
+        return -1;
+
+    return oldest;
 }
diff --git a/cpu/o3/cpu.cc b/cpu/o3/cpu.cc
index 62d68bb33..d322037bc 100644
--- a/cpu/o3/cpu.cc
+++ b/cpu/o3/cpu.cc
@@ -41,11 +41,19 @@
 #include "cpu/o3/alpha_impl.hh"
 #include "cpu/o3/cpu.hh"
 
+#include "sim/stat_control.hh"
+
 using namespace std;
 
-BaseFullCPU::BaseFullCPU(Params &params)
-    : BaseCPU(&params), cpu_id(0)
+BaseFullCPU::BaseFullCPU(Params *params)
+    : BaseCPU(params), cpu_id(0)
+{
+}
+
+void
+BaseFullCPU::regStats()
 {
+    BaseCPU::regStats();
 }
 
 template <class Impl>
@@ -70,96 +78,76 @@ FullO3CPU<Impl>::TickEvent::description()
 
 //Call constructor to all the pipeline stages here
 template <class Impl>
-FullO3CPU<Impl>::FullO3CPU(Params &params)
-#if FULL_SYSTEM
-    : BaseFullCPU(params),
-#else
+FullO3CPU<Impl>::FullO3CPU(Params *params)
     : BaseFullCPU(params),
-#endif // FULL_SYSTEM
       tickEvent(this),
+      removeInstsThisCycle(false),
       fetch(params),
       decode(params),
       rename(params),
       iew(params),
       commit(params),
 
-      regFile(params.numPhysIntRegs, params.numPhysFloatRegs),
+      regFile(params->numPhysIntRegs, params->numPhysFloatRegs),
 
-      freeList(TheISA::NumIntRegs, params.numPhysIntRegs,
-               TheISA::NumFloatRegs, params.numPhysFloatRegs),
+      freeList(params->numberOfThreads,//number of activeThreads
+               TheISA::NumIntRegs, params->numPhysIntRegs,
+               TheISA::NumFloatRegs, params->numPhysFloatRegs),
 
-      renameMap(TheISA::NumIntRegs, params.numPhysIntRegs,
-                TheISA::NumFloatRegs, params.numPhysFloatRegs,
-                TheISA::NumMiscRegs,
-                TheISA::ZeroReg,
-                TheISA::ZeroReg + TheISA::NumIntRegs),
+      rob(params->numROBEntries, params->squashWidth,
+          params->smtROBPolicy, params->smtROBThreshold,
+          params->numberOfThreads),
 
-      rob(params.numROBEntries, params.squashWidth),
+      scoreboard(params->numberOfThreads,//number of activeThreads
+                 TheISA::NumIntRegs, params->numPhysIntRegs,
+                 TheISA::NumFloatRegs, params->numPhysFloatRegs,
+                 TheISA::NumMiscRegs * number_of_threads,
+                 TheISA::ZeroReg),
 
       // What to pass to these time buffers?
       // For now just have these time buffers be pretty big.
+      // @todo: Make these time buffer sizes parameters.
       timeBuffer(5, 5),
       fetchQueue(5, 5),
       decodeQueue(5, 5),
       renameQueue(5, 5),
       iewQueue(5, 5),
-
-      cpuXC(NULL),
+      activityBuffer(5, 0),
+      activityCount(0),
 
       globalSeqNum(1),
 
 #if FULL_SYSTEM
-      system(params.system),
+      system(params->system),
       memCtrl(system->memctrl),
       physmem(system->physmem),
-      itb(params.itb),
-      dtb(params.dtb),
-      mem(params.mem),
+      mem(params->mem),
 #else
-      // Hardcoded for a single thread!!
-      mem(params.workload[0]->getMemory()),
+      pTable(params->pTable),
 #endif // FULL_SYSTEM
 
-      icacheInterface(params.icacheInterface),
-      dcacheInterface(params.dcacheInterface),
-      deferRegistration(params.defReg),
-      numInsts(0),
-      funcExeInst(0)
+      icacheInterface(params->icacheInterface),
+      dcacheInterface(params->dcacheInterface),
+      deferRegistration(params->deferRegistration)
 {
     _status = Idle;
 
 #if !FULL_SYSTEM
-    thread.resize(this->number_of_threads);
+    thread.resize(number_of_threads);
+    tids.resize(number_of_threads);
 #endif
 
-    for (int i = 0; i < this->number_of_threads; ++i) {
-#if FULL_SYSTEM
-        assert(i == 0);
-        thread[i] = new CPUExecContext(this, 0, system, itb, dtb, mem);
-        system->execContexts[i] = thread[i]->getProxy();
-
-        execContexts.push_back(system->execContexts[i]);
-#else
-        if (i < params.workload.size()) {
-            DPRINTF(FullCPU, "FullCPU: Workload[%i]'s starting PC is %#x, "
-                    "process is %#x",
-                    i, params.workload[i]->prog_entry, thread[i]);
-            thread[i] = new CPUExecContext(this, i, params.workload[i], i);
-        }
-        assert(params.workload[i]->getMemory() != NULL);
-        assert(mem != NULL);
-        execContexts.push_back(thread[i]->getProxy());
-#endif // !FULL_SYSTEM
-    }
-
-    // Note that this is a hack so that my code which still uses xc-> will
-    // still work.  I should remove this eventually
-    cpuXC = thread[0];
-
     // The stages also need their CPU pointer setup.  However this must be
     // done at the upper level CPU because they have pointers to the upper
     // level CPU, and not this FullO3CPU.
 
+    // Set up Pointers to the activeThreads list for each stage
+    fetch.setActiveThreads(&activeThreads);
+    decode.setActiveThreads(&activeThreads);
+    rename.setActiveThreads(&activeThreads);
+    iew.setActiveThreads(&activeThreads);
+    commit.setActiveThreads(&activeThreads);
+
     // Give each of the stages the time buffer they will use.
     fetch.setTimeBuffer(&timeBuffer);
     decode.setTimeBuffer(&timeBuffer);
@@ -170,6 +158,7 @@ FullO3CPU<Impl>::FullO3CPU(Params &params)
     // Also setup each of the stages' queues.
     fetch.setFetchQueue(&fetchQueue);
     decode.setFetchQueue(&fetchQueue);
+    commit.setFetchQueue(&fetchQueue);
     decode.setDecodeQueue(&decodeQueue);
     rename.setDecodeQueue(&decodeQueue);
     rename.setRenameQueue(&renameQueue);
@@ -178,16 +167,91 @@ FullO3CPU<Impl>::FullO3CPU(Params &params)
     commit.setIEWQueue(&iewQueue);
     commit.setRenameQueue(&renameQueue);
 
+    commit.setIEWStage(&iew);
+    rename.setIEWStage(&iew);
+    rename.setCommitStage(&commit);
+
+    //Make sure that this is a valid architecture
+    //@todo: move this up in constructor
+    numThreads = number_of_threads;
+
+#if !FULL_SYSTEM
+    int activeThreads = params->workload.size();
+#else
+    int activeThreads = 1;
+#endif
+
+    assert(params->numPhysIntRegs   >= numThreads * TheISA::NumIntRegs);
+    assert(params->numPhysFloatRegs >= numThreads * TheISA::NumFloatRegs);
+
+    rename.setScoreboard(&scoreboard);
+    iew.setScoreboard(&scoreboard);
+
     // Setup the rename map for whichever stages need it.
-    rename.setRenameMap(&renameMap);
-    iew.setRenameMap(&renameMap);
+    PhysRegIndex lreg_idx = 0;
+    PhysRegIndex freg_idx = params->numPhysIntRegs; //Index to 1 after int regs
+
+    for (int tid=0; tid < numThreads; tid++) {
+        bool bindRegs = (tid <= activeThreads - 1);
+
+        commitRenameMap[tid].init(TheISA::NumIntRegs,
+                                  params->numPhysIntRegs,
+                                  lreg_idx,                   //Index for Logical. Regs
+
+                                  TheISA::NumFloatRegs,
+                                  params->numPhysFloatRegs,
+                                  freg_idx,                   //Index for Float Regs
+
+                                  TheISA::NumMiscRegs,
 
-    // Setup the free list for whichever stages need it.
+                                  TheISA::ZeroReg,
+                                  TheISA::ZeroReg,
+
+                                  tid,
+                                  false);
+
+        renameMap[tid].init(TheISA::NumIntRegs,
+                            params->numPhysIntRegs,
+                            lreg_idx,                   //Index for Logical. Regs
+
+                            TheISA::NumFloatRegs,
+                            params->numPhysFloatRegs,
+                            freg_idx,                   //Index for Float Regs
+
+                            TheISA::NumMiscRegs,
+
+                            TheISA::ZeroReg,
+                            TheISA::ZeroReg,
+
+                            tid,
+                            bindRegs);
+    }
+
+    rename.setRenameMap(renameMap);
+    commit.setRenameMap(commitRenameMap);
+
+    // Give renameMap & rename stage access to the freeList;
+    for (int i=0; i < numThreads; i++) {
+        renameMap[i].setFreeList(&freeList);
+    }
     rename.setFreeList(&freeList);
-    renameMap.setFreeList(&freeList);
+
+    // Setup the page table for whichever stages need it.
+#if !FULL_SYSTEM
+    fetch.setPageTable(pTable);
+    iew.setPageTable(pTable);
+#endif
 
     // Setup the ROB for whichever stages need it.
     commit.setROB(&rob);
+
+    lastRunningCycle = curTick;
+
+    for (int i = 0; i < NumStages; ++i) {
+        stageActive[i] = false;
+    }
+
+    contextSwitch = false;
 }
 
 template <class Impl>
@@ -199,7 +263,58 @@ template <class Impl>
 void
 FullO3CPU<Impl>::fullCPURegStats()
 {
+    BaseFullCPU::regStats();
+
     // Register any of the FullCPU's stats here.
+    timesIdled
+        .name(name() + ".timesIdled")
+        .desc("Number of times that the entire CPU went into an idle state and"
+              " unscheduled itself")
+        .prereq(timesIdled);
+
+    idleCycles
+        .name(name() + ".idleCycles")
+        .desc("Total number of cycles that the CPU has spent unscheduled due "
+              "to idling")
+        .prereq(idleCycles);
+
+    // Number of Instructions simulated
+    // --------------------------------
+    // Should probably be in Base CPU but need templated
+    // MaxThreads so put in here instead
+    committedInsts
+        .init(numThreads)
+        .name(name() + ".committedInsts")
+        .desc("Number of Instructions Simulated");
+
+    totalCommittedInsts
+        .name(name() + ".committedInsts_total")
+        .desc("Number of Instructions Simulated");
+
+    cpi
+        .name(name() + ".cpi")
+        .desc("CPI: Cycles Per Instruction")
+        .precision(6);
+    cpi = simTicks / committedInsts;
+
+    totalCpi
+        .name(name() + ".cpi_total")
+        .desc("CPI: Total CPI of All Threads")
+        .precision(6);
+    totalCpi = simTicks / totalCommittedInsts;
+
+    ipc
+        .name(name() + ".ipc")
+        .desc("IPC: Instructions Per Cycle")
+        .precision(6);
+    ipc =  committedInsts / simTicks;
+
+    totalIpc
+        .name(name() + ".ipc_total")
+        .desc("IPC: Total IPC of All Threads")
+        .precision(6);
+    totalIpc =  totalCommittedInsts / simTicks;
+
 }
 
 template <class Impl>
@@ -208,9 +323,11 @@ FullO3CPU<Impl>::tick()
 {
     DPRINTF(FullCPU, "\n\nFullCPU: Ticking main, FullO3CPU.\n");
 
-    //Tick each of the stages if they're actually running.
-    //Will want to figure out a way to unschedule itself if they're all
-    //going to be idle for a long time.
+    ++numCycles;
+
+    activity = false;
+
+    //Tick each of the stages
     fetch.tick();
 
     decode.tick();
@@ -221,7 +338,11 @@ FullO3CPU<Impl>::tick()
 
     commit.tick();
 
-    // Now advance the time buffers, unless the stage is stalled.
+#if !FULL_SYSTEM
+    doContextSwitch();
+#endif
+
+    // Now advance the time buffers
     timeBuffer.advance();
 
     fetchQueue.advance();
@@ -229,81 +350,310 @@ FullO3CPU<Impl>::tick()
     renameQueue.advance();
     iewQueue.advance();
 
-    if (_status == Running && !tickEvent.scheduled())
+    advanceActivityBuffer();
+
+    if (removeInstsThisCycle) {
+        cleanUpRemovedInsts();
+    }
+
+    if (activityCount && !tickEvent.scheduled()) {
         tickEvent.schedule(curTick + 1);
+    }
+
+#if !FULL_SYSTEM
+    updateThreadPriority();
+#endif
+
 }
 
 template <class Impl>
 void
 FullO3CPU<Impl>::init()
 {
-    if(!deferRegistration)
-    {
-        this->registerExecContexts();
+    if (deferRegistration) {
+        return;
+    }
+
+    // Set inSyscall so that the CPU doesn't squash when initially
+    // setting up registers.
+    for (int i = 0; i < number_of_threads; ++i)
+        thread[i]->inSyscall = true;
 
+    registerExecContexts();
+
+    // Need to do a copy of the xc->regs into the CPU's regfile so
+    // that it can start properly.
+
+    for (int tid=0; tid < number_of_threads; tid++) {
         // Need to do a copy of the xc->regs into the CPU's regfile so
         // that it can start properly.
 #if FULL_SYSTEM
-        ExecContext *src_xc = system->execContexts[0];
-        TheISA::initCPU(src_xc, src_xc->readCpuId());
+        ExecContext *src_xc = system->execContexts[tid];
 #else
-        ExecContext *src_xc = thread[0]->getProxy();
+        ExecContext *src_xc = thread[tid]->getXCProxy();
 #endif
-        // First loop through the integer registers.
-        for (int i = 0; i < TheISA::NumIntRegs; ++i)
-        {
-            regFile.intRegFile[i] = src_xc->readIntReg(i);
+        // Threads start in the Suspended State
+        if (src_xc->status() != ExecContext::Suspended) {
+            continue;
         }
 
-        // Then loop through the floating point registers.
-        for (int i = 0; i < TheISA::NumFloatRegs; ++i)
-        {
-            regFile.floatRegFile[i].d = src_xc->readFloatRegDouble(i);
-            regFile.floatRegFile[i].q = src_xc->readFloatRegInt(i);
-        }
-/*
-        // Then loop through the misc registers.
-        regFile.miscRegs.fpcr = src_xc->regs.miscRegs.fpcr;
-        regFile.miscRegs.uniq = src_xc->regs.miscRegs.uniq;
-        regFile.miscRegs.lock_flag = src_xc->regs.miscRegs.lock_flag;
-        regFile.miscRegs.lock_addr = src_xc->regs.miscRegs.lock_addr;
-*/
-        // Then finally set the PC and the next PC.
-        regFile.pc = src_xc->readPC();
-        regFile.npc = src_xc->readNextPC();
+#if FULL_SYSTEM
+        TheISA::initCPU(src_xc, src_xc->readCpuId());
+#endif
+    }
+
+    // Clear inSyscall.
+    for (int i = 0; i < number_of_threads; ++i)
+        thread[i]->inSyscall = false;
+
+    // Probably should just make a call to all the stages to init stage,
+    // regardless of whether or not they need it.  Keeps it more independent.
+    fetch.initStage();
+    iew.initStage();
+    rename.initStage();
+    commit.initStage();
+
+    commit.setThreads(thread);
+}
+
+template <class Impl>
+void
+FullO3CPU<Impl>::insertThread(unsigned tid)
+{
+    DPRINTF(FullCPU,"[tid:%i] Initializing thread data\n", tid);
+    // The setup below is compiled out (#if 0); it will change now that the
+    // PC and thread state are internal to the CPU, not the CPUExecContext.
+#if 0
+#if FULL_SYSTEM
+    ExecContext *src_xc = system->execContexts[tid];
+#else
+    CPUExecContext *src_xc = thread[tid];
+#endif
+
+    //Bind Int Regs to Rename Map
+    for (int ireg = 0; ireg < TheISA::NumIntRegs; ireg++) {
+        PhysRegIndex phys_reg = freeList.getIntReg();
+
+        renameMap[tid].setEntry(ireg,phys_reg);
+        scoreboard.setReg(phys_reg);
+    }
+
+    //Bind Float Regs to Rename Map
+    for (int freg = 0; freg < TheISA::NumFloatRegs; freg++) {
+        PhysRegIndex phys_reg = freeList.getFloatReg();
+
+        renameMap[tid].setEntry(freg,phys_reg);
+        scoreboard.setReg(phys_reg);
+    }
+
+    //Copy Thread Data Into RegFile
+    this->copyFromXC(tid);
+
+    //Set PC/NPC
+    regFile.pc[tid]  = src_xc->readPC();
+    regFile.npc[tid] = src_xc->readNextPC();
+
+    src_xc->setStatus(ExecContext::Active);
+
+    activateContext(tid,1);
+
+    //Reset ROB/IQ/LSQ Entries
+    commit.rob->resetEntries();
+    iew.resetEntries();
+#endif
+}
+
+template <class Impl>
+void
+FullO3CPU<Impl>::removeThread(unsigned tid)
+{
+    DPRINTF(FullCPU,"[tid:%i] Removing thread data\n", tid);
+#if 0
+    //Unbind Int Regs from Rename Map
+    for (int ireg = 0; ireg < TheISA::NumIntRegs; ireg++) {
+        PhysRegIndex phys_reg = renameMap[tid].lookup(ireg);
+
+        scoreboard.unsetReg(phys_reg);
+        freeList.addReg(phys_reg);
+    }
+
+    //Unbind Float Regs from Rename Map
+    for (int freg = 0; freg < TheISA::NumFloatRegs; freg++) {
+        PhysRegIndex phys_reg = renameMap[tid].lookup(freg);
+
+        scoreboard.unsetReg(phys_reg);
+        freeList.addReg(phys_reg);
+    }
+
+    //Copy Thread Data From RegFile
+    /* Fix Me:
+     * Do we really need to do this if we are removing a thread
+     * in the sense that it's finished (exiting)? If the thread is just
+     * being suspended we might...
+     */
+//    this->copyToXC(tid);
+
+    //Squash Throughout Pipeline
+    fetch.squash(0,tid);
+    decode.squash(tid);
+    rename.squash(tid);
+
+    assert(iew.ldstQueue.getCount(tid) == 0);
+
+    //Reset ROB/IQ/LSQ Entries
+    if (activeThreads.size() >= 1) {
+        commit.rob->resetEntries();
+        iew.resetEntries();
+    }
+#endif
+}
+
+
+template <class Impl>
+void
+FullO3CPU<Impl>::activateWhenReady(int tid)
+{
+    DPRINTF(FullCPU,"[tid:%i]: Checking if resources are available for incoming"
+            "(e.g. PhysRegs/ROB/IQ/LSQ) \n",
+            tid);
+    // The thread is ready only if every resource has enough free entries.
+    bool ready = true;
+
+    if (freeList.numFreeIntRegs() < TheISA::NumIntRegs) {
+        DPRINTF(FullCPU,"[tid:%i] Suspending thread due to not enough "
+                "Phys. Int. Regs.\n",
+                tid);
+        ready = false;
+    } else if (freeList.numFreeFloatRegs() < TheISA::NumFloatRegs) {
+        DPRINTF(FullCPU,"[tid:%i] Suspending thread due to not enough "
+                "Phys. Float. Regs.\n",
+                tid);
+        ready = false;
+    } else if (commit.rob->numFreeEntries() <
+               commit.rob->entryAmount(activeThreads.size() + 1)) {
+        DPRINTF(FullCPU,"[tid:%i] Suspending thread due to not enough "
+                "ROB entries.\n",
+                tid);
+        ready = false;
+    } else if (iew.instQueue.numFreeEntries() <
+               iew.instQueue.entryAmount(activeThreads.size() + 1)) {
+        DPRINTF(FullCPU,"[tid:%i] Suspending thread due to not enough "
+                "IQ entries.\n",
+                tid);
+        ready = false;
+    } else if (iew.ldstQueue.numFreeEntries() <
+               iew.ldstQueue.entryAmount(activeThreads.size() + 1)) {
+        DPRINTF(FullCPU,"[tid:%i] Suspending thread due to not enough "
+                "LSQ entries.\n",
+                tid);
+        ready = false;
+    }
+
+    if (ready) {
+        insertThread(tid);
+
+        contextSwitch = false;
+
+        cpuWaitList.remove(tid);
+    } else {
+        suspendContext(tid);
+
+        //blocks fetch
+        contextSwitch = true;
+
+        //do waitlist
+        cpuWaitList.push_back(tid);
     }
 }
 
 template <class Impl>
 void
-FullO3CPU<Impl>::activateContext(int thread_num, int delay)
+FullO3CPU<Impl>::activateContext(int tid, int delay)
 {
+
     // Needs to set each stage to running as well.
+    list<unsigned>::iterator isActive = find(
+        activeThreads.begin(), activeThreads.end(), tid);
+
+    if (isActive == activeThreads.end()) {
+        //May Need to Re-code this if the delay variable is the
+        //delay needed for thread to activate
+        DPRINTF(FullCPU, "Adding Thread %i to active threads list\n",
+                tid);
+
+        activeThreads.push_back(tid);
+    }
+
+    assert(_status == Idle);
 
     scheduleTickEvent(delay);
 
+    // Be sure to signal that there's some activity so the CPU doesn't
+    // deschedule itself.
+    activityThisCycle();
+    fetch.wakeFromQuiesce();
+
     _status = Running;
 }
 
 template <class Impl>
 void
-FullO3CPU<Impl>::suspendContext(int thread_num)
+FullO3CPU<Impl>::suspendContext(int tid)
 {
-    panic("suspendContext unimplemented!");
+    DPRINTF(FullCPU,"[tid: %i]: Suspended ...\n", tid);
+    unscheduleTickEvent();
+    _status = Idle;
+/*
+    //Remove From Active List, if Active
+    list<unsigned>::iterator isActive = find(
+        activeThreads.begin(), activeThreads.end(), tid);
+
+    if (isActive != activeThreads.end()) {
+        DPRINTF(FullCPU,"[tid:%i]: Removing from active threads list\n",
+                tid);
+        activeThreads.erase(isActive);
+    }
+*/
 }
 
 template <class Impl>
 void
-FullO3CPU<Impl>::deallocateContext(int thread_num)
+FullO3CPU<Impl>::deallocateContext(int tid)
 {
-    panic("deallocateContext unimplemented!");
+    DPRINTF(FullCPU,"[tid:%i]: Deallocating ...\n", tid);
+/* Disabled code; for now this function only emits the trace message.
+    //Remove From Active List, if Active
+    list<unsigned>::iterator isActive = find(
+        activeThreads.begin(), activeThreads.end(), tid);
+
+    if (isActive != activeThreads.end()) {
+        DPRINTF(FullCPU,"[tid:%i]: Removing from active threads list\n",
+                tid);
+        activeThreads.erase(isActive);
+
+        removeThread(tid);
+    }
+*/
 }
 
 template <class Impl>
 void
-FullO3CPU<Impl>::haltContext(int thread_num)
+FullO3CPU<Impl>::haltContext(int tid)
 {
-    panic("haltContext unimplemented!");
+    DPRINTF(FullCPU,"[tid:%i]: Halted ...\n", tid);
+/* Disabled code; for now this function only emits the trace message.
+    //Remove From Active List, if Active
+    list<unsigned>::iterator isActive = find(
+        activeThreads.begin(), activeThreads.end(), tid);
+
+    if (isActive != activeThreads.end()) {
+        DPRINTF(FullCPU,"[tid:%i]: Removing from active threads list\n",
+                tid);
+        activeThreads.erase(isActive);
+
+        removeThread(tid);
+    }
+*/
 }
 
 template <class Impl>
@@ -336,7 +686,6 @@ template <class Impl>
 InstSeqNum
 FullO3CPU<Impl>::getAndIncrementInstSeq()
 {
-    // Hopefully this works right.
     return globalSeqNum++;
 }
 
@@ -398,124 +747,274 @@ FullO3CPU<Impl>::setFloatRegInt(int reg_idx, uint64_t val)
 
 template <class Impl>
 uint64_t
-FullO3CPU<Impl>::readPC()
+FullO3CPU<Impl>::readArchIntReg(int reg_idx, unsigned tid)
+{
+    PhysRegIndex phys_reg = commitRenameMap[tid].lookup(reg_idx);
+
+    return regFile.readIntReg(phys_reg);
+}
+
+template <class Impl>
+float
+FullO3CPU<Impl>::readArchFloatRegSingle(int reg_idx, unsigned tid)
+{
+    PhysRegIndex phys_reg = commitRenameMap[tid].lookup(reg_idx);
+
+    return regFile.readFloatRegSingle(phys_reg);
+}
+
+template <class Impl>
+double
+FullO3CPU<Impl>::readArchFloatRegDouble(int reg_idx, unsigned tid)
+{
+    PhysRegIndex phys_reg = commitRenameMap[tid].lookup(reg_idx);
+
+    return regFile.readFloatRegDouble(phys_reg);
+}
+
+template <class Impl>
+uint64_t
+FullO3CPU<Impl>::readArchFloatRegInt(int reg_idx, unsigned tid)
 {
-    return regFile.readPC();
+    PhysRegIndex phys_reg = commitRenameMap[tid].lookup(reg_idx);
+
+    return regFile.readFloatRegInt(phys_reg);
 }
 
 template <class Impl>
 void
-FullO3CPU<Impl>::setNextPC(uint64_t val)
+FullO3CPU<Impl>::setArchIntReg(int reg_idx, uint64_t val, unsigned tid)
 {
-    regFile.setNextPC(val);
+    if (reg_idx == TheISA::ZeroReg) {
+        warn("Setting r31 through ArchIntReg in CPU, cycle %i\n", curTick);
+    }
+    // Warning only: the write below is still performed for ZeroReg.
+    PhysRegIndex phys_reg = commitRenameMap[tid].lookup(reg_idx);
+    // Write through the physical register this thread's commit map names.
+    regFile.setIntReg(phys_reg, val);
 }
 
 template <class Impl>
 void
-FullO3CPU<Impl>::setPC(Addr new_PC)
+FullO3CPU<Impl>::setArchFloatRegSingle(int reg_idx, float val, unsigned tid)
 {
-    regFile.setPC(new_PC);
+    PhysRegIndex phys_reg = commitRenameMap[tid].lookup(reg_idx);
+
+    regFile.setFloatRegSingle(phys_reg, val);
 }
 
 template <class Impl>
 void
-FullO3CPU<Impl>::addInst(DynInstPtr &inst)
+FullO3CPU<Impl>::setArchFloatRegDouble(int reg_idx, double val, unsigned tid)
 {
-    instList.push_back(inst);
+    PhysRegIndex phys_reg = commitRenameMap[tid].lookup(reg_idx);
+
+    regFile.setFloatRegDouble(phys_reg, val);
 }
 
 template <class Impl>
 void
-FullO3CPU<Impl>::instDone()
+FullO3CPU<Impl>::setArchFloatRegInt(int reg_idx, uint64_t val, unsigned tid)
 {
-    // Keep an instruction count.
-    numInsts++;
+    PhysRegIndex phys_reg = commitRenameMap[tid].lookup(reg_idx);
 
-    // Check for instruction-count-based events.
-    comInstEventQueue[0]->serviceEvents(numInsts);
+    regFile.setFloatRegInt(phys_reg, val);
+}
+
+template <class Impl>
+uint64_t
+FullO3CPU<Impl>::readPC(unsigned tid)
+{
+    return commit.readPC(tid);
 }
 
 template <class Impl>
 void
-FullO3CPU<Impl>::removeBackInst(DynInstPtr &inst)
+FullO3CPU<Impl>::setPC(Addr new_PC,unsigned tid)
 {
-    DynInstPtr inst_to_delete;
+    commit.setPC(new_PC, tid);
+}
 
-    // Walk through the instruction list, removing any instructions
-    // that were inserted after the given instruction, inst.
-    while (instList.back() != inst)
-    {
-        assert(!instList.empty());
+template <class Impl>
+uint64_t
+FullO3CPU<Impl>::readNextPC(unsigned tid)
+{
+    return commit.readNextPC(tid);
+}
+
+template <class Impl>
+void
+FullO3CPU<Impl>::setNextPC(uint64_t val,unsigned tid)
+{
+    commit.setNextPC(val, tid);
+}
 
-        // Obtain the pointer to the instruction.
-        inst_to_delete = instList.back();
+template <class Impl>
+typename FullO3CPU<Impl>::ListIt
+FullO3CPU<Impl>::addInst(DynInstPtr &inst)
+{
+    ListIt new_entry = instList.insert(instList.end(), inst);
 
-        DPRINTF(FullCPU, "FullCPU: Removing instruction %i, PC %#x\n",
-                inst_to_delete->seqNum, inst_to_delete->readPC());
+    return new_entry;
+}
 
-        // Remove the instruction from the list.
-        instList.pop_back();
+template <class Impl>
+void
+FullO3CPU<Impl>::instDone(unsigned tid)
+{
+    // Update per-thread counts and the per-thread/total commit stats.
+    thread[tid]->numInst++;
+    thread[tid]->numInsts++;
+    committedInsts[tid]++;
+    totalCommittedInsts++;
 
-        // Mark it as squashed.
-        inst_to_delete->setSquashed();
-    }
+    // Service any events keyed on this thread's instruction count.
+    comInstEventQueue[tid]->serviceEvents(thread[tid]->numInst);
+}
+
+template <class Impl>
+void
+FullO3CPU<Impl>::addToRemoveList(DynInstPtr &inst)
+{
+    removeInstsThisCycle = true;
+
+    removeList.push(inst->getInstListIt());
 }
 
 template <class Impl>
 void
 FullO3CPU<Impl>::removeFrontInst(DynInstPtr &inst)
 {
-    DynInstPtr inst_to_remove;
+    unsigned tid = inst->threadNumber;
 
-    // The front instruction should be the same one being asked to be removed.
-    assert(instList.front() == inst);
+    DPRINTF(FullCPU, "FullCPU: Removing committed instruction [tid:%i] PC %#x "
+            "[sn:%lli]\n",
+            tid, inst->readPC(), inst->seqNum);
 
-    // Remove the front instruction.
-    inst_to_remove = inst;
-    instList.pop_front();
+    removeInstsThisCycle = true;
 
-    DPRINTF(FullCPU, "FullCPU: Removing committed instruction %#x, PC %#x\n",
-            inst_to_remove, inst_to_remove->readPC());
+    // Queue its list entry for removal by cleanUpRemovedInsts().
+    removeList.push(inst->getInstListIt());
 }
 
 template <class Impl>
 void
-FullO3CPU<Impl>::removeInstsNotInROB()
+FullO3CPU<Impl>::removeInstsNotInROB(unsigned tid)
 {
-    DPRINTF(FullCPU, "FullCPU: Deleting instructions from instruction "
-            "list.\n");
+    DPRINTF(FullCPU, "FullCPU: Thread %i: Deleting instructions from instruction"
+            " list.\n", tid);
+
+    ListIt end_it;
+    // end_it is where the backwards walk stops: ROB tail, or list front.
+    bool rob_empty = false;
+    // NOTE(review): isEmpty() is called without tid, unlike readTailInst(tid).
+    if (instList.empty()) {
+        return;
+    } else if (rob.isEmpty(/*tid*/)) {
+        DPRINTF(FullCPU, "FullCPU: ROB is empty, squashing all insts.\n");
+        end_it = instList.begin();
+        rob_empty = true;
+    } else {
+        end_it = (rob.readTailInst(tid))->getInstListIt();
+        DPRINTF(FullCPU, "FullCPU: ROB is not empty, squashing insts not in ROB.\n");
+    }
+
+    removeInstsThisCycle = true;
+
+    ListIt inst_it = instList.end();
+
+    inst_it--;
+
+    // Walk through the instruction list, removing any instructions
+    // that were inserted after the given instruction iterator, end_it.
+    while (inst_it != end_it) {
+        assert(!instList.empty());
+
+        bool break_loop = (inst_it == instList.begin());
+
+        squashInstIt(inst_it, tid);
+        if (break_loop)
+            break;
 
-    DynInstPtr rob_tail = rob.readTailInst();
+        // Not at begin() here, so stepping back is well-defined.
+        --inst_it;
+    }
 
-    removeBackInst(rob_tail);
+    // If the ROB was empty, then we actually need to remove the first
+    // instruction as well.
+    if (rob_empty) {
+        squashInstIt(inst_it, tid);
+    }
 }
 
 template <class Impl>
 void
-FullO3CPU<Impl>::removeInstsUntil(const InstSeqNum &seq_num)
+FullO3CPU<Impl>::removeInstsUntil(const InstSeqNum &seq_num,
+                                  unsigned tid)
 {
+    assert(!instList.empty());
+
+    removeInstsThisCycle = true;
+
+    ListIt inst_iter = instList.end();
+    // Start at the last entry of the list and walk towards the front.
+    inst_iter--;
+
     DPRINTF(FullCPU, "FullCPU: Deleting instructions from instruction "
-            "list.\n");
+            "list that are from [tid:%i] and above [sn:%lli] (end=%lli).\n",
+            tid, seq_num, (*inst_iter)->seqNum);
 
-    DynInstPtr inst_to_delete;
+    while ((*inst_iter)->seqNum > seq_num) {
 
-    while (instList.back()->seqNum > seq_num) {
-        assert(!instList.empty());
+        bool break_loop = (inst_iter == instList.begin());
 
-        // Obtain the pointer to the instruction.
-        inst_to_delete = instList.back();
+        squashInstIt(inst_iter, tid);
 
-        DPRINTF(FullCPU, "FullCPU: Removing instruction %i, PC %#x\n",
-                inst_to_delete->seqNum, inst_to_delete->readPC());
+        if (break_loop) break;
 
-        // Remove the instruction from the list.
-        instList.back() = NULL;
-        instList.pop_back();
+        // Not at begin() here, so stepping back is well-defined.
+        --inst_iter;
+    }
+}
+
+template <class Impl>
+inline void
+FullO3CPU<Impl>::squashInstIt(const ListIt &instIt, const unsigned &tid)
+{
+    if ((*instIt)->threadNumber == tid) {
+        DPRINTF(FullCPU, "FullCPU: Squashing instruction, "
+                "[tid:%i] [sn:%lli] PC %#x\n",
+                (*instIt)->threadNumber,
+                (*instIt)->seqNum,
+                (*instIt)->readPC());
 
         // Mark it as squashed.
-        inst_to_delete->setSquashed();
+        (*instIt)->setSquashed();
+
+        //@todo: Formulate a consistent method for deleting
+        //instructions from the instruction list
+        // Queue the entry; cleanUpRemovedInsts() erases it from the list.
+        removeList.push(instIt);
+    }
+}
+
+template <class Impl>
+void
+FullO3CPU<Impl>::cleanUpRemovedInsts()
+{
+    while (!removeList.empty()) {
+        DPRINTF(FullCPU, "FullCPU: Removing instruction, "
+                "[tid:%i] [sn:%lli] PC %#x\n",
+                (*removeList.front())->threadNumber,
+                (*removeList.front())->seqNum,
+                (*removeList.front())->readPC());
+        // Drain in FIFO order: erase the entry, then pop its iterator.
+        instList.erase(removeList.front());
+
+        removeList.pop();
     }
 
+    removeInstsThisCycle = false;
 }
 
 template <class Impl>
@@ -530,16 +1029,22 @@ void
 FullO3CPU<Impl>::dumpInsts()
 {
     int num = 0;
-    typename list<DynInstPtr>::iterator inst_list_it = instList.begin();
 
-    while (inst_list_it != instList.end())
-    {
-        cprintf("Instruction:%i\nPC:%#x\nSN:%lli\nIssued:%i\nSquashed:%i\n\n",
-                num, (*inst_list_it)->readPC(), (*inst_list_it)->seqNum,
-                (*inst_list_it)->isIssued(), (*inst_list_it)->isSquashed());
+    ListIt inst_list_it = instList.begin();
+
+    cprintf("Dumping Instruction List\n");
+
+    while (inst_list_it != instList.end()) {
+        cprintf("Instruction:%i\nPC:%#x\n[tid:%i]\n[sn:%lli]\nIssued:%i\n"
+                "Squashed:%i\n\n",
+                num, (*inst_list_it)->readPC(), (*inst_list_it)->threadNumber,
+                (*inst_list_it)->seqNum, (*inst_list_it)->isIssued(),
+                (*inst_list_it)->isSquashed());
         inst_list_it++;
         ++num;
     }
+
+
 }
 
 template <class Impl>
@@ -549,5 +1054,139 @@ FullO3CPU<Impl>::wakeDependents(DynInstPtr &inst)
     iew.wakeDependents(inst);
 }
 
+template <class Impl>
+void
+FullO3CPU<Impl>::wakeCPU()
+{
+    // Already has activity or a pending tick: no wake-up is needed.
+    const bool awake = activityCount || tickEvent.scheduled();
+    if (awake)
+        return;
+    // Account the descheduled span as idle, then tick on this cycle.
+    idleCycles += curTick - lastRunningCycle;
+    tickEvent.schedule(curTick);
+}
+
+template <class Impl>
+void
+FullO3CPU<Impl>::activityThisCycle()
+{
+    // Count each cycle at most once: only the first report in a cycle
+    // marks the current buffer slot and bumps the activity count.
+    if (!activityBuffer[0]) {
+        activityBuffer[0] = true;
+        activity = true;
+        ++activityCount;
+
+        DPRINTF(Activity, "Activity: %i\n", activityCount);
+    }
+}
+
+template <class Impl>
+void
+FullO3CPU<Impl>::advanceActivityBuffer()
+{
+    if (activityBuffer[-5]) {
+        --activityCount;
+        // The activity count must never go negative.
+        assert(activityCount >= 0);
+
+        DPRINTF(Activity, "Activity: %i\n", activityCount);
+        // At zero, remember this cycle and count one more idle period.
+        if (activityCount == 0) {
+            DPRINTF(FullCPU, "No activity left, going to idle!\n");
+            lastRunningCycle = curTick;
+            timesIdled++;
+        }
+    }
+    // NOTE(review): index -5 above is hard-coded; confirm it matches the buffer.
+    activityBuffer.advance();
+}
+
+template <class Impl>
+void
+FullO3CPU<Impl>::activateStage(const StageIdx idx)
+{
+    // A stage contributes to the activity count only on the
+    // inactive -> active transition.
+    if (stageActive[idx]) {
+        DPRINTF(Activity, "Stage %i already active.\n", idx);
+    } else {
+        stageActive[idx] = true;
+        ++activityCount;
+        DPRINTF(Activity, "Activity: %i\n", activityCount);
+    }
+
+    // @todo: Number is hardcoded for now.  Replace with parameter.
+    assert(activityCount < 15);
+}
+
+template <class Impl>
+void
+FullO3CPU<Impl>::deactivateStage(const StageIdx idx)
+{
+    // Only an active -> inactive transition lowers the activity count.
+    if (!stageActive[idx]) {
+        DPRINTF(Activity, "Stage %i already inactive.\n", idx);
+    } else {
+        stageActive[idx] = false;
+        --activityCount;
+        DPRINTF(Activity, "Activity: %i\n", activityCount);
+    }
+
+    // The count must never drop below zero.
+    assert(activityCount >= 0);
+}
+
+template <class Impl>
+int
+FullO3CPU<Impl>::getFreeTid()
+{
+    // Claim the lowest-numbered slot that is not yet in use.
+    for (int slot = 0; slot < numThreads; ++slot) {
+        if (tids[slot])
+            continue;
+        tids[slot] = true;
+        return slot;
+    }
+    return -1;  // every slot is taken
+}
+
+template <class Impl>
+void
+FullO3CPU<Impl>::doContextSwitch()
+{
+    if (contextSwitch) {
+        // @todo: Add code to deactivate the outgoing thread here.
+        // NOTE(review): the loop index, not the list entry, is passed as tid.
+        for (int tid=0; tid < cpuWaitList.size(); tid++) {
+            activateWhenReady(tid);
+        }
+
+        // Nothing is left waiting, so the switch is no longer pending.
+        if (cpuWaitList.empty())
+            contextSwitch = false;
+    }
+}
+
+template <class Impl>
+void
+FullO3CPU<Impl>::updateThreadPriority()
+{
+    // With fewer than two active threads there is nothing to rotate.
+    if (activeThreads.size() > 1)
+    {
+        // Default to a round-robin scheme: the thread at the front of
+        // the list (highest priority) is moved to the back.
+
+        // Copy the id first; pop_front() destroys the front element.
+        const unsigned high_thread = activeThreads.front();
+
+        activeThreads.pop_front();
+
+        activeThreads.push_back(high_thread);
+    }
+}
+
 // Forward declaration of FullO3CPU.
 template class FullO3CPU<AlphaSimpleImpl>;
diff --git a/cpu/o3/cpu.hh b/cpu/o3/cpu.hh
index 6577e46e4..91eaf9d6f 100644
--- a/cpu/o3/cpu.hh
+++ b/cpu/o3/cpu.hh
@@ -26,18 +26,13 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-//Todo: Add in a lot of the functions that are ISA specific.  Also define
-//the functions that currently exist within the base cpu class.  Define
-//everything for the simobject stuff so it can be serialized and
-//instantiated, add in debugging statements everywhere.  Have CPU schedule
-//itself properly.  Threads!
-// Avoid running stages and advancing queues if idle/stalled.
-
-#ifndef __CPU_O3_CPU_FULL_CPU_HH__
-#define __CPU_O3_CPU_FULL_CPU_HH__
+#ifndef __CPU_O3_FULL_CPU_HH__
+#define __CPU_O3_FULL_CPU_HH__
 
 #include <iostream>
 #include <list>
+#include <queue>
+#include <set>
 #include <vector>
 
 #include "base/statistics.hh"
@@ -47,10 +42,12 @@
 #include "cpu/cpu_exec_context.hh"
 #include "cpu/o3/comm.hh"
 #include "cpu/o3/cpu_policy.hh"
+#include "cpu/o3/scoreboard.hh"
+#include "cpu/o3/thread_state.hh"
 #include "sim/process.hh"
 
 class ExecContext;
-class FunctionalMemory;
+class MemInterface;
 class Process;
 
 class BaseFullCPU : public BaseCPU
@@ -59,11 +56,9 @@ class BaseFullCPU : public BaseCPU
   public:
     typedef BaseCPU::Params Params;
 
-#if FULL_SYSTEM
-    BaseFullCPU(Params &params);
-#else
-    BaseFullCPU(Params &params);
-#endif // FULL_SYSTEM
+    BaseFullCPU(Params *params);
+
+    void regStats();
 
   protected:
     int cpu_id;
@@ -78,31 +73,42 @@ class FullO3CPU : public BaseFullCPU
     typedef typename Impl::Params Params;
     typedef typename Impl::DynInstPtr DynInstPtr;
 
+    typedef O3ThreadState<Impl> Thread;
+
+    typedef typename std::list<DynInstPtr>::iterator ListIt;
+
   public:
     enum Status {
         Running,
         Idle,
         Halted,
-        Blocked // ?
+        Blocked
     };
 
+    /** Overall CPU status. */
     Status _status;
 
   private:
     class TickEvent : public Event
     {
       private:
+        /** Pointer to the CPU. */
         FullO3CPU<Impl> *cpu;
 
       public:
+        /** Constructs a tick event. */
         TickEvent(FullO3CPU<Impl> *c);
+
+        /** Processes a tick event, calling tick() on the CPU. */
         void process();
+        /** Returns the description of the tick event. */
         const char *description();
     };
 
+    /** The tick event used for scheduling CPU ticks. */
     TickEvent tickEvent;
 
-    /// Schedule tick event, regardless of its current state.
+    /** Schedule tick event, regardless of its current state. */
     void scheduleTickEvent(int delay)
     {
         if (tickEvent.squashed())
@@ -111,7 +117,7 @@ class FullO3CPU : public BaseFullCPU
             tickEvent.schedule(curTick + delay);
     }
 
-    /// Unschedule tick event, regardless of its current state.
+    /** Unschedule tick event, regardless of its current state. */
     void unscheduleTickEvent()
     {
         if (tickEvent.scheduled())
@@ -119,21 +125,82 @@ class FullO3CPU : public BaseFullCPU
     }
 
   public:
-    FullO3CPU(Params &params);
+    /** Constructs a CPU with the given parameters. */
+    FullO3CPU(Params *params);
+    /** Destructor. */
     ~FullO3CPU();
 
+    /** Registers statistics. */
     void fullCPURegStats();
 
+    /** Ticks CPU, calling tick() on each stage, and checking the overall
+     *  activity to see if the CPU should deschedule itself.
+     */
     void tick();
 
+    /** Initialize the CPU */
     void init();
 
-    void activateContext(int thread_num, int delay);
-    void suspendContext(int thread_num);
-    void deallocateContext(int thread_num);
-    void haltContext(int thread_num);
+    /** Setup CPU to insert a thread's context */
+    void insertThread(unsigned tid);
+
+    /** Remove all of a thread's context from CPU */
+    void removeThread(unsigned tid);
+
+    /** Count the Total Instructions Committed in the CPU. */
+    virtual Counter totalInstructions() const
+    {
+        Counter total(0);
+
+        for (int i=0; i < thread.size(); i++)
+            total += thread[i]->numInst;
+
+        return total;
+    }
+
+    /** Add Thread to Active Threads List. */
+    void activateContext(int tid, int delay);
+
+    /** Remove Thread from Active Threads List */
+    void suspendContext(int tid);
+
+    /** Remove Thread from Active Threads List &&
+     *  Remove Thread Context from CPU.
+     */
+    void deallocateContext(int tid);
+
+    /** Remove Thread from Active Threads List &&
+     *  Remove Thread Context from CPU.
+     */
+    void haltContext(int tid);
+
+    /** Activate a Thread When CPU Resources are Available. */
+    void activateWhenReady(int tid);
 
+    /** Add or Remove a Thread Context in the CPU. */
+    void doContextSwitch();
+
+    /** Update The Order In Which We Process Threads. */
+    void updateThreadPriority();
+
+    /** Executes a syscall on this cycle.
+     *  ---------------------------------------
+     *  Note: this is a virtual function. CPU-Specific
+     *  functionality defined in derived classes
+     */
+    virtual void syscall(int tid) {}
+
+    /** Check if there are any system calls pending. */
+    void checkSyscalls();
+
+    /** Switches out this CPU.
+     *  @todo: Implement this.
+     */
     void switchOut();
+
+    /** Takes over from another CPU.
+     *  @todo: Implement this.
+     */
     void takeOverFrom(BaseCPU *oldCPU);
 
     /** Get the current instruction sequence number, and increment it. */
@@ -147,21 +214,28 @@ class FullO3CPU : public BaseFullCPU
     bool validDataAddr(Addr addr) { return true; }
 
     /** Get instruction asid. */
-    int getInstAsid()
-    { return regFile.miscRegs.getInstAsid(); }
+    int getInstAsid(unsigned tid)
+    { return regFile.miscRegs[tid].getInstAsid(); }
 
     /** Get data asid. */
-    int getDataAsid()
-    { return regFile.miscRegs.getDataAsid(); }
+    int getDataAsid(unsigned tid)
+    { return regFile.miscRegs[tid].getDataAsid(); }
 #else
-    bool validInstAddr(Addr addr)
-    { return thread[0]->validInstAddr(addr); }
+    /** Check if this address is a valid instruction address. */
+    bool validInstAddr(Addr addr,unsigned tid)
+    { return thread[tid]->validInstAddr(addr); }
+
+    /** Check if this address is a valid data address. */
+    bool validDataAddr(Addr addr,unsigned tid)
+    { return thread[tid]->validDataAddr(addr); }
 
-    bool validDataAddr(Addr addr)
-    { return thread[0]->validDataAddr(addr); }
+    /** Get instruction asid. */
+    int getInstAsid(unsigned tid)
+    { return thread[tid]->asid; }
 
-    int getInstAsid() { return thread[0]->getInstAsid(); }
-    int getDataAsid() { return thread[0]->getDataAsid(); }
+    /** Get data asid. */
+    int getDataAsid(unsigned tid)
+    { return thread[tid]->asid; }
 
 #endif
 
@@ -184,29 +258,40 @@ class FullO3CPU : public BaseFullCPU
 
     void setFloatRegInt(int reg_idx, uint64_t val);
 
-    uint64_t readPC();
+    uint64_t readArchIntReg(int reg_idx, unsigned tid);
+
+    float readArchFloatRegSingle(int reg_idx, unsigned tid);
+
+    double readArchFloatRegDouble(int reg_idx, unsigned tid);
+
+    uint64_t readArchFloatRegInt(int reg_idx, unsigned tid);
+
+    void setArchIntReg(int reg_idx, uint64_t val, unsigned tid);
+
+    void setArchFloatRegSingle(int reg_idx, float val, unsigned tid);
+
+    void setArchFloatRegDouble(int reg_idx, double val, unsigned tid);
+
+    void setArchFloatRegInt(int reg_idx, uint64_t val, unsigned tid);
 
-    void setNextPC(uint64_t val);
+    uint64_t readPC(unsigned tid);
 
-    void setPC(Addr new_PC);
+    void setPC(Addr new_PC,unsigned tid);
+
+    uint64_t readNextPC(unsigned tid);
+
+    void setNextPC(uint64_t val,unsigned tid);
 
     /** Function to add instruction onto the head of the list of the
      *  instructions.  Used when new instructions are fetched.
      */
-    void addInst(DynInstPtr &inst);
+    ListIt addInst(DynInstPtr &inst);
 
     /** Function to tell the CPU that an instruction has completed. */
-    void instDone();
-
-    /** Remove all instructions in back of the given instruction, but leave
-     *  that instruction in the list.  This is useful in a squash, when there
-     *  are instructions in this list that don't exist in structures such as
-     *  the ROB.  The instruction doesn't have to be the last instruction in
-     *  the list, but will be once this function completes.
-     *  @todo: Remove only up until that inst?  Squashed inst is most likely
-     *  valid.
-     */
-    void removeBackInst(DynInstPtr &inst);
+    void instDone(unsigned tid);
+
+    /** Adds an instruction to the CPU's remove list. */
+    void addToRemoveList(DynInstPtr &inst);
 
     /** Remove an instruction from the front of the list.  It is expected
      *  that there are no instructions in front of it (that is, none are older
@@ -218,10 +303,14 @@ class FullO3CPU : public BaseFullCPU
     void removeFrontInst(DynInstPtr &inst);
 
     /** Remove all instructions that are not currently in the ROB. */
-    void removeInstsNotInROB();
+    void removeInstsNotInROB(unsigned tid);
 
     /** Remove all instructions younger than the given sequence number. */
-    void removeInstsUntil(const InstSeqNum &seq_num);
+    void removeInstsUntil(const InstSeqNum &seq_num,unsigned tid);
+
+    inline void squashInstIt(const ListIt &instIt, const unsigned &tid);
+
+    void cleanUpRemovedInsts();
 
     /** Remove all instructions from the list. */
     void removeAllInsts();
@@ -236,43 +325,38 @@ class FullO3CPU : public BaseFullCPU
 
   public:
     /** List of all the instructions in flight. */
-    list<DynInstPtr> instList;
+    std::list<DynInstPtr> instList;
+
+    /** List of all the instructions that will be removed at the end of this
+     *  cycle.
+     */
+    std::queue<ListIt> removeList;
+
+#ifdef DEBUG
+    std::set<InstSeqNum> snList;
+#endif
+
+    /** Records if instructions need to be removed this cycle due to being
+     *  retired or squashed.
+     */
+    bool removeInstsThisCycle;
 
-    //not sure these should be private.
   protected:
     /** The fetch stage. */
     typename CPUPolicy::Fetch fetch;
 
-    /** The fetch stage's status. */
-    typename CPUPolicy::Fetch::Status fetchStatus;
-
     /** The decode stage. */
     typename CPUPolicy::Decode decode;
 
-    /** The decode stage's status. */
-    typename CPUPolicy::Decode::Status decodeStatus;
-
     /** The dispatch stage. */
     typename CPUPolicy::Rename rename;
 
-    /** The dispatch stage's status. */
-    typename CPUPolicy::Rename::Status renameStatus;
-
     /** The issue/execute/writeback stages. */
     typename CPUPolicy::IEW iew;
 
-    /** The issue/execute/writeback stage's status. */
-    typename CPUPolicy::IEW::Status iewStatus;
-
     /** The commit stage. */
     typename CPUPolicy::Commit commit;
 
-    /** The fetch stage's status. */
-    typename CPUPolicy::Commit::Status commitStatus;
-
-    //Might want to just pass these objects in to the constructors of the
-    //appropriate stage.  regFile is in iew, freeList in dispatch, renameMap
-    //in dispatch, and the rob in commit.
     /** The register file. */
     typename CPUPolicy::RegFile regFile;
 
@@ -280,12 +364,33 @@ class FullO3CPU : public BaseFullCPU
     typename CPUPolicy::FreeList freeList;
 
     /** The rename map. */
-    typename CPUPolicy::RenameMap renameMap;
+    typename CPUPolicy::RenameMap renameMap[Impl::MaxThreads];
+
+    /** The commit rename map. */
+    typename CPUPolicy::RenameMap commitRenameMap[Impl::MaxThreads];
 
     /** The re-order buffer. */
     typename CPUPolicy::ROB rob;
 
+    /** Active Threads List */
+    std::list<unsigned> activeThreads;
+
+    /** Integer Register Scoreboard */
+    Scoreboard scoreboard;
+
   public:
+    /** Enum to give each stage a specific index, so when calling
+     *  activateStage() or deactivateStage(), they can specify which stage
+     *  is being activated/deactivated.
+     */
+    enum StageIdx {
+        FetchIdx,
+        DecodeIdx,
+        RenameIdx,
+        IEWIdx,
+        CommitIdx,
+        NumStages };
+
     /** Typedefs from the Impl to get the structs that each of the
      *  time buffers should use.
      */
@@ -314,46 +419,123 @@ class FullO3CPU : public BaseFullCPU
     /** The IEW stage's instruction queue. */
     TimeBuffer<IEWStruct> iewQueue;
 
+  private:
+    /** Time buffer that tracks whether any cycle has active communication
+     *  in it. It should be as long as the longest communication latency in
+     *  the system. Each time any time buffer is written, the activity buffer
+     *  should also be written to. The activityBuffer is advanced along with
+     *  all the other time buffers, so it should have a 1 somewhere in it if
+     *  and only if there is active communication in a time buffer.
+     */
+    TimeBuffer<bool> activityBuffer;
+
+    /** Tracks how many stages and cycles of time buffer have activity. Stages
+     *  increment this count when they switch to active, and decrement it when
+     *  they switch to inactive. Whenever a cycle that previously had no
+     *  information is written in the time buffer, this is incremented. When
+     *  a cycle that had information exits the time buffer due to age, this
+     *  count is decremented. When the count is 0, there is no activity in the
+     *  CPU, and it can be descheduled.
+     */
+    int activityCount;
+
+    /** Records if there has been activity this cycle. */
+    bool activity;
+
+    /** Records which stages are active/inactive. */
+    bool stageActive[NumStages];
+
   public:
-    /** The temporary exec context to support older accessors. */
-    CPUExecContext *cpuXC;
+    /** Wakes the CPU, rescheduling the CPU if it's not already active. */
+    void wakeCPU();
+    /** Records that there is activity this cycle. */
+    void activityThisCycle();
+    /** Advances the activity buffer, decrementing the activityCount if active
+     *  communication just left the time buffer, and recording the switch to
+     *  idle (last running cycle, idle count) once no activity is left.
+     */
+    void advanceActivityBuffer();
+    /** Marks a stage as active. */
+    void activateStage(const StageIdx idx);
+    /** Deactivates a stage. */
+    void deactivateStage(const StageIdx idx);
 
-    /** Temporary function to get pointer to exec context. */
-    ExecContext *xcBase()
-    {
-        return thread[0]->getProxy();
-    }
+    /** Gets a free thread id. Use if thread ids change across system. */
+    int getFreeTid();
 
-    CPUExecContext *cpuXCBase()
+  public:
+    /** Temporary function to get pointer to exec context. */
+    ExecContext *xcBase(unsigned tid)
     {
-        return thread[0];
+        return thread[tid]->getXCProxy();
     }
 
+    /** The global sequence number counter. */
     InstSeqNum globalSeqNum;
 
 #if FULL_SYSTEM
+    /** Pointer to the system. */
     System *system;
 
+    /** Pointer to the memory controller. */
     MemoryController *memCtrl;
+    /** Pointer to physical memory. */
     PhysicalMemory *physmem;
-
-    AlphaITB *itb;
-    AlphaDTB *dtb;
-
-//    SWContext *swCtx;
 #endif
-    std::vector<CPUExecContext *> thread;
 
+    /** Per-thread state objects (O3ThreadState), indexed by thread id. */
+    std::vector<Thread *> thread;
+
+    /** Pointer to memory. */
     FunctionalMemory *mem;
 
+#if 0
+    /** Page table pointer. */
+    PageTable *pTable;
+#endif
+
+    /** Pointer to the icache interface. */
     MemInterface *icacheInterface;
+    /** Pointer to the dcache interface. */
     MemInterface *dcacheInterface;
 
+    /** Whether or not the CPU should defer its registration. */
     bool deferRegistration;
 
-    Counter numInsts;
-
-    Counter funcExeInst;
+    /** Is there a context switch pending? */
+    bool contextSwitch;
+
+    /** Threads Scheduled to Enter CPU */
+    std::list<int> cpuWaitList;
+
+    /** The cycle that the CPU was last running, used for statistics. */
+    Tick lastRunningCycle;
+
+    /** Number of Threads CPU can process */
+    unsigned numThreads;
+
+    /** Mapping for system thread id to cpu id */
+    std::map<unsigned,unsigned> threadMap;
+
+    /** Thread id allocation flags; a zero entry marks a free id. */
+    std::vector<unsigned> tids;
+
+    /** Stat for total number of times the CPU is descheduled. */
+    Stats::Scalar<> timesIdled;
+    /** Stat for total number of cycles the CPU spends descheduled. */
+    Stats::Scalar<> idleCycles;
+    /** Stat for the number of committed instructions per thread. */
+    Stats::Vector<> committedInsts;
+    /** Stat for the total number of committed instructions. */
+    Stats::Scalar<> totalCommittedInsts;
+    /** Stat for the CPI per thread. */
+    Stats::Formula cpi;
+    /** Stat for the total CPI. */
+    Stats::Formula totalCpi;
+    /** Stat for the IPC per thread. */
+    Stats::Formula ipc;
+    /** Stat for the total IPC. */
+    Stats::Formula totalIpc;
 };
 
 #endif
diff --git a/cpu/o3/cpu_policy.hh b/cpu/o3/cpu_policy.hh
index 41f06f81b..52227013e 100644
--- a/cpu/o3/cpu_policy.hh
+++ b/cpu/o3/cpu_policy.hh
@@ -26,13 +26,14 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __CPU_O3_CPU_CPU_POLICY_HH__
-#define __CPU_O3_CPU_CPU_POLICY_HH__
+#ifndef __CPU_O3_CPU_POLICY_HH__
+#define __CPU_O3_CPU_POLICY_HH__
 
 #include "cpu/o3/bpred_unit.hh"
 #include "cpu/o3/free_list.hh"
 #include "cpu/o3/inst_queue.hh"
-#include "cpu/o3/ldstq.hh"
+#include "cpu/o3/lsq.hh"
+#include "cpu/o3/lsq_unit.hh"
 #include "cpu/o3/mem_dep_unit.hh"
 #include "cpu/o3/regfile.hh"
 #include "cpu/o3/rename_map.hh"
@@ -57,32 +58,34 @@ struct SimpleCPUPolicy
     typedef ROB<Impl> ROB;
     typedef InstructionQueue<Impl> IQ;
     typedef MemDepUnit<StoreSet, Impl> MemDepUnit;
-    typedef LDSTQ<Impl> LDSTQ;
+    typedef LSQ<Impl> LSQ;
+    typedef LSQUnit<Impl> LSQUnit;
 
-    typedef SimpleFetch<Impl> Fetch;
-    typedef SimpleDecode<Impl> Decode;
-    typedef SimpleRename<Impl> Rename;
-    typedef SimpleIEW<Impl> IEW;
-    typedef SimpleCommit<Impl> Commit;
+
+    typedef DefaultFetch<Impl> Fetch;
+    typedef DefaultDecode<Impl> Decode;
+    typedef DefaultRename<Impl> Rename;
+    typedef DefaultIEW<Impl> IEW;
+    typedef DefaultCommit<Impl> Commit;
 
     /** The struct for communication between fetch and decode. */
-    typedef SimpleFetchSimpleDecode<Impl> FetchStruct;
+    typedef DefaultFetchDefaultDecode<Impl> FetchStruct;
 
     /** The struct for communication between decode and rename. */
-    typedef SimpleDecodeSimpleRename<Impl> DecodeStruct;
+    typedef DefaultDecodeDefaultRename<Impl> DecodeStruct;
 
     /** The struct for communication between rename and IEW. */
-    typedef SimpleRenameSimpleIEW<Impl> RenameStruct;
+    typedef DefaultRenameDefaultIEW<Impl> RenameStruct;
 
     /** The struct for communication between IEW and commit. */
-    typedef SimpleIEWSimpleCommit<Impl> IEWStruct;
+    typedef DefaultIEWDefaultCommit<Impl> IEWStruct;
 
     /** The struct for communication within the IEW stage. */
     typedef IssueStruct<Impl> IssueStruct;
 
     /** The struct for all backwards communication. */
-    typedef TimeBufStruct TimeStruct;
+    typedef TimeBufStruct<Impl> TimeStruct;
 
 };
 
-#endif //__CPU_O3_CPU_CPU_POLICY_HH__
+#endif //__CPU_O3_CPU_POLICY_HH__
diff --git a/cpu/o3/decode.cc b/cpu/o3/decode.cc
index 290648318..b14fbb7a3 100644
--- a/cpu/o3/decode.cc
+++ b/cpu/o3/decode.cc
@@ -30,4 +30,4 @@
 #include "cpu/o3/alpha_impl.hh"
 #include "cpu/o3/decode_impl.hh"
 
-template class SimpleDecode<AlphaSimpleImpl>;
+template class DefaultDecode<AlphaSimpleImpl>;
diff --git a/cpu/o3/decode.hh b/cpu/o3/decode.hh
index 5b9a0f822..279ff556e 100644
--- a/cpu/o3/decode.hh
+++ b/cpu/o3/decode.hh
@@ -26,16 +26,23 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __CPU_O3_CPU_SIMPLE_DECODE_HH__
-#define __CPU_O3_CPU_SIMPLE_DECODE_HH__
+#ifndef __CPU_O3_DECODE_HH__
+#define __CPU_O3_DECODE_HH__
 
 #include <queue>
 
 #include "base/statistics.hh"
 #include "base/timebuf.hh"
 
+/**
+ * DefaultDecode class handles both single threaded and SMT decode. Its width is
+ * specified by the parameters; each cycle it tries to decode that many
+ * instructions. Because instructions are actually decoded when the StaticInst
+ * is created, this stage does not do much other than check any PC-relative
+ * branches.
+ */
 template<class Impl>
-class SimpleDecode
+class DefaultDecode
 {
   private:
     // Typedefs from the Impl.
@@ -50,49 +57,126 @@ class SimpleDecode
     typedef typename CPUPol::TimeStruct TimeStruct;
 
   public:
-    // The only time decode will become blocked is if dispatch becomes
-    // blocked, which means IQ or ROB is probably full.
-    enum Status {
+    /** Overall decode stage status. Used to determine if the CPU can
+     * deschedule itself due to a lack of activity.
+     */
+    enum DecodeStatus {
+        Active,
+        Inactive
+    };
+
+    /** Individual thread status. */
+    enum ThreadStatus {
         Running,
         Idle,
+        StartSquash,
         Squashing,
         Blocked,
         Unblocking
     };
 
   private:
-    // May eventually need statuses on a per thread basis.
-    Status _status;
+    /** Decode status. */
+    DecodeStatus _status;
+
+    /** Per-thread status. */
+    ThreadStatus decodeStatus[Impl::MaxThreads];
 
   public:
-    SimpleDecode(Params &params);
+    /** DefaultDecode constructor. */
+    DefaultDecode(Params *params);
 
+    /** Returns the name of decode. */
+    std::string name() const;
+
+    /** Registers statistics. */
     void regStats();
 
+    /** Sets CPU pointer. */
     void setCPU(FullCPU *cpu_ptr);
 
+    /** Sets the main backwards communication time buffer pointer. */
     void setTimeBuffer(TimeBuffer<TimeStruct> *tb_ptr);
 
+    /** Sets pointer to time buffer used to communicate to the next stage. */
     void setDecodeQueue(TimeBuffer<DecodeStruct> *dq_ptr);
 
+    /** Sets pointer to time buffer coming from fetch. */
     void setFetchQueue(TimeBuffer<FetchStruct> *fq_ptr);
 
+    /** Sets pointer to list of active threads. */
+    void setActiveThreads(std::list<unsigned> *at_ptr);
+
+    /** Ticks decode, processing all input signals and decoding as many
+     * instructions as possible.
+     */
     void tick();
 
-    void decode();
+    /** Determines what to do based on decode's current status.
+     * @param status_change decode() sets this variable if there was a status
+     * change (i.e. switching from blocking to unblocking).
+     * @param tid Thread id to decode instructions from.
+     */
+    void decode(bool &status_change, unsigned tid);
+
+    /** Processes instructions from fetch and passes them on to rename.
+     * Decoding of instructions actually happens when they are created in
+     * fetch, so this function mostly checks if PC-relative branches are
+     * correct.
+     */
+    void decodeInsts(unsigned tid);
 
   private:
+    /** Inserts a thread's instructions into the skid buffer, to be decoded
+     * once decode unblocks.
+     */
+    void skidInsert(unsigned tid);
+
+    /** Returns if all of the skid buffers are empty. */
+    bool skidsEmpty();
+
+    /** Updates overall decode status based on all of the threads' statuses. */
+    void updateStatus();
+
+    /** Separates instructions from fetch into individual lists of instructions
+     * sorted by thread.
+     */
+    void sortInsts();
+
+    /** Reads all stall signals from the backwards communication timebuffer. */
+    void readStallSignals(unsigned tid);
+
+    /** Checks all input signals and updates decode's status appropriately. */
+    bool checkSignalsAndUpdate(unsigned tid);
+
+    /** Checks all stall signals, and returns if any are true. */
+    bool checkStall(unsigned tid) const;
+
+    /** Returns if there are any instructions from fetch on this cycle. */
     inline bool fetchInstsValid();
 
-    void block();
+    /** Switches decode to blocking, and signals back that decode has
+     * become blocked.
+     * @return Returns true if there is a status change.
+     */
+    bool block(unsigned tid);
 
-    inline void unblock();
+    /** Switches decode to unblocking if the skid buffer is empty, and
+     * signals back that decode has unblocked.
+     * @return Returns true if there is a status change.
+     */
+    bool unblock(unsigned tid);
 
-    void squash(DynInstPtr &inst);
+    /** Squashes if there is a PC-relative branch that was predicted
+     * incorrectly. Sends squash information back to fetch.
+     */
+    void squash(DynInstPtr &inst, unsigned tid);
 
   public:
-    // Might want to make squash a friend function.
-    void squash();
+    /** Squashes due to commit signalling a squash. Changes status to
+     * squashing and clears block/unblock signals as needed.
+     */
+    unsigned squash(unsigned tid);
 
   private:
     // Interfaces to objects outside of decode.
@@ -127,10 +211,27 @@ class SimpleDecode
     /** Wire to get fetch's output from fetch queue. */
     typename TimeBuffer<FetchStruct>::wire fromFetch;
 
+    /** Queue of all instructions coming from fetch this cycle. */
+    std::queue<DynInstPtr> insts[Impl::MaxThreads];
+
     /** Skid buffer between fetch and decode. */
-    std::queue<FetchStruct> skidBuffer;
+    std::queue<DynInstPtr> skidBuffer[Impl::MaxThreads];
+
+    /** Variable that tracks if decode has written to the time buffer this
+     * cycle. Used to tell CPU if there is activity this cycle.
+     */
+    bool wroteToTimeBuffer;
+
+    /** Source of possible stalls. */
+    struct Stalls {
+        bool rename;
+        bool iew;
+        bool commit;
+    };
+
+    /** Tracks which stages are telling decode to stall. */
+    Stalls stalls[Impl::MaxThreads];
 
-    //Consider making these unsigned to avoid any confusion.
     /** Rename to decode delay, in ticks. */
     unsigned renameToDecodeDelay;
 
@@ -146,20 +247,41 @@ class SimpleDecode
     /** The width of decode, in instructions. */
     unsigned decodeWidth;
 
-    /** The instruction that decode is currently on.  It needs to have
-     *  persistent state so that when a stall occurs in the middle of a
-     *  group of instructions, it can restart at the proper instruction.
-     */
-    unsigned numInst;
+    /** Index of instructions being sent to rename. */
+    unsigned toRenameIndex;
+
+    /** Number of active threads. */
+    unsigned numThreads;
 
+    /** List of active thread ids */
+    std::list<unsigned> *activeThreads;
+
+    /** Number of branches in flight. */
+    unsigned branchCount[Impl::MaxThreads];
+
+    /** Maximum size of the skid buffer. */
+    unsigned skidBufferMax;
+
+    /** Stat for total number of idle cycles. */
     Stats::Scalar<> decodeIdleCycles;
+    /** Stat for total number of blocked cycles. */
     Stats::Scalar<> decodeBlockedCycles;
+    /** Stat for total number of normal running cycles. */
+    Stats::Scalar<> decodeRunCycles;
+    /** Stat for total number of unblocking cycles. */
     Stats::Scalar<> decodeUnblockCycles;
+    /** Stat for total number of squashing cycles. */
     Stats::Scalar<> decodeSquashCycles;
+    /** Stat for number of times a branch mispredict is detected. */
     Stats::Scalar<> decodeBranchMispred;
+    /** Stat for number of times decode detected a non-control instruction
+     * incorrectly predicted as a branch.
+     */
     Stats::Scalar<> decodeControlMispred;
+    /** Stat for total number of decoded instructions. */
     Stats::Scalar<> decodeDecodedInsts;
+    /** Stat for total number of squashed instructions. */
     Stats::Scalar<> decodeSquashedInsts;
 };
 
-#endif // __CPU_O3_CPU_SIMPLE_DECODE_HH__
+#endif // __CPU_O3_DECODE_HH__
diff --git a/cpu/o3/decode_impl.hh b/cpu/o3/decode_impl.hh
index 463f0ddac..f1aea27b4 100644
--- a/cpu/o3/decode_impl.hh
+++ b/cpu/o3/decode_impl.hh
@@ -28,22 +28,42 @@
 
 #include "cpu/o3/decode.hh"
 
+using namespace std;
+
 template<class Impl>
-SimpleDecode<Impl>::SimpleDecode(Params &params)
-    : renameToDecodeDelay(params.renameToDecodeDelay),
-      iewToDecodeDelay(params.iewToDecodeDelay),
-      commitToDecodeDelay(params.commitToDecodeDelay),
-      fetchToDecodeDelay(params.fetchToDecodeDelay),
-      decodeWidth(params.decodeWidth),
-      numInst(0)
+DefaultDecode<Impl>::DefaultDecode(Params *params)
+    : renameToDecodeDelay(params->renameToDecodeDelay),
+      iewToDecodeDelay(params->iewToDecodeDelay),
+      commitToDecodeDelay(params->commitToDecodeDelay),
+      fetchToDecodeDelay(params->fetchToDecodeDelay),
+      decodeWidth(params->decodeWidth),
+      numThreads(params->numberOfThreads)
+{
+    DPRINTF(Decode, "decodeWidth=%i.\n", decodeWidth);
+    _status = Inactive;
+
+    for (int i = 0; i < numThreads; ++i) {
+        decodeStatus[i] = Idle;
+
+        stalls[i].rename = false;
+        stalls[i].iew = false;
+        stalls[i].commit = false;
+    }
+
+    // @todo: Make into a parameter
+    skidBufferMax = (fetchToDecodeDelay * params->fetchWidth) + decodeWidth;
+}
+
+template <class Impl>
+std::string
+DefaultDecode<Impl>::name() const
 {
-    DPRINTF(Decode, "Decode: decodeWidth=%i.\n", decodeWidth);
-    _status = Idle;
+    return cpu->name() + ".decode";
 }
 
 template <class Impl>
 void
-SimpleDecode<Impl>::regStats()
+DefaultDecode<Impl>::regStats()
 {
     decodeIdleCycles
         .name(name() + ".decodeIdleCycles")
@@ -53,6 +73,10 @@ SimpleDecode<Impl>::regStats()
         .name(name() + ".decodeBlockedCycles")
         .desc("Number of cycles decode is blocked")
         .prereq(decodeBlockedCycles);
+    decodeRunCycles
+        .name(name() + ".decodeRunCycles")
+        .desc("Number of cycles decode is running")
+        .prereq(decodeRunCycles);
     decodeUnblockCycles
         .name(name() + ".decodeUnblockCycles")
         .desc("Number of cycles decode is unblocking")
@@ -82,17 +106,17 @@ SimpleDecode<Impl>::regStats()
 
 template<class Impl>
 void
-SimpleDecode<Impl>::setCPU(FullCPU *cpu_ptr)
+DefaultDecode<Impl>::setCPU(FullCPU *cpu_ptr)
 {
-    DPRINTF(Decode, "Decode: Setting CPU pointer.\n");
+    DPRINTF(Decode, "Setting CPU pointer.\n");
     cpu = cpu_ptr;
 }
 
 template<class Impl>
 void
-SimpleDecode<Impl>::setTimeBuffer(TimeBuffer<TimeStruct> *tb_ptr)
+DefaultDecode<Impl>::setTimeBuffer(TimeBuffer<TimeStruct> *tb_ptr)
 {
-    DPRINTF(Decode, "Decode: Setting time buffer pointer.\n");
+    DPRINTF(Decode, "Setting time buffer pointer.\n");
     timeBuffer = tb_ptr;
 
     // Setup wire to write information back to fetch.
@@ -106,9 +130,9 @@ SimpleDecode<Impl>::setTimeBuffer(TimeBuffer<TimeStruct> *tb_ptr)
 
 template<class Impl>
 void
-SimpleDecode<Impl>::setDecodeQueue(TimeBuffer<DecodeStruct> *dq_ptr)
+DefaultDecode<Impl>::setDecodeQueue(TimeBuffer<DecodeStruct> *dq_ptr)
 {
-    DPRINTF(Decode, "Decode: Setting decode queue pointer.\n");
+    DPRINTF(Decode, "Setting decode queue pointer.\n");
     decodeQueue = dq_ptr;
 
     // Setup wire to write information to proper place in decode queue.
@@ -117,260 +141,515 @@ SimpleDecode<Impl>::setDecodeQueue(TimeBuffer<DecodeStruct> *dq_ptr)
 
 template<class Impl>
 void
-SimpleDecode<Impl>::setFetchQueue(TimeBuffer<FetchStruct> *fq_ptr)
+DefaultDecode<Impl>::setFetchQueue(TimeBuffer<FetchStruct> *fq_ptr)
 {
-    DPRINTF(Decode, "Decode: Setting fetch queue pointer.\n");
+    DPRINTF(Decode, "Setting fetch queue pointer.\n");
     fetchQueue = fq_ptr;
 
     // Setup wire to read information from fetch queue.
     fromFetch = fetchQueue->getWire(-fetchToDecodeDelay);
 }
 
+template<class Impl>
+void
+DefaultDecode<Impl>::setActiveThreads(list<unsigned> *at_ptr)
+{
+    DPRINTF(Decode, "Setting active threads list pointer.\n");
+    activeThreads = at_ptr;  // Stores the pointer; no copy is made.
+}
+
+template<class Impl>
+bool
+DefaultDecode<Impl>::checkStall(unsigned tid) const
+{
+    bool ret_val = false;
+    // Only the first asserted source (rename, IEW, commit) is traced.
+    if (stalls[tid].rename) {
+        DPRINTF(Decode,"[tid:%i]: Stall from Rename stage detected.\n", tid);
+        ret_val = true;
+    } else if (stalls[tid].iew) {
+        DPRINTF(Decode,"[tid:%i]: Stall from IEW stage detected.\n", tid);
+        ret_val = true;
+    } else if (stalls[tid].commit) {
+        DPRINTF(Decode,"[tid:%i]: Stall from Commit stage detected.\n", tid);
+        ret_val = true;
+    }
+
+    return ret_val;
+}
+
 template<class Impl>
 inline bool
-SimpleDecode<Impl>::fetchInstsValid()
+DefaultDecode<Impl>::fetchInstsValid()
 {
     return fromFetch->size > 0;
 }
 
 template<class Impl>
-void
-SimpleDecode<Impl>::block()
+bool
+DefaultDecode<Impl>::block(unsigned tid)
 {
-    DPRINTF(Decode, "Decode: Blocking.\n");
-
-    // Set the status to Blocked.
-    _status = Blocked;
+    DPRINTF(Decode, "[tid:%u]: Blocking.\n", tid);
+
+    // If the decode status is blocked or unblocking then decode has not yet
+    // signalled fetch to unblock. In that case, there is no need to tell
+    // fetch to block.
+    if (decodeStatus[tid] != Blocked &&
+        decodeStatus[tid] != Unblocking) {
+        toFetch->decodeBlock[tid] = true;
+        wroteToTimeBuffer = true;
+    }
 
     // Add the current inputs to the skid buffer so they can be
     // reprocessed when this stage unblocks.
-    skidBuffer.push(*fromFetch);
+    skidInsert(tid);
+
+    if (decodeStatus[tid] != Blocked) {
+        // Set the status to Blocked.
+        decodeStatus[tid] = Blocked;
+        return true;
+    }
 
-    // Note that this stage only signals previous stages to stall when
-    // it is the cause of the stall originates at this stage.  Otherwise
-    // the previous stages are expected to check all possible stall signals.
+    return false;
 }
 
 template<class Impl>
-inline void
-SimpleDecode<Impl>::unblock()
+bool
+DefaultDecode<Impl>::unblock(unsigned tid)
 {
-    DPRINTF(Decode, "Decode: Unblocking, going to remove "
-            "instructions from skid buffer.\n");
-    // Remove the now processed instructions from the skid buffer.
-    skidBuffer.pop();
-
-    // If there's still information in the skid buffer, then
-    // continue to tell previous stages to stall.  They will be
-    // able to restart once the skid buffer is empty.
-    if (!skidBuffer.empty()) {
-        toFetch->decodeInfo.stall = true;
-    } else {
-        DPRINTF(Decode, "Decode: Finished unblocking.\n");
-        _status = Running;
+    DPRINTF(Decode, "[tid:%u]: Trying to unblock.\n", tid);
+
+    // Decode is done unblocking only if the skid buffer is empty.
+    if (skidBuffer[tid].empty()) {
+        DPRINTF(Decode, "[tid:%u]: Done unblocking.\n", tid);
+        toFetch->decodeUnblock[tid] = true;
+        wroteToTimeBuffer = true;
+
+        decodeStatus[tid] = Running;
+        return true;
     }
+
+    return false;
 }
 
-// This squash is specifically for when Decode detects a PC-relative branch
-// was predicted incorrectly.
 template<class Impl>
 void
-SimpleDecode<Impl>::squash(DynInstPtr &inst)
+DefaultDecode<Impl>::squash(DynInstPtr &inst, unsigned tid)
 {
-    DPRINTF(Decode, "Decode: Squashing due to incorrect branch prediction "
-                    "detected at decode.\n");
-    Addr new_PC = inst->readNextPC();
-
-    toFetch->decodeInfo.branchMispredict = true;
-    toFetch->decodeInfo.doneSeqNum = inst->seqNum;
-    toFetch->decodeInfo.predIncorrect = true;
-    toFetch->decodeInfo.squash = true;
-    toFetch->decodeInfo.nextPC = new_PC;
-    toFetch->decodeInfo.branchTaken = true;
+    DPRINTF(Decode, "[tid:%i]: Squashing due to incorrect branch prediction "
+            "detected at decode.\n", tid);
+
+    toFetch->decodeInfo[tid].branchMispredict = true;
+    toFetch->decodeInfo[tid].doneSeqNum = inst->seqNum;
+    toFetch->decodeInfo[tid].predIncorrect = true;
+    toFetch->decodeInfo[tid].squash = true;
+    toFetch->decodeInfo[tid].nextPC = inst->readNextPC();
+    toFetch->decodeInfo[tid].branchTaken = true;
+
+    if (decodeStatus[tid] == Blocked ||
+        decodeStatus[tid] == Unblocking) {
+        toFetch->decodeUnblock[tid] = 1;
+    }
 
     // Set status to squashing.
-    _status = Squashing;
+    decodeStatus[tid] = Squashing;
+
+    for (int i=0; i<fromFetch->size; i++) {
+        if (fromFetch->insts[i]->threadNumber == tid &&
+            fromFetch->insts[i]->seqNum > inst->seqNum) {
+            fromFetch->insts[i]->squashed = true;
+        }
+    }
+
+    while (!insts[tid].empty()) {
+        insts[tid].pop();
+    }
 
     // Clear the skid buffer in case it has any data in it.
-    while (!skidBuffer.empty()) {
-        skidBuffer.pop();
+    while (!skidBuffer[tid].empty()) {
+        skidBuffer[tid].pop();
     }
 
     // Squash instructions up until this one
-    // Slightly unrealistic!
-    cpu->removeInstsUntil(inst->seqNum);
+    cpu->removeInstsUntil(inst->seqNum, tid);
 }
 
 template<class Impl>
-void
-SimpleDecode<Impl>::squash()
+unsigned
+DefaultDecode<Impl>::squash(unsigned tid)
 {
-    DPRINTF(Decode, "Decode: Squashing.\n");
+    DPRINTF(Decode, "[tid:%i]: Squashing.\n",tid);
+
+    if (decodeStatus[tid] == Blocked ||
+        decodeStatus[tid] == Unblocking) {
+#if !FULL_SYSTEM
+        // In syscall emulation, we can have both a block and a squash due
+        // to a syscall in the same cycle.  This would cause both signals to
+        // be high.  This shouldn't happen in full system.
+        if (toFetch->decodeBlock[tid]) {
+            toFetch->decodeBlock[tid] = 0;
+        } else {
+            toFetch->decodeUnblock[tid] = 1;
+        }
+#else
+        toFetch->decodeUnblock[tid] = 1;
+#endif
+    }
+
     // Set status to squashing.
-    _status = Squashing;
+    decodeStatus[tid] = Squashing;
 
-    // Maybe advance the time buffer?  Not sure what to do in the normal
-    // case.
+    // Go through incoming instructions from fetch and squash them.
+    unsigned squash_count = 0;
+
+    for (int i=0; i<fromFetch->size; i++) {
+        if (fromFetch->insts[i]->threadNumber == tid) {
+            fromFetch->insts[i]->squashed = true;
+            squash_count++;
+        }
+    }
+
+    while (!insts[tid].empty()) {
+        insts[tid].pop();
+    }
 
     // Clear the skid buffer in case it has any data in it.
-    while (!skidBuffer.empty())
-    {
-        skidBuffer.pop();
+    while (!skidBuffer[tid].empty()) {
+        skidBuffer[tid].pop();
     }
+
+    return squash_count;
 }
 
 template<class Impl>
 void
-SimpleDecode<Impl>::tick()
+DefaultDecode<Impl>::skidInsert(unsigned tid)
 {
-    // Decode should try to execute as many instructions as its bandwidth
-    // will allow, as long as it is not currently blocked.
-    if (_status != Blocked && _status != Squashing) {
-        DPRINTF(Decode, "Decode: Not blocked, so attempting to run "
-                        "stage.\n");
-        // Make sure that the skid buffer has something in it if the
-        // status is unblocking.
-        assert(_status == Unblocking ? !skidBuffer.empty() : 1);
+    DynInstPtr inst = NULL;
 
-        decode();
+    while (!insts[tid].empty()) {
+        inst = insts[tid].front();
 
-        // If the status was unblocking, then instructions from the skid
-        // buffer were used.  Remove those instructions and handle
-        // the rest of unblocking.
-        if (_status == Unblocking) {
-            ++decodeUnblockCycles;
+        insts[tid].pop();
 
-            if (fetchInstsValid()) {
-                // Add the current inputs to the skid buffer so they can be
-                // reprocessed when this stage unblocks.
-                skidBuffer.push(*fromFetch);
-            }
+        assert(tid == inst->threadNumber);
 
-            unblock();
-        }
-    } else if (_status == Blocked) {
-        ++decodeBlockedCycles;
+        DPRINTF(Decode,"Inserting [sn:%lli] PC:%#x into decode skidBuffer %i\n",
+                inst->seqNum, inst->readPC(), inst->threadNumber);
 
-        if (fetchInstsValid()) {
-            block();
-        }
+        skidBuffer[tid].push(inst);
+    }
 
-        if (!fromRename->renameInfo.stall &&
-            !fromIEW->iewInfo.stall &&
-            !fromCommit->commitInfo.stall) {
-            DPRINTF(Decode, "Decode: Stall signals cleared, going to "
-                    "unblock.\n");
-            _status = Unblocking;
+    // Eventually need to enforce this by not letting a thread
+    // fetch past its skidbuffer
+    assert(skidBuffer[tid].size() <= skidBufferMax);
+}
 
-            // Continue to tell previous stage to block until this
-            // stage is done unblocking.
-            toFetch->decodeInfo.stall = true;
-        } else {
-            DPRINTF(Decode, "Decode: Still blocked.\n");
-            toFetch->decodeInfo.stall = true;
+template<class Impl>
+bool
+DefaultDecode<Impl>::skidsEmpty()
+{
+    list<unsigned>::iterator threads = (*activeThreads).begin();
+    // Any active thread with a non-empty skid buffer makes the answer false.
+    while (threads != (*activeThreads).end()) {
+        if (!skidBuffer[*threads++].empty())
+            return false;
+    }
+    // No active thread has anything buffered.
+    return true;
+}
+
+template<class Impl>
+void
+DefaultDecode<Impl>::updateStatus()
+{
+    bool any_unblocking = false;
+
+    list<unsigned>::iterator threads = (*activeThreads).begin();
+
+    threads = (*activeThreads).begin();
+
+    while (threads != (*activeThreads).end()) {
+        unsigned tid = *threads++;
+
+        if (decodeStatus[tid] == Unblocking) {
+            any_unblocking = true;
+            break;
         }
+    }
 
-        if (fromCommit->commitInfo.squash ||
-            fromCommit->commitInfo.robSquashing) {
-            squash();
+    // Decode will have activity if it's unblocking.
+    if (any_unblocking) {
+        if (_status == Inactive) {
+            _status = Active;
+
+            DPRINTF(Activity, "Activating stage.\n");
+
+            cpu->activateStage(FullCPU::DecodeIdx);
         }
-    } else if (_status == Squashing) {
-        if (!fromCommit->commitInfo.squash &&
-            !fromCommit->commitInfo.robSquashing) {
-            _status = Running;
-        } else if (fromCommit->commitInfo.squash) {
-            ++decodeSquashCycles;
-
-            squash();
+    } else {
+        // If it's not unblocking, then decode will not have any internal
+        // activity.  Switch it to inactive.
+        if (_status == Active) {
+            _status = Inactive;
+            DPRINTF(Activity, "Deactivating stage.\n");
+
+            cpu->deactivateStage(FullCPU::DecodeIdx);
         }
     }
 }
 
+template <class Impl>
+void
+DefaultDecode<Impl>::sortInsts()
+{
+    int insts_from_fetch = fromFetch->size;
+    // Every per-thread input queue must already be empty at this point.
+    for (int i=0; i < numThreads; i++)
+        assert(insts[i].empty());
+    // Append each instruction from fetch to the queue of its own thread.
+    for (int i = 0; i < insts_from_fetch; ++i) {
+        insts[fromFetch->insts[i]->threadNumber].push(fromFetch->insts[i]);
+    }
+}
+
 template<class Impl>
 void
-SimpleDecode<Impl>::decode()
+DefaultDecode<Impl>::readStallSignals(unsigned tid)
 {
-    // Check time buffer if being told to squash.
-    if (fromCommit->commitInfo.squash) {
-        squash();
-        return;
+    if (fromRename->renameBlock[tid]) {
+        stalls[tid].rename = true;
     }
 
-    // Check time buffer if being told to stall.
-    if (fromRename->renameInfo.stall ||
-        fromIEW->iewInfo.stall ||
-        fromCommit->commitInfo.stall) {
-        block();
-        return;
+    if (fromRename->renameUnblock[tid]) {
+        assert(stalls[tid].rename);
+        stalls[tid].rename = false;
     }
 
-    // Check fetch queue to see if instructions are available.
-    // If no available instructions, do nothing, unless this stage is
-    // currently unblocking.
-    if (!fetchInstsValid() && _status != Unblocking) {
-        DPRINTF(Decode, "Decode: Nothing to do, breaking out early.\n");
+    if (fromIEW->iewBlock[tid]) {
+        stalls[tid].iew = true;
+    }
+
+    if (fromIEW->iewUnblock[tid]) {
+        assert(stalls[tid].iew);
+        stalls[tid].iew = false;
+    }
+
+    if (fromCommit->commitBlock[tid]) {
+        stalls[tid].commit = true;
+    }
+
+    if (fromCommit->commitUnblock[tid]) {
+        assert(stalls[tid].commit);
+        stalls[tid].commit = false;
+    }
+}
+
+template <class Impl>
+bool
+DefaultDecode<Impl>::checkSignalsAndUpdate(unsigned tid)
+{
+    // Checks, in order, for thread tid (result feeds tick()'s status_change):
+    //   1. commit squash       -> squash(tid), return true
+    //   2. ROB still squashing -> status = Squashing, return true
+    //   3. any stall signal    -> return block(tid)
+    //   4. status Blocked      -> Unblocking, unblock(tid), return true
+    //   5. status Squashing    -> Running, return false
+    //   otherwise              -> status unchanged, return false
+
+    // Update the per thread stall statuses.
+    readStallSignals(tid);
+
+    // Check squash signals from commit.
+    if (fromCommit->commitInfo[tid].squash) {
+
+        DPRINTF(Decode, "[tid:%u]: Squashing instructions due to squash "
+                "from commit.\n", tid);
+
+        squash(tid);
+
+        return true;
+    }
+
+    // Check ROB squash signals from commit.
+    if (fromCommit->commitInfo[tid].robSquashing) {
+        DPRINTF(Decode, "[tid:%u]: ROB is still squashing.\n",tid);
+
+        // Continue to squash.
+        decodeStatus[tid] = Squashing;
+
+        return true;
+    }
+
+    if (checkStall(tid)) {
+        return block(tid);
+    }
+
+    if (decodeStatus[tid] == Blocked) {
+        DPRINTF(Decode, "[tid:%u]: Done blocking, switching to unblocking.\n",
+                tid);
+
+        decodeStatus[tid] = Unblocking;
+
+        unblock(tid);
+
+        return true;
+    }
+
+    if (decodeStatus[tid] == Squashing) {
+        // Switch status to running if decode isn't being told to block or
+        // squash this cycle.
+        DPRINTF(Decode, "[tid:%u]: Done squashing, switching to running.\n",
+                tid);
+
+        decodeStatus[tid] = Running;
+
+        return false;
+    }
+
+    // If we've reached this point, we have not gotten any signals that
+    // cause decode to change its status.  Decode remains the same as before.
+    return false;
+}
+
+template<class Impl>
+void
+DefaultDecode<Impl>::tick()
+{
+    wroteToTimeBuffer = false;  // Set by block()/unblock() during this cycle.
+
+    bool status_change = false;
+
+    toRenameIndex = 0;  // Index into toRename->insts; restarts every cycle.
+
+    list<unsigned>::iterator threads = (*activeThreads).begin();
+    // Split this cycle's instructions from fetch into the per-thread queues.
+    sortInsts();
+
+    // For each active thread: check stall/squash signals, then decode.
+    while (threads != (*activeThreads).end()) {
+    unsigned tid = *threads++;
+
+        DPRINTF(Decode,"Processing [tid:%i]\n",tid);
+        status_change =  checkSignalsAndUpdate(tid) || status_change;
+
+        decode(status_change, tid);
+    }
+    // Refresh the stage-wide status only when some thread reported a change.
+    if (status_change) {
+        updateStatus();
+    }
+    // Report activity to the CPU if a time buffer was written this cycle.
+    if (wroteToTimeBuffer) {
+        DPRINTF(Activity, "Activity this cycle.\n");
+
+        cpu->activityThisCycle();
+    }
+}
+
+template<class Impl>
+void
+DefaultDecode<Impl>::decode(bool &status_change, unsigned tid)
+{
+    // Per-thread dispatch on decodeStatus[tid]:
+    //   Running or Idle -> decodeInsts(tid)
+    //   Unblocking      -> decodeInsts(tid), buffer this cycle's input from
+    //                      fetch into the skid buffer, then try unblock(tid)
+    //   Blocked or Squashing -> only the matching cycle counter is bumped
+    // status_change is OR-ed with unblock()'s result; it is never cleared.
+    // Account blocked/squashing cycles; nothing is decoded in those states.
+    if (decodeStatus[tid] == Blocked) {
+        ++decodeBlockedCycles;
+    } else if (decodeStatus[tid] == Squashing) {
+        ++decodeSquashCycles;
+    }
+
+    // Decode should try to decode as many instructions as its bandwidth
+    // will allow, as long as it is not currently blocked.
+    if (decodeStatus[tid] == Running ||
+        decodeStatus[tid] == Idle) {
+        DPRINTF(Decode, "[tid:%u] Not blocked, so attempting to run "
+                "stage.\n",tid);
+
+        decodeInsts(tid);
+    } else if (decodeStatus[tid] == Unblocking) {
+        // The skid buffers of the active threads must not all be empty while
+        // unblocking; skidsEmpty() checks every active thread, not only tid.
+        assert(!skidsEmpty());
+
+        // While unblocking, decodeInsts() takes its input from this thread's
+        // skid buffer rather than from the instructions arriving from fetch
+        // this cycle.
+        decodeInsts(tid);
+
+        if (fetchInstsValid()) {
+            // Add the current inputs to the skid buffer so they can be
+            // reprocessed when this stage unblocks.
+            skidInsert(tid);
+        }
+        // unblock() only succeeds once this thread's skid buffer is empty.
+        status_change = unblock(tid) || status_change;
+    }
+}
+
+template <class Impl>
+void
+DefaultDecode<Impl>::decodeInsts(unsigned tid)
+{
+    // Instructions can come either from the skid buffer or the list of
+    // instructions coming from fetch, depending on decode's status.
+    int insts_available = decodeStatus[tid] == Unblocking ?
+        skidBuffer[tid].size() : insts[tid].size();
+
+    if (insts_available == 0) {
+        DPRINTF(Decode, "[tid:%u] Nothing to do, breaking out"
+                " early.\n",tid);
         // Should I change the status to idle?
         ++decodeIdleCycles;
         return;
+    } else if (decodeStatus[tid] == Unblocking) {
+        DPRINTF(Decode, "[tid:%u] Unblocking, removing insts from skid "
+                "buffer.\n",tid);
+        ++decodeUnblockCycles;
+    } else if (decodeStatus[tid] == Running) {
+        ++decodeRunCycles;
     }
 
-    // Might be better to use a base DynInst * instead?
     DynInstPtr inst;
 
-    unsigned to_rename_index = 0;
+    std::queue<DynInstPtr>
+        &insts_to_decode = decodeStatus[tid] == Unblocking ?
+        skidBuffer[tid] : insts[tid];
 
-    int insts_available = _status == Unblocking ?
-        skidBuffer.front().size - numInst :
-        fromFetch->size;
+    DPRINTF(Decode, "[tid:%u]: Sending instruction to rename.\n",tid);
 
-    // Debug block...
-#if 0
-    if (insts_available) {
-        DPRINTF(Decode, "Decode: Instructions available.\n");
-    } else {
-        if (_status == Unblocking && skidBuffer.empty()) {
-            DPRINTF(Decode, "Decode: No instructions available, skid buffer "
-                    "empty.\n");
-        } else if (_status != Unblocking &&
-                   !fromFetch->insts[0]) {
-            DPRINTF(Decode, "Decode: No instructions available, fetch queue "
-                    "empty.\n");
-        } else {
-            panic("Decode: No instructions available, unexpected condition!"
-                  "\n");
-        }
-    }
-#endif
+    while (insts_available > 0 && toRenameIndex < decodeWidth) {
+        assert(!insts_to_decode.empty());
 
-    while (insts_available > 0)
-    {
-        DPRINTF(Decode, "Decode: Sending instruction to rename.\n");
+        inst = insts_to_decode.front();
 
-        inst = _status == Unblocking ? skidBuffer.front().insts[numInst] :
-               fromFetch->insts[numInst];
+        insts_to_decode.pop();
 
-        DPRINTF(Decode, "Decode: Processing instruction %i with PC %#x\n",
-                inst->seqNum, inst->readPC());
+        DPRINTF(Decode, "[tid:%u]: Processing instruction [sn:%lli] with "
+                "PC %#x\n",
+                tid, inst->seqNum, inst->readPC());
 
         if (inst->isSquashed()) {
-            DPRINTF(Decode, "Decode: Instruction %i with PC %#x is "
+            DPRINTF(Decode, "[tid:%u]: Instruction %i with PC %#x is "
                     "squashed, skipping.\n",
-                    inst->seqNum, inst->readPC());
+                    tid, inst->seqNum, inst->readPC());
 
             ++decodeSquashedInsts;
 
-            ++numInst;
             --insts_available;
 
             continue;
         }
 
-
         // Also check if instructions have no source registers.  Mark
         // them as ready to issue at any time.  Not sure if this check
         // should exist here or at a later stage; however it doesn't matter
         // too much for function correctness.
-        // Isn't this handled by the inst queue?
         if (inst->numSrcRegs() == 0) {
             inst->setCanIssue();
         }
@@ -378,9 +657,12 @@ SimpleDecode<Impl>::decode()
         // This current instruction is valid, so add it into the decode
         // queue.  The next instruction may not be valid, so check to
         // see if branches were predicted correctly.
-        toRename->insts[to_rename_index] = inst;
+        toRename->insts[toRenameIndex] = inst;
 
         ++(toRename->size);
+        ++toRenameIndex;
+        ++decodeDecodedInsts;
+        --insts_available;
 
         // Ensure that if it was predicted as a branch, it really is a
         // branch.
@@ -388,38 +670,39 @@ SimpleDecode<Impl>::decode()
             panic("Instruction predicted as a branch!");
 
             ++decodeControlMispred;
+
             // Might want to set some sort of boolean and just do
             // a check at the end
-            squash(inst);
+            squash(inst, inst->threadNumber);
+
             break;
         }
 
         // Go ahead and compute any PC-relative branches.
-
         if (inst->isDirectCtrl() && inst->isUncondCtrl()) {
-
             inst->setNextPC(inst->branchTarget());
 
             if (inst->mispredicted()) {
                 ++decodeBranchMispred;
+
                 // Might want to set some sort of boolean and just do
                 // a check at the end
-                squash(inst);
+                squash(inst, inst->threadNumber);
+
                 break;
             }
         }
+    }
 
-        // Normally can check if a direct branch has the right target
-        // addr (either the immediate, or the branch PC + 4) and redirect
-        // fetch if it's incorrect.
-
-        // Increment which instruction we're looking at.
-        ++numInst;
-        ++to_rename_index;
-        ++decodeDecodedInsts;
-
-        --insts_available;
+    // If we didn't process all instructions, then we will need to block
+    // and put all those instructions into the skid buffer.
+    if (!insts_to_decode.empty()) {
+        block(tid);
     }
 
-     numInst = 0;
+    // Record that decode has written to the time buffer for activity
+    // tracking.
+    if (toRenameIndex) {
+        wroteToTimeBuffer = true;
+    }
 }
diff --git a/cpu/o3/fetch.cc b/cpu/o3/fetch.cc
index 8ad5e6565..7959416be 100644
--- a/cpu/o3/fetch.cc
+++ b/cpu/o3/fetch.cc
@@ -30,4 +30,4 @@
 #include "cpu/o3/alpha_impl.hh"
 #include "cpu/o3/fetch_impl.hh"
 
-template class SimpleFetch<AlphaSimpleImpl>;
+template class DefaultFetch<AlphaSimpleImpl>;
diff --git a/cpu/o3/fetch.hh b/cpu/o3/fetch.hh
index cc64800d9..f0f3f2745 100644
--- a/cpu/o3/fetch.hh
+++ b/cpu/o3/fetch.hh
@@ -26,11 +26,8 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-// Todo: SMT fetch,
-// Add a way to get a stage's current status.
-
-#ifndef __CPU_O3_CPU_SIMPLE_FETCH_HH__
-#define __CPU_O3_CPU_SIMPLE_FETCH_HH__
+#ifndef __CPU_O3_FETCH_HH__
+#define __CPU_O3_FETCH_HH__
 
 #include "base/statistics.hh"
 #include "base/timebuf.hh"
@@ -39,13 +36,15 @@
 #include "sim/eventq.hh"
 
 /**
- * SimpleFetch class to fetch a single instruction each cycle.  SimpleFetch
- * will stall if there's an Icache miss, but otherwise assumes a one cycle
- * Icache hit.
+ * DefaultFetch class handles both single threaded and SMT fetch. Its width is
+ * specified by the parameters; each cycle it tries to fetch that many
+ * instructions. It supports using a branch predictor to predict direction and
+ * targets.
+ * It supports the idling functionality of the CPU by indicating to the CPU
+ * when it is active and inactive.
  */
-
 template <class Impl>
-class SimpleFetch
+class DefaultFetch
 {
   public:
     /** Typedefs from Impl. */
@@ -55,56 +54,125 @@ class SimpleFetch
     typedef typename Impl::FullCPU FullCPU;
     typedef typename Impl::Params Params;
 
+    /** Typedefs from the CPU policy. */
     typedef typename CPUPol::BPredUnit BPredUnit;
     typedef typename CPUPol::FetchStruct FetchStruct;
     typedef typename CPUPol::TimeStruct TimeStruct;
 
     /** Typedefs from ISA. */
     typedef TheISA::MachInst MachInst;
+    typedef TheISA::ExtMachInst ExtMachInst;
 
   public:
-    enum Status {
+    /** Overall fetch status. Used to determine if the CPU can deschedule itself
+     * due to a lack of activity.
+     */
+    enum FetchStatus {
+        Active,
+        Inactive
+    };
+
+    /** Individual thread status. */
+    enum ThreadStatus {
         Running,
         Idle,
         Squashing,
         Blocked,
+        Fetching,
+        TrapPending,
+        QuiescePending,
         IcacheMissStall,
         IcacheMissComplete
     };
 
-    // May eventually need statuses on a per thread basis.
-    Status _status;
+    /** Fetching Policy, Add new policies here.*/
+    enum FetchPriority {
+        SingleThread,
+        RoundRobin,
+        Branch,
+        IQ,
+        LSQ
+    };
 
-    bool stalled;
+  private:
+    /** Fetch status. */
+    FetchStatus _status;
+
+    /** Per-thread status. */
+    ThreadStatus fetchStatus[Impl::MaxThreads];
+
+    /** Fetch policy. */
+    FetchPriority fetchPolicy;
+
+    /** List that has the threads organized by priority. */
+    std::list<unsigned> priorityList;
 
   public:
     class CacheCompletionEvent : public Event
     {
       private:
-        SimpleFetch *fetch;
+        MemReqPtr req;
+        /** Pointer to fetch. */
+        DefaultFetch *fetch;
+        /** Thread id. */
+//        unsigned threadId;
 
       public:
-        CacheCompletionEvent(SimpleFetch *_fetch);
+        /** Constructs a cache completion event, which tells fetch when the
+         * cache miss is complete.
+         */
+        CacheCompletionEvent(MemReqPtr &_req, DefaultFetch *_fetch);
 
+        /** Processes cache completion event. */
         virtual void process();
+        /** Returns the description of the cache completion event. */
         virtual const char *description();
     };
 
   public:
-    /** SimpleFetch constructor. */
-    SimpleFetch(Params &params);
+    /** DefaultFetch constructor. */
+    DefaultFetch(Params *params);
 
+    /** Returns the name of fetch. */
+    std::string name() const;
+
+    /** Registers statistics. */
     void regStats();
 
+    /** Sets CPU pointer. */
     void setCPU(FullCPU *cpu_ptr);
 
+    /** Sets the main backwards communication time buffer pointer. */
     void setTimeBuffer(TimeBuffer<TimeStruct> *time_buffer);
 
+    /** Sets pointer to list of active threads. */
+    void setActiveThreads(std::list<unsigned> *at_ptr);
+
+    /** Sets pointer to time buffer used to communicate to the next stage. */
     void setFetchQueue(TimeBuffer<FetchStruct> *fq_ptr);
 
-    void processCacheCompletion();
+    /** Sets pointer to page table. */
+//    void setPageTable(PageTable *pt_ptr);
+
+    /** Initialize stage. */
+    void initStage();
+
+    /** Processes cache completion event. */
+    void processCacheCompletion(MemReqPtr &req);
+
+    void wakeFromQuiesce();
 
   private:
+    /** Changes the status of this stage to active, and indicates this to the
+     * CPU.
+     */
+    inline void switchToActive();
+
+    /** Changes the status of this stage to inactive, and indicates this to the
+     * CPU.
+     */
+    inline void switchToInactive();
+
     /**
      * Looks up in the branch predictor to see if the next PC should be
      * either next PC+=MachInst or a branch target.
@@ -120,30 +188,76 @@ class SimpleFetch
      * fault that happened.  Puts the data into the class variable
      * cacheData.
      * @param fetch_PC The PC address that is being fetched from.
+     * @param ret_fault The fault reference that will be set to the result of
+     * the icache access.
+     * @param tid Thread id.
      * @return Any fault that occured.
      */
-    Fault fetchCacheLine(Addr fetch_PC);
+    bool fetchCacheLine(Addr fetch_PC, Fault &ret_fault, unsigned tid);
 
-    inline void doSquash(const Addr &new_PC);
+    /** Squashes a specific thread and resets the PC. */
+    inline void doSquash(const Addr &new_PC, unsigned tid);
 
-    void squashFromDecode(const Addr &new_PC, const InstSeqNum &seq_num);
+    /** Squashes a specific thread and resets the PC. Also tells the CPU to
+     * remove any instructions between fetch and decode that should be squashed.
+     */
+    void squashFromDecode(const Addr &new_PC, const InstSeqNum &seq_num,
+                          unsigned tid);
+
+    /** Checks if a thread is stalled. */
+    bool checkStall(unsigned tid) const;
+
+    /** Updates overall fetch stage status; to be called at the end of each
+     * cycle. */
+    FetchStatus updateFetchStatus();
 
   public:
-    // Figure out PC vs next PC and how it should be updated
-    void squash(const Addr &new_PC);
+    /** Squashes a specific thread and resets the PC. Also tells the CPU to
+     * remove any instructions that are not in the ROB. The source of this
+     * squash should be the commit stage.
+     */
+    void squash(const Addr &new_PC, unsigned tid);
 
+    /** Ticks the fetch stage, processing all input signals and fetching
+     * as many instructions as possible.
+     */
     void tick();
 
-    void fetch();
+    /** Checks all input signals and updates the status as necessary.
+     *  @return: Returns if the status has changed due to input signals.
+     */
+    bool checkSignalsAndUpdate(unsigned tid);
 
-    // Align an address (typically a PC) to the start of an I-cache block.
-    // We fold in the PISA 64- to 32-bit conversion here as well.
+    /** Does the actual fetching of instructions and passing them on to the
+     * next stage.
+     * @param status_change fetch() sets this variable if there was a status
+     * change (ie switching to IcacheMissStall).
+     */
+    void fetch(bool &status_change);
+
+    /** Align a PC to the start of an I-cache block. */
     Addr icacheBlockAlignPC(Addr addr)
     {
         addr = TheISA::realPCToFetchPC(addr);
         return (addr & ~(cacheBlkMask));
     }
 
+  private:
+    /** Returns the appropriate thread to fetch, given the fetch policy. */
+    int getFetchingThread(FetchPriority &fetch_priority);
+
+    /** Returns the appropriate thread to fetch using a round robin policy. */
+    int roundRobin();
+
+    /** Returns the appropriate thread to fetch using the IQ count policy. */
+    int iqCount();
+
+    /** Returns the appropriate thread to fetch using the LSQ count policy. */
+    int lsqCount();
+
+    /** Returns the appropriate thread to fetch using the branch count policy. */
+    int branchCount();
+
   private:
     /** Pointer to the FullCPU. */
     FullCPU *cpu;
@@ -176,8 +290,31 @@ class SimpleFetch
     /** BPredUnit. */
     BPredUnit branchPred;
 
+    Addr PC[Impl::MaxThreads];
+
+    Addr nextPC[Impl::MaxThreads];
+
     /** Memory request used to access cache. */
-    MemReqPtr memReq;
+    MemReqPtr memReq[Impl::MaxThreads];
+
+    /** Variable that tracks if fetch has written to the time buffer this
+     * cycle. Used to tell CPU if there is activity this cycle.
+     */
+    bool wroteToTimeBuffer;
+
+    /** Tracks how many instructions have been fetched this cycle. */
+    int numInst;
+
+    /** Source of possible stalls. */
+    struct Stalls {
+        bool decode;
+        bool rename;
+        bool iew;
+        bool commit;
+    };
+
+    /** Tracks which stages are telling fetch to stall. */
+    Stalls stalls[Impl::MaxThreads];
 
     /** Decode to fetch delay, in ticks. */
     unsigned decodeToFetchDelay;
@@ -201,23 +338,56 @@ class SimpleFetch
     Addr cacheBlkMask;
 
     /** The cache line being fetched. */
-    uint8_t *cacheData;
+    uint8_t *cacheData[Impl::MaxThreads];
 
     /** Size of instructions. */
     int instSize;
 
     /** Icache stall statistics. */
-    Counter lastIcacheStall;
+    Counter lastIcacheStall[Impl::MaxThreads];
+
+    /** List of Active Threads */
+    std::list<unsigned> *activeThreads;
+
+    /** Number of threads. */
+    unsigned numThreads;
 
+    /** Number of threads that are actively fetching. */
+    unsigned numFetchingThreads;
+
+    /** Thread ID being fetched. */
+    int threadFetched;
+
+    bool interruptPending;
+
+#if !FULL_SYSTEM
+    /** Page table pointer. */
+//    PageTable *pTable;
+#endif
+
+    // @todo: Consider making these vectors and tracking on a per thread basis.
+    /** Stat for total number of cycles stalled due to an icache miss. */
     Stats::Scalar<> icacheStallCycles;
+    /** Stat for total number of fetched instructions. */
     Stats::Scalar<> fetchedInsts;
+    /** Stat for total number of predicted branches. */
     Stats::Scalar<> predictedBranches;
+    /** Stat for total number of cycles spent fetching. */
     Stats::Scalar<> fetchCycles;
+    /** Stat for total number of cycles spent squashing. */
     Stats::Scalar<> fetchSquashCycles;
+    /** Stat for total number of cycles fetch was idle. Cycles spent blocked
+     * due to other stages in the pipeline are counted by fetchBlockedCycles.
+     */
+    Stats::Scalar<> fetchIdleCycles;
     Stats::Scalar<> fetchBlockedCycles;
+    /** Stat for total number of fetched cache lines. */
     Stats::Scalar<> fetchedCacheLines;
-
-    Stats::Distribution<> fetch_nisn_dist;
+    /** Distribution of number of instructions fetched each cycle. */
+    Stats::Distribution<> fetchNisnDist;
+    Stats::Formula idleRate;
+    Stats::Formula branchRate;
+    Stats::Formula fetchRate;
 };
 
-#endif //__CPU_O3_CPU_SIMPLE_FETCH_HH__
+#endif //__CPU_O3_FETCH_HH__
diff --git a/cpu/o3/fetch_impl.hh b/cpu/o3/fetch_impl.hh
index 8029fc732..7abc5733f 100644
--- a/cpu/o3/fetch_impl.hh
+++ b/cpu/o3/fetch_impl.hh
@@ -26,66 +26,101 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-// Remove this later; used only for debugging.
-#define OPCODE(X)                       (X >> 26) & 0x3f
-
 #include "arch/isa_traits.hh"
 #include "sim/byteswap.hh"
 #include "cpu/exetrace.hh"
+#include "cpu/o3/fetch.hh"
 #include "mem/base_mem.hh"
 #include "mem/mem_interface.hh"
 #include "mem/mem_req.hh"
-#include "cpu/o3/fetch.hh"
 
 #include "sim/root.hh"
 
+#if FULL_SYSTEM
+#include "base/remote_gdb.hh"
+#include "mem/functional/memory_control.hh"
+#include "mem/functional/physical.hh"
+#include "sim/system.hh"
+#include "arch/tlb.hh"
+#include "arch/vtophys.hh"
+#else // !FULL_SYSTEM
+#include "mem/functional/functional.hh"
+#endif // FULL_SYSTEM
+
+#include <algorithm>
+
+using namespace std;
+
 template<class Impl>
-SimpleFetch<Impl>::CacheCompletionEvent
-::CacheCompletionEvent(SimpleFetch *_fetch)
-    : Event(&mainEventQueue),
+DefaultFetch<Impl>::CacheCompletionEvent::CacheCompletionEvent(MemReqPtr &_req,
+                                                               DefaultFetch *_fetch)
+    : Event(&mainEventQueue, Delayed_Writeback_Pri),
+      req(_req),
       fetch(_fetch)
 {
+    this->setFlags(Event::AutoDelete);
 }
 
 template<class Impl>
 void
-SimpleFetch<Impl>::CacheCompletionEvent::process()
+DefaultFetch<Impl>::CacheCompletionEvent::process()
 {
-    fetch->processCacheCompletion();
+    fetch->processCacheCompletion(req);
 }
 
 template<class Impl>
 const char *
-SimpleFetch<Impl>::CacheCompletionEvent::description()
+DefaultFetch<Impl>::CacheCompletionEvent::description()
 {
-    return "SimpleFetch cache completion event";
+    return "DefaultFetch cache completion event";
 }
 
 template<class Impl>
-SimpleFetch<Impl>::SimpleFetch(Params &params)
-    : icacheInterface(params.icacheInterface),
+DefaultFetch<Impl>::DefaultFetch(Params *params)
+    : icacheInterface(params->icacheInterface),
       branchPred(params),
-      decodeToFetchDelay(params.decodeToFetchDelay),
-      renameToFetchDelay(params.renameToFetchDelay),
-      iewToFetchDelay(params.iewToFetchDelay),
-      commitToFetchDelay(params.commitToFetchDelay),
-      fetchWidth(params.fetchWidth)
+      decodeToFetchDelay(params->decodeToFetchDelay),
+      renameToFetchDelay(params->renameToFetchDelay),
+      iewToFetchDelay(params->iewToFetchDelay),
+      commitToFetchDelay(params->commitToFetchDelay),
+      fetchWidth(params->fetchWidth),
+      numThreads(params->numberOfThreads),
+      numFetchingThreads(params->smtNumFetchingThreads),
+      interruptPending(false)
 {
-    DPRINTF(Fetch, "Fetch: Fetch constructor called\n");
-
-    // Set status to idle.
-    _status = Idle;
-
-    // Create a new memory request.
-    memReq = new MemReq();
-    // Not sure of this parameter.  I think it should be based on the
-    // thread number.
-#if !FULL_SYSTEM
-    memReq->asid = 0;
-#else
-    memReq->asid = 0;
-#endif // FULL_SYSTEM
-    memReq->data = new uint8_t[64];
+    if (numThreads > Impl::MaxThreads)
+        fatal("numThreads is not a valid value\n");
+
+    DPRINTF(Fetch, "Fetch constructor called\n");
+
+    // Set fetch stage's status to inactive.
+    _status = Inactive;
+
+    string policy = params->smtFetchPolicy;
+
+    // Convert string to lowercase
+    std::transform(policy.begin(), policy.end(), policy.begin(),
+                   (int(*)(int)) tolower);
+
+    // Figure out fetch policy
+    if (policy == "singlethread") {
+        fetchPolicy = SingleThread;
+    } else if (policy == "roundrobin") {
+        fetchPolicy = RoundRobin;
+        DPRINTF(Fetch, "Fetch policy set to Round Robin\n");
+    } else if (policy == "branch") {
+        fetchPolicy = Branch;
+        DPRINTF(Fetch, "Fetch policy set to Branch Count\n");
+    } else if (policy == "iqcount") {
+        fetchPolicy = IQ;
+        DPRINTF(Fetch, "Fetch policy set to IQ count\n");
+    } else if (policy == "lsqcount") {
+        fetchPolicy = LSQ;
+        DPRINTF(Fetch, "Fetch policy set to LSQ count\n");
+    } else {
+        fatal("Invalid Fetch Policy. Options Are: {SingleThread,"
+              " RoundRobin,LSQcount,IQcount}\n");
+    }
 
     // Size of cache block.
     cacheBlkSize = icacheInterface ? icacheInterface->getBlockSize() : 64;
@@ -93,16 +128,45 @@ SimpleFetch<Impl>::SimpleFetch(Params &params)
     // Create mask to get rid of offset bits.
     cacheBlkMask = (cacheBlkSize - 1);
 
+    for (int tid=0; tid < numThreads; tid++) {
+
+        fetchStatus[tid] = Running;
+
+        priorityList.push_back(tid);
+
+        // No memory request yet; fetchCacheLine() allocates one per access.
+        memReq[tid] = NULL;
+//        memReq[tid] = new MemReq();
+/*
+        // Need a way of setting this correctly for parallel programs
+        // @todo: Figure out how to properly set asid vs thread_num.
+        memReq[tid]->asid = tid;
+        memReq[tid]->thread_num = tid;
+        memReq[tid]->data = new uint8_t[64];
+*/
+        // Create space to store a cache line.
+        cacheData[tid] = new uint8_t[cacheBlkSize];
+
+        stalls[tid].decode = 0;
+        stalls[tid].rename = 0;
+        stalls[tid].iew = 0;
+        stalls[tid].commit = 0;
+    }
+
     // Get the size of an instruction.
     instSize = sizeof(MachInst);
+}
 
-    // Create space to store a cache line.
-    cacheData = new uint8_t[cacheBlkSize];
+template <class Impl>
+std::string
+DefaultFetch<Impl>::name() const
+{
+    return cpu->name() + ".fetch";
 }
 
 template <class Impl>
 void
-SimpleFetch<Impl>::regStats()
+DefaultFetch<Impl>::regStats()
 {
     icacheStallCycles
         .name(name() + ".icacheStallCycles")
@@ -113,55 +177,88 @@ SimpleFetch<Impl>::regStats()
         .name(name() + ".fetchedInsts")
         .desc("Number of instructions fetch has processed")
         .prereq(fetchedInsts);
+
     predictedBranches
         .name(name() + ".predictedBranches")
         .desc("Number of branches that fetch has predicted taken")
         .prereq(predictedBranches);
+
     fetchCycles
         .name(name() + ".fetchCycles")
         .desc("Number of cycles fetch has run and was not squashing or"
               " blocked")
         .prereq(fetchCycles);
+
     fetchSquashCycles
         .name(name() + ".fetchSquashCycles")
         .desc("Number of cycles fetch has spent squashing")
         .prereq(fetchSquashCycles);
+
+    fetchIdleCycles
+        .name(name() + ".fetchIdleCycles")
+        .desc("Number of cycles fetch was idle")
+        .prereq(fetchIdleCycles);
+
     fetchBlockedCycles
         .name(name() + ".fetchBlockedCycles")
         .desc("Number of cycles fetch has spent blocked")
         .prereq(fetchBlockedCycles);
+
     fetchedCacheLines
         .name(name() + ".fetchedCacheLines")
         .desc("Number of cache lines fetched")
         .prereq(fetchedCacheLines);
 
-    fetch_nisn_dist
+    fetchNisnDist
         .init(/* base value */ 0,
               /* last value */ fetchWidth,
               /* bucket size */ 1)
-        .name(name() + ".FETCH:rate_dist")
+        .name(name() + ".rateDist")
         .desc("Number of instructions fetched each cycle (Total)")
-        .flags(Stats::pdf)
-        ;
+        .flags(Stats::pdf);
+
+    idleRate
+        .name(name() + ".idleRate")
+        .desc("Percent of cycles fetch was idle")
+        .prereq(idleRate);
+    idleRate = fetchIdleCycles * 100 / cpu->numCycles;
+
+    branchRate
+        .name(name() + ".branchRate")
+        .desc("Number of branch fetches per cycle")
+        .flags(Stats::total);
+    branchRate = predictedBranches / cpu->numCycles;
+
+    fetchRate
+        .name(name() + ".rate")
+        .desc("Number of inst fetches per cycle")
+        .flags(Stats::total);
+    fetchRate = fetchedInsts / cpu->numCycles;
 
     branchPred.regStats();
 }
 
 template<class Impl>
 void
-SimpleFetch<Impl>::setCPU(FullCPU *cpu_ptr)
+DefaultFetch<Impl>::setCPU(FullCPU *cpu_ptr)
 {
-    DPRINTF(Fetch, "Fetch: Setting the CPU pointer.\n");
+    DPRINTF(Fetch, "Setting the CPU pointer.\n");
     cpu = cpu_ptr;
-    // This line will be removed eventually.
-    memReq->xc = cpu->xcBase();
+
+    // Set ExecContexts for Memory Requests
+//    for (int tid=0; tid < numThreads; tid++)
+//        memReq[tid]->xc = cpu->xcBase(tid);
+
+    // Fetch needs to start fetching instructions at the very beginning,
+    // so it must start up in active state.
+    switchToActive();
 }
 
 template<class Impl>
 void
-SimpleFetch<Impl>::setTimeBuffer(TimeBuffer<TimeStruct> *time_buffer)
+DefaultFetch<Impl>::setTimeBuffer(TimeBuffer<TimeStruct> *time_buffer)
 {
-    DPRINTF(Fetch, "Fetch: Setting the time buffer pointer.\n");
+    DPRINTF(Fetch, "Setting the time buffer pointer.\n");
     timeBuffer = time_buffer;
 
     // Create wires to get information from proper places in time buffer.
@@ -173,32 +270,122 @@ SimpleFetch<Impl>::setTimeBuffer(TimeBuffer<TimeStruct> *time_buffer)
 
 template<class Impl>
 void
-SimpleFetch<Impl>::setFetchQueue(TimeBuffer<FetchStruct> *fq_ptr)
+DefaultFetch<Impl>::setActiveThreads(list<unsigned> *at_ptr)
+{
+    DPRINTF(Fetch, "Setting active threads list pointer.\n");
+    activeThreads = at_ptr;
+}
+
+template<class Impl>
+void
+DefaultFetch<Impl>::setFetchQueue(TimeBuffer<FetchStruct> *fq_ptr)
 {
-    DPRINTF(Fetch, "Fetch: Setting the fetch queue pointer.\n");
+    DPRINTF(Fetch, "Setting the fetch queue pointer.\n");
     fetchQueue = fq_ptr;
 
     // Create wire to write information to proper place in fetch queue.
     toDecode = fetchQueue->getWire(0);
 }
 
+#if 0
 template<class Impl>
 void
-SimpleFetch<Impl>::processCacheCompletion()
+DefaultFetch<Impl>::setPageTable(PageTable *pt_ptr)
 {
-    DPRINTF(Fetch, "Fetch: Waking up from cache miss.\n");
+    DPRINTF(Fetch, "Setting the page table pointer.\n");
+#if !FULL_SYSTEM
+    pTable = pt_ptr;
+#endif
+}
+#endif
+
+template<class Impl>
+void
+DefaultFetch<Impl>::initStage()
+{
+    for (int tid = 0; tid < numThreads; tid++) {
+        PC[tid] = cpu->readPC(tid);
+        nextPC[tid] = cpu->readNextPC(tid);
+    }
+}
+
+template<class Impl>
+void
+DefaultFetch<Impl>::processCacheCompletion(MemReqPtr &req)
+{
+    unsigned tid = req->thread_num;
+
+    DPRINTF(Fetch, "[tid:%u] Waking up from cache miss.\n",tid);
 
     // Only change the status if it's still waiting on the icache access
     // to return.
     // Can keep track of how many cache accesses go unused due to
     // misspeculation here.
-    if (_status == IcacheMissStall)
-        _status = IcacheMissComplete;
+    if (fetchStatus[tid] != IcacheMissStall ||
+        req != memReq[tid])
+        return;
+
+    // Wake up the CPU (if it went to sleep and was waiting on this completion
+    // event).
+    cpu->wakeCPU();
+
+    DPRINTF(Activity, "[tid:%u] Activating fetch due to cache completion\n",
+            tid);
+
+    switchToActive();
+
+    // Only switch to IcacheMissComplete if we're not stalled as well.
+    if (checkStall(tid)) {
+        fetchStatus[tid] = Blocked;
+    } else {
+        fetchStatus[tid] = IcacheMissComplete;
+    }
+
+//    memcpy(cacheData[tid], memReq[tid]->data, memReq[tid]->size);
+
+    // Clear this thread's outstanding memory request.
+    memReq[tid] = NULL;
+//    memReq[tid]->completionEvent = NULL;
+}
+
+template <class Impl>
+void
+DefaultFetch<Impl>::wakeFromQuiesce()
+{
+    DPRINTF(Fetch, "Waking up from quiesce\n");
+    // Hopefully this is safe; note that only thread 0 is woken.
+    fetchStatus[0] = Running;
+}
+
+template <class Impl>
+inline void
+DefaultFetch<Impl>::switchToActive()
+{
+    if (_status == Inactive) {
+        DPRINTF(Activity, "Activating stage.\n");
+
+        cpu->activateStage(FullCPU::FetchIdx);
+
+        _status = Active;
+    }
+}
+
+template <class Impl>
+inline void
+DefaultFetch<Impl>::switchToInactive()
+{
+    if (_status == Active) {
+        DPRINTF(Activity, "Deactivating stage.\n");
+
+        cpu->deactivateStage(FullCPU::FetchIdx);
+
+        _status = Inactive;
+    }
 }
 
 template <class Impl>
 bool
-SimpleFetch<Impl>::lookupAndUpdateNextPC(DynInstPtr &inst, Addr &next_PC)
+DefaultFetch<Impl>::lookupAndUpdateNextPC(DynInstPtr &inst, Addr &next_PC)
 {
     // Do branch prediction check here.
     // A bit of a misnomer...next_PC is actually the current PC until
@@ -211,7 +398,7 @@ SimpleFetch<Impl>::lookupAndUpdateNextPC(DynInstPtr &inst, Addr &next_PC)
         return false;
     }
 
-    predict_taken = branchPred.predict(inst, next_PC);
+    predict_taken = branchPred.predict(inst, next_PC, inst->threadNumber);
 
     if (predict_taken) {
         ++predictedBranches;
@@ -221,37 +408,48 @@ SimpleFetch<Impl>::lookupAndUpdateNextPC(DynInstPtr &inst, Addr &next_PC)
 }
 
 template <class Impl>
-Fault
-SimpleFetch<Impl>::fetchCacheLine(Addr fetch_PC)
+bool
+DefaultFetch<Impl>::fetchCacheLine(Addr fetch_PC, Fault &ret_fault, unsigned tid)
 {
     // Check if the instruction exists within the cache.
     // If it does, then proceed on to read the instruction and the rest
     // of the instructions in the cache line until either the end of the
     // cache line or a predicted taken branch is encountered.
+    Fault fault = NoFault;
 
 #if FULL_SYSTEM
     // Flag to say whether or not address is physical addr.
-    unsigned flags = cpu->inPalMode() ? PHYSICAL : 0;
+    unsigned flags = cpu->inPalMode(fetch_PC) ? PHYSICAL : 0;
 #else
     unsigned flags = 0;
 #endif // FULL_SYSTEM
 
-    Fault fault = NoFault;
+    if (interruptPending && flags == 0) {
+        // Hold off fetch from getting new instructions while an interrupt
+        // is pending.
+        return false;
+    }
 
     // Align the fetch PC so it's at the start of a cache block.
     fetch_PC = icacheBlockAlignPC(fetch_PC);
 
-    // Setup the memReq to do a read of the first isntruction's address.
+    // Setup the memReq to do a read of the first instruction's address.
     // Set the appropriate read size and flags as well.
-    memReq->cmd = Read;
-    memReq->reset(fetch_PC, cacheBlkSize, flags);
+    memReq[tid] = new MemReq();
 
-    // Translate the instruction request.
-    // Should this function be
-    // in the CPU class ?  Probably...ITB/DTB should exist within the
-    // CPU.
+    memReq[tid]->asid = tid;
+    memReq[tid]->thread_num = tid;
+    memReq[tid]->data = new uint8_t[64];
+    memReq[tid]->xc = cpu->xcBase(tid);
+    memReq[tid]->cmd = Read;
+    memReq[tid]->reset(fetch_PC, cacheBlkSize, flags);
 
-    fault = cpu->translateInstReq(memReq);
+    // Translate the instruction request.
+//#if FULL_SYSTEM
+    fault = cpu->translateInstReq(memReq[tid]);
+//#else
+//    fault = pTable->translate(memReq[tid]);
+//#endif
 
     // In the case of faults, the fetch stage may need to stall and wait
     // on what caused the fetch (ITB or Icache miss).
@@ -259,213 +457,416 @@ SimpleFetch<Impl>::fetchCacheLine(Addr fetch_PC)
     // If translation was successful, attempt to read the first
     // instruction.
     if (fault == NoFault) {
+        if (cpu->system->memctrl->badaddr(memReq[tid]->paddr)) {
+            DPRINTF(Fetch, "Fetch: Bad address %#x (hopefully on a "
+                    "misspeculating path!",
+                    memReq[tid]->paddr);
+            ret_fault = TheISA::genMachineCheckFault();
+            return false;
+        }
+
         DPRINTF(Fetch, "Fetch: Doing instruction read.\n");
-        fault = cpu->mem->read(memReq, cacheData);
+        fault = cpu->mem->read(memReq[tid], cacheData[tid]);
         // This read may change when the mem interface changes.
 
-        fetchedCacheLines++;
-    }
+        // Now do the timing access to see whether or not the instruction
+        // exists within the cache.
+        if (icacheInterface && !icacheInterface->isBlocked()) {
+            DPRINTF(Fetch, "Doing cache access.\n");
+
+            memReq[tid]->completionEvent = NULL;
+
+            memReq[tid]->time = curTick;
+
+            MemAccessResult result = icacheInterface->access(memReq[tid]);
 
-    // Now do the timing access to see whether or not the instruction
-    // exists within the cache.
-    if (icacheInterface && fault == NoFault) {
-        DPRINTF(Fetch, "Fetch: Doing timing memory access.\n");
-        memReq->completionEvent = NULL;
+            // If the cache missed, then schedule an event to wake
+            // up this stage once the cache miss completes.
+            // @todo: Possibly allow for longer than 1 cycle cache hits.
+            if (result != MA_HIT && icacheInterface->doEvents()) {
 
-        memReq->time = curTick;
+                memReq[tid]->completionEvent =
+                    new CacheCompletionEvent(memReq[tid], this);
 
-        MemAccessResult result = icacheInterface->access(memReq);
+                lastIcacheStall[tid] = curTick;
 
-        // If the cache missed (in this model functional and timing
-        // memories are different), then schedule an event to wake
-        // up this stage once the cache miss completes.
-        if (result != MA_HIT && icacheInterface->doEvents()) {
-            memReq->completionEvent = new CacheCompletionEvent(this);
+                DPRINTF(Activity, "[tid:%i]: Activity: Stalling due to I-cache "
+                        "miss.\n", tid);
 
-            // How does current model work as far as individual
-            // stages scheduling/unscheduling?
-            // Perhaps have only the main CPU scheduled/unscheduled,
-            // and have it choose what stages to run appropriately.
+                fetchStatus[tid] = IcacheMissStall;
+            } else {
+                DPRINTF(Fetch, "[tid:%i]: I-Cache hit. Doing Instruction "
+                        "read.\n", tid);
 
-            DPRINTF(Fetch, "Fetch: Stalling due to icache miss.\n");
-            _status = IcacheMissStall;
+//                memcpy(cacheData[tid], memReq[tid]->data, memReq[tid]->size);
+
+                fetchedCacheLines++;
+            }
+        } else {
+            DPRINTF(Fetch, "[tid:%i] Out of MSHRs!\n", tid);
+            ret_fault = NoFault;
+            return false;
         }
     }
 
-    return fault;
+    ret_fault = fault;
+    return true;
 }
 
 template <class Impl>
 inline void
-SimpleFetch<Impl>::doSquash(const Addr &new_PC)
+DefaultFetch<Impl>::doSquash(const Addr &new_PC, unsigned tid)
 {
-    DPRINTF(Fetch, "Fetch: Squashing, setting PC to: %#x.\n", new_PC);
+    DPRINTF(Fetch, "[tid:%i]: Squashing, setting PC to: %#x.\n",
+            tid, new_PC);
 
-    cpu->setNextPC(new_PC + instSize);
-    cpu->setPC(new_PC);
+    PC[tid] = new_PC;                 // Redirect this thread's fetch PC...
+    nextPC[tid] = new_PC + instSize;  // ...and its sequential successor.
 
     // Clear the icache miss if it's outstanding.
-    if (_status == IcacheMissStall && icacheInterface) {
-        DPRINTF(Fetch, "Fetch: Squashing outstanding Icache miss.\n");
-        // @todo: Use an actual thread number here.
-        icacheInterface->squash(0);
+    if (fetchStatus[tid] == IcacheMissStall && icacheInterface) {
+        DPRINTF(Fetch, "[tid:%i]: Squashing outstanding Icache miss.\n",
+                tid);
+//        icacheInterface->squash(tid);
+/* Disabled: explicit clean-up of the miss's completion event.
+        if (memReq[tid]->completionEvent) {
+            if (memReq[tid]->completionEvent->scheduled()) {
+                memReq[tid]->completionEvent->squash();
+            } else {
+                delete memReq[tid]->completionEvent;
+                memReq[tid]->completionEvent = NULL;
+            }
+        }
+*/
+        memReq[tid] = NULL;  // Only the stage's handle on the request is reset.
+    }
+
+    if (fetchStatus[tid] == TrapPending) {
+        // @todo: Hardcoded number here (the 5-entry look-back in the loop).
+
+        // This is only effective if communication to and from commit
+        // is identical.  If it's faster to commit than it is from
+        // commit to here, then it causes problems.
+
+        bool found_fault = false;  // Set once any queued fault is cleared.
+        for (int i = 0; i > -5; --i) {
+            if (fetchQueue->access(i)->fetchFault) {
+                DPRINTF(Fetch, "[tid:%i]: Fetch used to be in a trap, "
+                        "clearing it.\n",
+                        tid);
+                fetchQueue->access(i)->fetchFault = NoFault;
+                found_fault = true;
+            }
+        }
+        if (!found_fault) {
+            warn("%lli Fault from fetch not found in time buffer!",
+                 curTick);
+        }
+        toDecode->clearFetchFault = true;
     }
 
-    _status = Squashing;
+    fetchStatus[tid] = Squashing;  // Per-thread status, not the stage's.
 
     ++fetchSquashCycles;
 }
 
 template<class Impl>
 void
-SimpleFetch<Impl>::squashFromDecode(const Addr &new_PC,
-                                    const InstSeqNum &seq_num)
+DefaultFetch<Impl>::squashFromDecode(const Addr &new_PC,
+                                    const InstSeqNum &seq_num,
+                                    unsigned tid)
 {
-    DPRINTF(Fetch, "Fetch: Squashing from decode.\n");
+    DPRINTF(Fetch, "[tid:%i]: Squashing from decode.\n",tid);
 
-    doSquash(new_PC);
+    doSquash(new_PC, tid);  // Redirects PC[tid]; tid becomes Squashing.
 
     // Tell the CPU to remove any instructions that are in flight between
     // fetch and decode.
-    cpu->removeInstsUntil(seq_num);
+    cpu->removeInstsUntil(seq_num, tid);
+}
+
+template<class Impl>
+bool
+DefaultFetch<Impl>::checkStall(unsigned tid) const
+{
+    bool ret_val = false;  // Stays false when nothing is stalling tid.
+    // Only the first matching stall source is traced; any one returns true.
+    if (cpu->contextSwitch) {
+        DPRINTF(Fetch,"[tid:%i]: Stalling for a context switch.\n",tid);
+        ret_val = true;
+    } else if (stalls[tid].decode) {
+        DPRINTF(Fetch,"[tid:%i]: Stall from Decode stage detected.\n",tid);
+        ret_val = true;
+    } else if (stalls[tid].rename) {
+        DPRINTF(Fetch,"[tid:%i]: Stall from Rename stage detected.\n",tid);
+        ret_val = true;
+    } else if (stalls[tid].iew) {
+        DPRINTF(Fetch,"[tid:%i]: Stall from IEW stage detected.\n",tid);
+        ret_val = true;
+    } else if (stalls[tid].commit) {
+        DPRINTF(Fetch,"[tid:%i]: Stall from Commit stage detected.\n",tid);
+        ret_val = true;
+    }
+
+    return ret_val;
+}
+
+template<class Impl>
+typename DefaultFetch<Impl>::FetchStatus
+DefaultFetch<Impl>::updateFetchStatus()
+{
+    // Computes the stage-wide status (Active/Inactive) from per-thread state.
+    list<unsigned>::iterator threads = (*activeThreads).begin();
+
+    while (threads != (*activeThreads).end()) {
+
+        unsigned tid = *threads++;
+
+        if (fetchStatus[tid] == Running ||
+            fetchStatus[tid] == Squashing ||
+            fetchStatus[tid] == IcacheMissComplete) {
+
+            if (_status == Inactive) {
+                DPRINTF(Activity, "[tid:%i]: Activating stage.\n",tid);
+
+                if (fetchStatus[tid] == IcacheMissComplete) {
+                    DPRINTF(Activity, "[tid:%i]: Activating fetch due to cache"
+                            " completion\n",tid);
+                }
+
+                cpu->activateStage(FullCPU::FetchIdx);
+            }
+
+            return Active;
+        }
+    }
+    // No thread qualified.  Note _status is only read here, never assigned.
+    // Stage is switching from active to inactive, notify CPU of it.
+    if (_status == Active) {
+        DPRINTF(Activity, "Deactivating stage.\n");
+
+        cpu->deactivateStage(FullCPU::FetchIdx);
+    }
+
+    return Inactive;
 }
 
 template <class Impl>
 void
-SimpleFetch<Impl>::squash(const Addr &new_PC)
+DefaultFetch<Impl>::squash(const Addr &new_PC, unsigned tid)
 {
-    DPRINTF(Fetch, "Fetch: Squash from commit.\n");
+    DPRINTF(Fetch, "[tid:%u]: Squash from commit.\n",tid);
 
-    doSquash(new_PC);
+    doSquash(new_PC, tid);  // Redirects PC[tid]; tid becomes Squashing.
 
     // Tell the CPU to remove any instructions that are not in the ROB.
-    cpu->removeInstsNotInROB();
+    cpu->removeInstsNotInROB(tid);
 }
 
-template<class Impl>
+template <class Impl>
 void
-SimpleFetch<Impl>::tick()
+DefaultFetch<Impl>::tick()
 {
+    list<unsigned>::iterator threads = (*activeThreads).begin();
+    bool status_change = false;
+    // Cleared every tick; tested at the end to report activity to the CPU.
+    wroteToTimeBuffer = false;
+
+    while (threads != (*activeThreads).end()) {
+        unsigned tid = *threads++;
+
+        // Check the signals for each thread to determine the proper status
+        // for each thread.
+        bool updated_status = checkSignalsAndUpdate(tid);
+        status_change =  status_change || updated_status;
+    }
+
+    DPRINTF(Fetch, "Running stage.\n");
+
+    // Reset the count of instructions fetched this cycle.
+    numInst = 0;
+    // NOTE(review): interrupt signals are read from thread 0's slot only.
+    if (fromCommit->commitInfo[0].interruptPending) {
+        interruptPending = true;
+    }
+    if (fromCommit->commitInfo[0].clearInterrupt) {
+        interruptPending = false;
+    }
+
+    for (threadFetched = 0; threadFetched < numFetchingThreads;
+         threadFetched++) {
+        // Fetch each of the actively fetching threads.
+        fetch(status_change);
+    }
+
+    // Record number of instructions fetched this cycle for distribution.
+    fetchNisnDist.sample(numInst);
+
+    if (status_change) {
+        // Change the fetch stage status if there was a status change.
+        _status = updateFetchStatus();
+    }
+
+    // If there was activity this cycle, inform the CPU of it.
+    if (wroteToTimeBuffer || cpu->contextSwitch) {
+        DPRINTF(Activity, "Activity this cycle.\n");
+
+        cpu->activityThisCycle();
+    }
+}
+
+template <class Impl>
+bool
+DefaultFetch<Impl>::checkSignalsAndUpdate(unsigned tid)
+{
+    // Returns true iff fetchStatus[tid] was (re)assigned.  First, the stalls:
+    if (fromDecode->decodeBlock[tid]) {
+        stalls[tid].decode = true;
+    }
+
+    if (fromDecode->decodeUnblock[tid]) {
+        assert(stalls[tid].decode);
+        assert(!fromDecode->decodeBlock[tid]);
+        stalls[tid].decode = false;
+    }
+
+    if (fromRename->renameBlock[tid]) {
+        stalls[tid].rename = true;
+    }
+
+    if (fromRename->renameUnblock[tid]) {
+        assert(stalls[tid].rename);
+        assert(!fromRename->renameBlock[tid]);
+        stalls[tid].rename = false;
+    }
+
+    if (fromIEW->iewBlock[tid]) {
+        stalls[tid].iew = true;
+    }
+
+    if (fromIEW->iewUnblock[tid]) {
+        assert(stalls[tid].iew);
+        assert(!fromIEW->iewBlock[tid]);
+        stalls[tid].iew = false;
+    }
+
+    if (fromCommit->commitBlock[tid]) {
+        stalls[tid].commit = true;
+    }
+
+    if (fromCommit->commitUnblock[tid]) {
+        assert(stalls[tid].commit);
+        assert(!fromCommit->commitBlock[tid]);
+        stalls[tid].commit = false;
+    }
+
     // Check squash signals from commit.
-    if (fromCommit->commitInfo.squash) {
-        DPRINTF(Fetch, "Fetch: Squashing instructions due to squash "
-                "from commit.\n");
+    if (fromCommit->commitInfo[tid].squash) {
+
+        DPRINTF(Fetch, "[tid:%u]: Squashing instructions due to squash "
+                "from commit.\n",tid);
 
         // In any case, squash.
-        squash(fromCommit->commitInfo.nextPC);
+        squash(fromCommit->commitInfo[tid].nextPC,tid);
 
         // Also check if there's a mispredict that happened.
-        if (fromCommit->commitInfo.branchMispredict) {
-            branchPred.squash(fromCommit->commitInfo.doneSeqNum,
-                              fromCommit->commitInfo.nextPC,
-                              fromCommit->commitInfo.branchTaken);
+        if (fromCommit->commitInfo[tid].branchMispredict) {
+            branchPred.squash(fromCommit->commitInfo[tid].doneSeqNum,
+                              fromCommit->commitInfo[tid].nextPC,
+                              fromCommit->commitInfo[tid].branchTaken,
+                              tid);
         } else {
-            branchPred.squash(fromCommit->commitInfo.doneSeqNum);
+            branchPred.squash(fromCommit->commitInfo[tid].doneSeqNum,
+                              tid);
         }
 
-        return;
-    } else if (fromCommit->commitInfo.doneSeqNum) {
+        return true;
+    } else if (fromCommit->commitInfo[tid].doneSeqNum) {
         // Update the branch predictor if it wasn't a squashed instruction
-        // that was braodcasted.
-        branchPred.update(fromCommit->commitInfo.doneSeqNum);
+        // that was broadcast.
+        branchPred.update(fromCommit->commitInfo[tid].doneSeqNum, tid);
     }
 
     // Check ROB squash signals from commit.
-    if (fromCommit->commitInfo.robSquashing) {
-        DPRINTF(Fetch, "Fetch: ROB is still squashing.\n");
+    if (fromCommit->commitInfo[tid].robSquashing) {
+        DPRINTF(Fetch, "[tid:%u]: ROB is still squashing.\n", tid);
 
         // Continue to squash.
-        _status = Squashing;
+        fetchStatus[tid] = Squashing;
 
-        ++fetchSquashCycles;
-        return;
+        return true;
     }
 
     // Check squash signals from decode.
-    if (fromDecode->decodeInfo.squash) {
-        DPRINTF(Fetch, "Fetch: Squashing instructions due to squash "
-                "from decode.\n");
+    if (fromDecode->decodeInfo[tid].squash) {
+        DPRINTF(Fetch, "[tid:%u]: Squashing instructions due to squash "
+                "from decode.\n",tid);
 
         // Update the branch predictor.
-        if (fromDecode->decodeInfo.branchMispredict) {
-            branchPred.squash(fromDecode->decodeInfo.doneSeqNum,
-                              fromDecode->decodeInfo.nextPC,
-                              fromDecode->decodeInfo.branchTaken);
+        if (fromDecode->decodeInfo[tid].branchMispredict) {
+            branchPred.squash(fromDecode->decodeInfo[tid].doneSeqNum,
+                              fromDecode->decodeInfo[tid].nextPC,
+                              fromDecode->decodeInfo[tid].branchTaken,
+                              tid);
         } else {
-            branchPred.squash(fromDecode->decodeInfo.doneSeqNum);
+            branchPred.squash(fromDecode->decodeInfo[tid].doneSeqNum,
+                              tid);
         }
 
-        if (_status != Squashing) {
-            // Squash unless we're already squashing?
-            squashFromDecode(fromDecode->decodeInfo.nextPC,
-                             fromDecode->decodeInfo.doneSeqNum);
-            return;
+        if (fetchStatus[tid] != Squashing) {
+            // Squash unless we're already squashing
+            squashFromDecode(fromDecode->decodeInfo[tid].nextPC,
+                             fromDecode->decodeInfo[tid].doneSeqNum,
+                             tid);
+
+            return true;
         }
     }
 
-    // Check if any of the stall signals are high.
-    if (fromDecode->decodeInfo.stall ||
-        fromRename->renameInfo.stall ||
-        fromIEW->iewInfo.stall ||
-        fromCommit->commitInfo.stall)
-    {
-        // Block stage, regardless of current status.
-
-        DPRINTF(Fetch, "Fetch: Stalling stage.\n");
-        DPRINTF(Fetch, "Fetch: Statuses: Decode: %i Rename: %i IEW: %i "
-                "Commit: %i\n",
-                fromDecode->decodeInfo.stall,
-                fromRename->renameInfo.stall,
-                fromIEW->iewInfo.stall,
-                fromCommit->commitInfo.stall);
+    if (checkStall(tid) && fetchStatus[tid] != IcacheMissStall) {
+        DPRINTF(Fetch, "[tid:%i]: Setting to blocked\n",tid);
 
-        _status = Blocked;
-
-        ++fetchBlockedCycles;
-        return;
-    } else if (_status == Blocked) {
-        // Unblock stage if status is currently blocked and none of the
-        // stall signals are being held high.
-        _status = Running;
+        fetchStatus[tid] = Blocked;
 
-        ++fetchBlockedCycles;
-        return;
+        return true;
     }
 
-    // If fetch has reached this point, then there are no squash signals
-    // still being held high.  Check if fetch is in the squashing state;
-    // if so, fetch can switch to running.
-    // Similarly, there are no blocked signals still being held high.
-    // Check if fetch is in the blocked state; if so, fetch can switch to
-    // running.
-    if (_status == Squashing) {
-        DPRINTF(Fetch, "Fetch: Done squashing, switching to running.\n");
-
-        // Switch status to running
-        _status = Running;
-
-        ++fetchCycles;
-
-        fetch();
-    } else if (_status != IcacheMissStall) {
-        DPRINTF(Fetch, "Fetch: Running stage.\n");
+    if (fetchStatus[tid] == Blocked ||
+        fetchStatus[tid] == Squashing) {
+        // Switch status to running if fetch isn't being told to block or
+        // squash this cycle.
+        DPRINTF(Fetch, "[tid:%i]: Done squashing, switching to running.\n",
+                tid);
 
-        ++fetchCycles;
+        fetchStatus[tid] = Running;
 
-        fetch();
+        return true;
     }
+
+    // If we've reached this point, we have not gotten any signals that
+    // cause fetch to change its status.  Fetch remains the same as before.
+    return false;
 }
 
 template<class Impl>
 void
-SimpleFetch<Impl>::fetch()
+DefaultFetch<Impl>::fetch(bool &status_change)
 {
     //////////////////////////////////////////
     // Start actual fetch
     //////////////////////////////////////////
+    int tid = getFetchingThread(fetchPolicy);
+
+    if (tid == -1) {
+        DPRINTF(Fetch,"There are no more threads available to fetch from.\n");
+
+        // Breaks looping condition in tick()
+        threadFetched = numFetchingThreads;
+        return;
+    }
 
     // The current PC.
-    Addr fetch_PC = cpu->readPC();
+    Addr &fetch_PC = PC[tid];
 
     // Fault code for memory access.
     Fault fault = NoFault;
@@ -473,45 +874,54 @@ SimpleFetch<Impl>::fetch()
     // If returning from the delay of a cache miss, then update the status
     // to running, otherwise do the cache access.  Possibly move this up
     // to tick() function.
-    if (_status == IcacheMissComplete) {
-        DPRINTF(Fetch, "Fetch: Icache miss is complete.\n");
-
-        // Reset the completion event to NULL.
-        memReq->completionEvent = NULL;
-
-        _status = Running;
+    if (fetchStatus[tid] == IcacheMissComplete) {
+        DPRINTF(Fetch, "[tid:%i]: Icache miss is complete.\n",
+                tid);
+
+        fetchStatus[tid] = Running;
+        status_change = true;
+    } else if (fetchStatus[tid] == Running) {
+        DPRINTF(Fetch, "[tid:%i]: Attempting to translate and read "
+                "instruction, starting at PC %08p.\n",
+                tid, fetch_PC);
+
+        bool fetch_success = fetchCacheLine(fetch_PC, fault, tid);
+        if (!fetch_success)
+            return;
     } else {
-        DPRINTF(Fetch, "Fetch: Attempting to translate and read "
-                       "instruction, starting at PC %08p.\n",
-                fetch_PC);
+        if (fetchStatus[tid] == Blocked) {
+            ++fetchBlockedCycles;
+        } else if (fetchStatus[tid] == Squashing) {
+            ++fetchSquashCycles;
+        }
 
-        fault = fetchCacheLine(fetch_PC);
+        // Status is neither Running nor IcacheMissComplete (e.g. Idle,
+        // Squashing, Blocked, IcacheMissStall), so fetch should do nothing.
+        return;
     }
 
-    // If we had a stall due to an icache miss, then return.  It'd
-    // be nicer if this were handled through the kind of fault that
-    // is returned by the function.
-    if (_status == IcacheMissStall) {
+    ++fetchCycles;
+
+    // If we had a stall due to an icache miss, then return.
+    if (fetchStatus[tid] == IcacheMissStall) {
+        status_change = true;
         return;
     }
 
-    // As far as timing goes, the CPU will need to send an event through
-    // the MemReq in order to be woken up once the memory access completes.
-    // Probably have a status on a per thread basis so each thread can
-    // block independently and be woken up independently.
-
     Addr next_PC = fetch_PC;
     InstSeqNum inst_seq;
     MachInst inst;
-    unsigned offset = fetch_PC & cacheBlkMask;
-    unsigned fetched;
+    ExtMachInst ext_inst;
+    // @todo: Fix this hack (clears the low two bits, 4-byte aligning offset).
+    unsigned offset = (fetch_PC & cacheBlkMask) & ~3;
 
     if (fault == NoFault) {
         // If the read of the first instruction was successful, then grab the
         // instructions from the rest of the cache line and put them into the
         // queue heading to decode.
 
-        DPRINTF(Fetch, "Fetch: Adding instructions to queue to decode.\n");
+        DPRINTF(Fetch, "[tid:%i]: Adding instructions to queue to "
+                "decode.\n",tid);
 
         //////////////////////////
         // Fetch first instruction
@@ -521,12 +931,11 @@ SimpleFetch<Impl>::fetch()
         // ended this fetch block.
         bool predicted_branch = false;
 
-        for (fetched = 0;
+        for (;
              offset < cacheBlkSize &&
-                 fetched < fetchWidth &&
+                 numInst < fetchWidth &&
                  !predicted_branch;
-             ++fetched)
-        {
+             ++numInst) {
 
             // Get a sequence number.
             inst_seq = cpu->getAndIncrementInstSeq();
@@ -536,31 +945,40 @@ SimpleFetch<Impl>::fetch()
 
             // Get the instruction from the array of the cache line.
             inst = gtoh(*reinterpret_cast<MachInst *>
-                        (&cacheData[offset]));
+                        (&cacheData[tid][offset]));
+
+            ext_inst = TheISA::makeExtMI(inst, fetch_PC);
 
             // Create a new DynInst from the instruction fetched.
-            DynInstPtr instruction = new DynInst(inst, fetch_PC, next_PC,
+            DynInstPtr instruction = new DynInst(ext_inst, fetch_PC,
+                                                 next_PC,
                                                  inst_seq, cpu);
+            instruction->setThread(tid);
+
+            instruction->setASID(tid);
 
-            DPRINTF(Fetch, "Fetch: Instruction %i created, with PC %#x\n",
-                    inst_seq, instruction->readPC());
+            instruction->setState(cpu->thread[tid]);
 
-            DPRINTF(Fetch, "Fetch: Instruction opcode is: %03p\n",
-                    OPCODE(inst));
+            DPRINTF(Fetch, "[tid:%i]: Instruction PC %#x created "
+                    "[sn:%lli]\n",
+                    tid, instruction->readPC(), inst_seq);
+
+            DPRINTF(Fetch, "[tid:%i]: Instruction is: %s\n",
+                    tid, instruction->staticInst->disassemble(fetch_PC));
 
             instruction->traceData =
-                Trace::getInstRecord(curTick, cpu->xcBase(), cpu,
+                Trace::getInstRecord(curTick, cpu->xcBase(tid), cpu,
                                      instruction->staticInst,
-                                     instruction->readPC(), 0);
+                                     instruction->readPC(),tid);
 
             predicted_branch = lookupAndUpdateNextPC(instruction, next_PC);
 
             // Add instruction to the CPU's list of instructions.
-            cpu->addInst(instruction);
+            instruction->setInstListIt(cpu->addInst(instruction));
 
             // Write the instruction to the first slot in the queue
             // that heads to decode.
-            toDecode->insts[fetched] = instruction;
+            toDecode->insts[numInst] = instruction;
 
             toDecode->size++;
 
@@ -570,27 +988,36 @@ SimpleFetch<Impl>::fetch()
             // Move to the next instruction, unless we have a branch.
             fetch_PC = next_PC;
 
+            if (instruction->isQuiesce()) {
+                warn("%lli: Quiesce instruction encountered, halting fetch!", curTick);
+                fetchStatus[tid] = QuiescePending;
+                ++numInst;
+                status_change = true;
+                break;
+            }
+
             offset+= instSize;
         }
+    }
 
-        fetch_nisn_dist.sample(fetched);
+    if (numInst > 0) {
+        wroteToTimeBuffer = true;
     }
 
     // Now that fetching is completed, update the PC to signify what the next
-    // cycle will be.  Might want to move this to the beginning of this
-    // function so that the PC updates at the beginning of everything.
-    // Or might want to leave setting the PC to the main CPU, with fetch
-    // only changing the nextPC (will require correct determination of
-    // next PC).
+    // cycle will be.
     if (fault == NoFault) {
-        DPRINTF(Fetch, "Fetch: Setting PC to %08p.\n", next_PC);
-        cpu->setPC(next_PC);
-        cpu->setNextPC(next_PC + instSize);
+
+        DPRINTF(Fetch, "[tid:%i]: Setting PC to %08p.\n",tid, next_PC);
+
+
+        PC[tid] = next_PC;
+        nextPC[tid] = next_PC + instSize;
     } else {
         // If the issue was an icache miss, then we can just return and
         // wait until it is handled.
-        if (_status == IcacheMissStall) {
-            return;
+        if (fetchStatus[tid] == IcacheMissStall) {
+            panic("Fetch should have exited prior to this!");
         }
 
         // Handle the fault.
@@ -601,17 +1028,169 @@ SimpleFetch<Impl>::fetch()
         // have it handled by the upper level CPU class which peeks into the
         // time buffer and sees if a squash comes along, in which case it
         // changes the status.
+#if FULL_SYSTEM
+        // Tell the commit stage the fault we had.
+        toDecode->fetchFault = fault;
+        toDecode->fetchFaultSN = cpu->globalSeqNum;
 
-        DPRINTF(Fetch, "Fetch: Blocked, need to handle the trap.\n");
+        DPRINTF(Fetch, "[tid:%i]: Blocked, need to handle the trap.\n",tid);
 
-        _status = Blocked;
-#if FULL_SYSTEM
+        fetchStatus[tid] = TrapPending;
+        status_change = true;
+
+        warn("%lli fault (%d) detected @ PC %08p", curTick, fault, PC[tid]);
 //        cpu->trap(fault);
         // Send a signal to the ROB indicating that there's a trap from the
         // fetch stage that needs to be handled.  Need to indicate that
         // there's a fault, and the fault type.
 #else // !FULL_SYSTEM
-        fatal("fault (%d) detected @ PC %08p", fault, cpu->readPC());
+        fatal("fault (%d) detected @ PC %08p", fault, PC[tid]);
 #endif // FULL_SYSTEM
     }
 }
+
+
+///////////////////////////////////////
+//                                   //
+//  SMT FETCH POLICY MAINTAINED HERE //
+//                                   //
+///////////////////////////////////////
+template<class Impl>
+int
+DefaultFetch<Impl>::getFetchingThread(FetchPriority &fetch_priority)
+{
+    // Returns the id of the thread to fetch from this cycle, or -1 when no
+    // thread can fetch.
+    if (numThreads > 1) {
+        switch (fetch_priority) {
+          case SingleThread:
+            return 0;
+
+          case RoundRobin:
+            return roundRobin();
+
+          case IQ:
+            return iqCount();
+
+          case LSQ:
+            return lsqCount();
+
+          case Branch:
+            return branchCount();
+
+          default:
+            return -1;
+        }
+    } else {
+        // With no active thread begin() == end() and must not be dereferenced.
+        if ((*activeThreads).empty())
+            return -1;
+
+        int tid = *((*activeThreads).begin());
+
+        return (fetchStatus[tid] == Running ||
+                fetchStatus[tid] == IcacheMissComplete ||
+                fetchStatus[tid] == Idle) ? tid : -1;
+    }
+}
+
+
+template<class Impl>
+int
+DefaultFetch<Impl>::roundRobin()
+{
+    list<unsigned>::iterator pri_iter = priorityList.begin();
+    list<unsigned>::iterator end      = priorityList.end();
+
+    int high_pri;
+    // Walk the priority list front to back; -1 means no thread can fetch.
+    while (pri_iter != end) {
+        high_pri = *pri_iter;
+
+        // A thread id equal to numThreads is out of range: bound is strict.
+        assert(high_pri < numThreads);
+        if (fetchStatus[high_pri] == Running ||
+            fetchStatus[high_pri] == IcacheMissComplete ||
+            fetchStatus[high_pri] == Idle) {
+            // Rotate the chosen thread to the back of the priority list.
+            priorityList.erase(pri_iter);
+            priorityList.push_back(high_pri);
+
+            return high_pri;
+        }
+
+        pri_iter++;
+    }
+
+    return -1;
+}
+
+template<class Impl>
+int
+DefaultFetch<Impl>::iqCount()
+{
+    // Returns the fetchable thread with the fewest IQ entries, or -1 if no
+    // thread can fetch.  The winner's thread id is tracked next to its count
+    // so that a count is never used as an index into fetchStatus.
+    int best_tid = -1;
+    unsigned best_count = 0;
+    list<unsigned>::iterator threads = (*activeThreads).begin();
+
+    while (threads != (*activeThreads).end()) {
+        unsigned tid = *threads++;
+
+        // Threads that cannot fetch this cycle are not candidates.
+        if (fetchStatus[tid] != Running &&
+            fetchStatus[tid] != IcacheMissComplete &&
+            fetchStatus[tid] != Idle)
+            continue;
+
+        unsigned count = fromIEW->iewInfo[tid].iqCount;
+        if (best_tid == -1 || count < best_count) {
+            best_tid = tid;
+            best_count = count;
+        }
+    }
+
+    return best_tid;
+}
+
+template<class Impl>
+int
+DefaultFetch<Impl>::lsqCount()
+{
+    // Returns the fetchable thread with the fewest LSQ entries, or -1 if no
+    // thread can fetch.  The winner's thread id is tracked next to its count
+    // so that a count is never used as an index into fetchStatus.
+    int best_tid = -1;
+    unsigned best_count = 0;
+
+    list<unsigned>::iterator threads = (*activeThreads).begin();
+
+    while (threads != (*activeThreads).end()) {
+        unsigned tid = *threads++;
+
+        // Threads that cannot fetch this cycle are not candidates.
+        if (fetchStatus[tid] != Running &&
+            fetchStatus[tid] != IcacheMissComplete &&
+            fetchStatus[tid] != Idle)
+            continue;
+
+        unsigned count = fromIEW->iewInfo[tid].ldstqCount;
+        if (best_tid == -1 || count < best_count) {
+            best_tid = tid;
+            best_count = count;
+        }
+    }
+
+    return best_tid;
+}
+
+template<class Impl>
+int
+DefaultFetch<Impl>::branchCount()
+{
+    list<unsigned>::iterator first = (*activeThreads).begin();
+    // Placeholder policy: first active thread, or -1 when none is active.
+    return first == (*activeThreads).end() ? -1 : static_cast<int>(*first);
+}
diff --git a/cpu/o3/free_list.cc b/cpu/o3/free_list.cc
index 6f0b4be1e..bd0f4f034 100644
--- a/cpu/o3/free_list.cc
+++ b/cpu/o3/free_list.cc
@@ -30,7 +30,8 @@
 
 #include "cpu/o3/free_list.hh"
 
-SimpleFreeList::SimpleFreeList(unsigned _numLogicalIntRegs,
+SimpleFreeList::SimpleFreeList(unsigned activeThreads,
+                               unsigned _numLogicalIntRegs,
                                unsigned _numPhysicalIntRegs,
                                unsigned _numLogicalFloatRegs,
                                unsigned _numPhysicalFloatRegs)
@@ -40,43 +41,30 @@ SimpleFreeList::SimpleFreeList(unsigned _numLogicalIntRegs,
       numPhysicalFloatRegs(_numPhysicalFloatRegs),
       numPhysicalRegs(numPhysicalIntRegs + numPhysicalFloatRegs)
 {
-    DPRINTF(FreeList, "FreeList: Creating new free list object.\n");
-
-    // DEBUG stuff.
-    freeIntRegsScoreboard.resize(numPhysicalIntRegs);
-
-    freeFloatRegsScoreboard.resize(numPhysicalRegs);
-
-    for (PhysRegIndex i = 0; i < numLogicalIntRegs; ++i) {
-        freeIntRegsScoreboard[i] = 0;
-    }
+    DPRINTF(FreeList, "Creating new free list object.\n");
 
     // Put all of the extra physical registers onto the free list.  This
     // means excluding all of the base logical registers.
-    for (PhysRegIndex i = numLogicalIntRegs;
+    for (PhysRegIndex i = numLogicalIntRegs * activeThreads;
          i < numPhysicalIntRegs; ++i)
     {
         freeIntRegs.push(i);
-
-        freeIntRegsScoreboard[i] = 1;
-    }
-
-    for (PhysRegIndex i = 0; i < numPhysicalIntRegs + numLogicalFloatRegs;
-         ++i)
-    {
-        freeFloatRegsScoreboard[i] = 0;
     }
 
     // Put all of the extra physical registers onto the free list.  This
     // means excluding all of the base logical registers.  Because the
     // float registers' indices start where the physical registers end,
     // some math must be done to determine where the free registers start.
-    for (PhysRegIndex i = numPhysicalIntRegs + numLogicalFloatRegs;
-         i < numPhysicalRegs; ++i)
+    PhysRegIndex i = numPhysicalIntRegs + (numLogicalFloatRegs * activeThreads);
+
+    for ( ; i < numPhysicalRegs; ++i)
     {
         freeFloatRegs.push(i);
-
-        freeFloatRegsScoreboard[i] = 1;
     }
 }
 
+std::string
+SimpleFreeList::name() const
+{
+    return "cpu.freelist";  // Fixed literal; the same for every instance.
+}
diff --git a/cpu/o3/free_list.hh b/cpu/o3/free_list.hh
index 0b85dba1e..29e84cd44 100644
--- a/cpu/o3/free_list.hh
+++ b/cpu/o3/free_list.hh
@@ -26,8 +26,8 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __CPU_O3_CPU_FREE_LIST_HH__
-#define __CPU_O3_CPU_FREE_LIST_HH__
+#ifndef __CPU_O3_FREE_LIST_HH__
+#define __CPU_O3_FREE_LIST_HH__
 
 #include <iostream>
 #include <queue>
@@ -45,10 +45,9 @@
  * other classes, it assumes that the indices for the floating point
  * registers starts after the integer registers end.  Hence the variable
  * numPhysicalIntRegs is logically equivalent to the baseFP dependency.
- * Note that
- * while this most likely should be called FreeList, the name "FreeList"
- * is used in a typedef within the CPU Policy, and therefore no class
- * can be named simply "FreeList".
+ * Note that while this most likely should be called FreeList, the name
+ * "FreeList" is used in a typedef within the CPU Policy, and therefore no
+ * class can be named simply "FreeList".
  * @todo: Give a better name to the base FP dependency.
  */
 class SimpleFreeList
@@ -75,36 +74,51 @@ class SimpleFreeList
     /** Total number of physical registers. */
     int numPhysicalRegs;
 
-    /** DEBUG stuff below. */
-    std::vector<int> freeIntRegsScoreboard;
-
-    std::vector<bool> freeFloatRegsScoreboard;
-
   public:
-    SimpleFreeList(unsigned _numLogicalIntRegs,
+    /** Constructs a free list.
+     *  @param activeThreads Number of active threads.
+     *  @param _numLogicalIntRegs Number of logical integer registers.
+     *  @param _numPhysicalIntRegs Number of physical integer registers.
+     *  @param _numLogicalFloatRegs Number of logical fp registers.
+     *  @param _numPhysicalFloatRegs Number of physical fp registers.
+     */
+    SimpleFreeList(unsigned activeThreads,
+                   unsigned _numLogicalIntRegs,
                    unsigned _numPhysicalIntRegs,
                    unsigned _numLogicalFloatRegs,
                    unsigned _numPhysicalFloatRegs);
 
+    /** Gives the name of the freelist. */
+    std::string name() const;
+
+    /** Gets a free integer register. */
     inline PhysRegIndex getIntReg();
 
+    /** Gets a free fp register. */
     inline PhysRegIndex getFloatReg();
 
+    /** Adds a register back to the free list. */
     inline void addReg(PhysRegIndex freed_reg);
 
+    /** Adds an integer register back to the free list. */
     inline void addIntReg(PhysRegIndex freed_reg);
 
+    /** Adds a fp register back to the free list. */
     inline void addFloatReg(PhysRegIndex freed_reg);
 
+    /** Checks if there are any free integer registers. */
     bool hasFreeIntRegs()
     { return !freeIntRegs.empty(); }
 
+    /** Checks if there are any free fp registers. */
     bool hasFreeFloatRegs()
     { return !freeFloatRegs.empty(); }
 
+    /** Returns the number of free integer registers. */
     int numFreeIntRegs()
     { return freeIntRegs.size(); }
 
+    /** Returns the number of free fp registers. */
     int numFreeFloatRegs()
     { return freeFloatRegs.size(); }
 };
@@ -112,7 +126,8 @@ class SimpleFreeList
 inline PhysRegIndex
 SimpleFreeList::getIntReg()
 {
-    DPRINTF(Rename, "FreeList: Trying to get free integer register.\n");
+    DPRINTF(FreeList, "Trying to get free integer register.\n");
+
     if (freeIntRegs.empty()) {
         panic("No free integer registers!");
     }
@@ -121,17 +136,14 @@ SimpleFreeList::getIntReg()
 
     freeIntRegs.pop();
 
-    // DEBUG
-    assert(freeIntRegsScoreboard[free_reg]);
-    freeIntRegsScoreboard[free_reg] = 0;
-
     return(free_reg);
 }
 
 inline PhysRegIndex
 SimpleFreeList::getFloatReg()
 {
-    DPRINTF(Rename, "FreeList: Trying to get free float register.\n");
+    DPRINTF(FreeList, "Trying to get free float register.\n");
+
     if (freeFloatRegs.empty()) {
         panic("No free integer registers!");
     }
@@ -140,42 +152,28 @@ SimpleFreeList::getFloatReg()
 
     freeFloatRegs.pop();
 
-    // DEBUG
-    assert(freeFloatRegsScoreboard[free_reg]);
-    freeFloatRegsScoreboard[free_reg] = 0;
-
     return(free_reg);
 }
 
 inline void
 SimpleFreeList::addReg(PhysRegIndex freed_reg)
 {
-    DPRINTF(Rename, "Freelist: Freeing register %i.\n", freed_reg);
+    DPRINTF(FreeList,"Freeing register %i.\n", freed_reg);
     //Might want to add in a check for whether or not this register is
     //already in there.  A bit vector or something similar would be useful.
     if (freed_reg < numPhysicalIntRegs) {
-        freeIntRegs.push(freed_reg);
-
-        // DEBUG
-        assert(freeIntRegsScoreboard[freed_reg] == false);
-        freeIntRegsScoreboard[freed_reg] = 1;
+        if (freed_reg != TheISA::ZeroReg)
+            freeIntRegs.push(freed_reg);
     } else if (freed_reg < numPhysicalRegs) {
-        freeFloatRegs.push(freed_reg);
-
-        // DEBUG
-        assert(freeFloatRegsScoreboard[freed_reg] == false);
-        freeFloatRegsScoreboard[freed_reg] = 1;
+        if (freed_reg != (TheISA::ZeroReg + numPhysicalIntRegs))
+            freeFloatRegs.push(freed_reg);
     }
 }
 
 inline void
 SimpleFreeList::addIntReg(PhysRegIndex freed_reg)
 {
-    DPRINTF(Rename, "Freelist: Freeing int register %i.\n", freed_reg);
-
-    // DEBUG
-    assert(!freeIntRegsScoreboard[freed_reg]);
-    freeIntRegsScoreboard[freed_reg] = 1;
+    DPRINTF(FreeList,"Freeing int register %i.\n", freed_reg);
 
     freeIntRegs.push(freed_reg);
 }
@@ -183,13 +181,9 @@ SimpleFreeList::addIntReg(PhysRegIndex freed_reg)
 inline void
 SimpleFreeList::addFloatReg(PhysRegIndex freed_reg)
 {
-    DPRINTF(Rename, "Freelist: Freeing float register %i.\n", freed_reg);
-
-    // DEBUG
-    assert(!freeFloatRegsScoreboard[freed_reg]);
-    freeFloatRegsScoreboard[freed_reg] = 1;
+    DPRINTF(FreeList,"Freeing float register %i.\n", freed_reg);
 
     freeFloatRegs.push(freed_reg);
 }
 
-#endif // __CPU_O3_CPU_FREE_LIST_HH__
+#endif // __CPU_O3_FREE_LIST_HH__
diff --git a/cpu/o3/fu_pool.cc b/cpu/o3/fu_pool.cc
new file mode 100644
index 000000000..9b6ac15d9
--- /dev/null
+++ b/cpu/o3/fu_pool.cc
@@ -0,0 +1,281 @@
+/*
+ * Copyright (c) 2002-2005 The Regents of The University of Michigan
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sstream>
+
+#include "cpu/o3/fu_pool.hh"
+#include "encumbered/cpu/full/fu_pool.hh"
+#include "sim/builder.hh"
+
+using namespace std;
+
+////////////////////////////////////////////////////////////////////////////
+//
+//  A pool of function units
+//
+
+inline void
+FUPool::FUIdxQueue::addFU(int fu_idx)
+{
+    funcUnitsIdx.push_back(fu_idx);
+    ++size;
+}
+
+inline int
+FUPool::FUIdxQueue::getFU()
+{
+    int retval = funcUnitsIdx[idx++];
+
+    if (idx == size)
+        idx = 0;
+
+    return retval;
+}
+
+FUPool::~FUPool()
+{
+    fuListIterator i = funcUnits.begin();
+    fuListIterator end = funcUnits.end();
+    for (; i != end; ++i)
+        delete *i;
+}
+
+
+// Constructor
+FUPool::FUPool(string name, vector<FUDesc *> paramList)
+    : SimObject(name)
+{
+    numFU = 0;
+
+    funcUnits.clear();
+
+    for (int i = 0; i < Num_OpClasses; ++i) {
+        maxOpLatencies[i] = 0;
+        maxIssueLatencies[i] = 0;
+    }
+
+    //
+    //  Iterate through the list of FUDescData structures
+    //
+    for (FUDDiterator i = paramList.begin(); i != paramList.end(); ++i) {
+
+        //
+        //  Don't bother with this if we're not going to create any FU's
+        //
+        if ((*i)->number) {
+            //
+            //  Create the FuncUnit object from this structure
+            //   - add the capabilities listed in the FU's operation
+            //     description
+            //
+            //  We create the first unit, then duplicate it as needed
+            //
+            FuncUnit *fu = new FuncUnit;
+
+            OPDDiterator j = (*i)->opDescList.begin();
+            OPDDiterator end = (*i)->opDescList.end();
+            for (; j != end; ++j) {
+                // indicate that this pool has this capability
+                capabilityList.set((*j)->opClass);
+
+                // Add each of the FU's that will have this capability to the
+                // appropriate queue.
+                for (int k = 0; k < (*i)->number; ++k)
+                    fuPerCapList[(*j)->opClass].addFU(numFU + k);
+
+                // indicate that this FU has the capability
+                fu->addCapability((*j)->opClass, (*j)->opLat, (*j)->issueLat);
+
+                if ((*j)->opLat > maxOpLatencies[(*j)->opClass])
+                    maxOpLatencies[(*j)->opClass] = (*j)->opLat;
+
+                if ((*j)->issueLat > maxIssueLatencies[(*j)->opClass])
+                    maxIssueLatencies[(*j)->opClass] = (*j)->issueLat;
+            }
+
+            numFU++;
+
+            //  Add the appropriate number of copies of this FU to the list
+            ostringstream s;
+
+            s << (*i)->name() << "(0)";
+            fu->name = s.str();
+            funcUnits.push_back(fu);
+
+            for (int c = 1; c < (*i)->number; ++c) {
+                ostringstream s;
+                numFU++;
+                FuncUnit *fu2 = new FuncUnit(*fu);
+
+                s << (*i)->name() << "(" << c << ")";
+                fu2->name = s.str();
+                funcUnits.push_back(fu2);
+            }
+        }
+    }
+
+    unitBusy.resize(numFU);
+
+    for (int i = 0; i < numFU; i++) {
+        unitBusy[i] = false;
+    }
+}
+
+void
+FUPool::annotateMemoryUnits(unsigned hit_latency)
+{
+    maxOpLatencies[MemReadOp] = hit_latency;
+
+    fuListIterator i = funcUnits.begin();
+    fuListIterator iend = funcUnits.end();
+    for (; i != iend; ++i) {
+        if ((*i)->provides(MemReadOp))
+            (*i)->opLatency(MemReadOp) = hit_latency;
+
+        if ((*i)->provides(MemWriteOp))
+            (*i)->opLatency(MemWriteOp) = hit_latency;
+    }
+}
+
+int
+FUPool::getUnit(OpClass capability)
+{
+    //  No unit in this pool provides the requested capability: return -2
+    //  so the caller can tell this apart from "every unit is busy" (-1).
+    if (!capabilityList[capability])
+        return -2;
+
+    int fu_idx = fuPerCapList[capability].getFU();
+    int start_idx = fu_idx;
+
+    // Walk this capability's circular queue while the current unit is busy;
+    // give up once the queue hands back the index the search started from.
+    while (unitBusy[fu_idx]) {
+        fu_idx = fuPerCapList[capability].getFU();
+        if (fu_idx == start_idx) {
+            // No FU available
+            return -1;
+        }
+    }
+
+    unitBusy[fu_idx] = true;
+
+    return fu_idx;
+}
+
+void
+FUPool::freeUnit(int fu_idx)
+{
+    assert(unitBusy[fu_idx]);
+    unitsToBeFreed.push_back(fu_idx);
+}
+
+void
+FUPool::processFreeUnits()
+{
+    while (!unitsToBeFreed.empty()) {
+        int fu_idx = unitsToBeFreed.back();
+        unitsToBeFreed.pop_back();
+
+        assert(unitBusy[fu_idx]);
+
+        unitBusy[fu_idx] = false;
+    }
+}
+
+void
+FUPool::dump()
+{
+    cout << "Function Unit Pool (" << name() << ")\n";
+    cout << "======================================\n";
+    cout << "Free List:\n";
+
+    for (int i = 0; i < numFU; ++i) {
+        if (unitBusy[i]) {
+            continue;
+        }
+
+        cout << "  [" << i << "] : ";
+
+        cout << funcUnits[i]->name << " ";
+
+        cout << "\n";
+    }
+
+    cout << "======================================\n";
+    cout << "Busy List:\n";
+    for (int i = 0; i < numFU; ++i) {
+        if (!unitBusy[i]) {
+            continue;
+        }
+
+        cout << "  [" << i << "] : ";
+
+        cout << funcUnits[i]->name << " ";
+
+        cout << "\n";
+    }
+}
+
+//
+
+////////////////////////////////////////////////////////////////////////////
+//
+//  The SimObjects we use to get the FU information into the simulator
+//
+////////////////////////////////////////////////////////////////////////////
+
+//
+//    FUPool - Contains a list of FUDesc objects to make available
+//
+
+//
+//  The FuPool object
+//
+
+BEGIN_DECLARE_SIM_OBJECT_PARAMS(FUPool)
+
+    SimObjectVectorParam<FUDesc *> FUList;
+
+END_DECLARE_SIM_OBJECT_PARAMS(FUPool)
+
+
+BEGIN_INIT_SIM_OBJECT_PARAMS(FUPool)
+
+    INIT_PARAM(FUList, "list of FU's for this pool")
+
+END_INIT_SIM_OBJECT_PARAMS(FUPool)
+
+
+CREATE_SIM_OBJECT(FUPool)
+{
+    return new FUPool(getInstanceName(), FUList);
+}
+
+REGISTER_SIM_OBJECT("FUPool", FUPool)
+
diff --git a/cpu/o3/fu_pool.hh b/cpu/o3/fu_pool.hh
new file mode 100644
index 000000000..d7b7acadb
--- /dev/null
+++ b/cpu/o3/fu_pool.hh
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2002-2005 The Regents of The University of Michigan
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __CPU_O3_FU_POOL_HH__
+#define __CPU_O3_FU_POOL_HH__
+
+#include <bitset>
+#include <list>
+#include <string>
+#include <vector>
+
+#include "base/sched_list.hh"
+#include "encumbered/cpu/full/op_class.hh"
+#include "sim/sim_object.hh"
+
+class FUDesc;
+class FuncUnit;
+
+/**
+ * Pool of FU's, specific to the new CPU model. The old FU pool had lists of
+ * free units and busy units, and whenever a FU was needed it would iterate
+ * through the free units to find a FU that provided the capability. This pool
+ * has lists of units specific to each of the capabilities, and whenever a FU
+ * is needed, it iterates through that list to find a free unit. The previous
+ * FU pool would have to be ticked each cycle to update which units became
+ * free. This FU pool lets the IEW stage handle freeing units, which frees
+ * them as their scheduled execution events complete. This limits units in this
+ * model to either have identical issue and op latencies, or 1 cycle issue
+ * latencies.
+ */
+class FUPool : public SimObject
+{
+  private:
+    /** Maximum op execution latencies, per op class. */
+    unsigned maxOpLatencies[Num_OpClasses];
+    /** Maximum issue latencies, per op class. */
+    unsigned maxIssueLatencies[Num_OpClasses];
+
+    /** Bitvector listing capabilities of this FU pool. */
+    std::bitset<Num_OpClasses> capabilityList;
+
+    /** Bitvector listing which FUs are busy. */
+    std::vector<bool> unitBusy;
+
+    /** List of units to be freed at the end of this cycle. */
+    std::vector<int> unitsToBeFreed;
+
+    /**
+     * Class that implements a circular queue to hold FU indices. The hope is
+     * that FUs that have been just used will be moved to the end of the queue
+     * by iterating through it, thus leaving free units at the head of the
+     * queue.
+     */
+    class FUIdxQueue {
+      public:
+        /** Constructs a circular queue of FU indices. */
+        FUIdxQueue()
+            : idx(0), size(0)
+        { }
+
+        /** Adds a FU to the queue. */
+        inline void addFU(int fu_idx);
+
+        /** Returns the index of the FU at the head of the queue, and changes
+         *  the index to the next element.
+         */
+        inline int getFU();
+
+      private:
+        /** Circular queue index. */
+        int idx;
+
+        /** Size of the queue. */
+        int size;
+
+        /** Queue of FU indices. */
+        std::vector<int> funcUnitsIdx;
+    };
+
+    /** Per op class queues of FUs that provide that capability. */
+    FUIdxQueue fuPerCapList[Num_OpClasses];
+
+    /** Number of FUs. */
+    int numFU;
+
+    /** Functional units. */
+    std::vector<FuncUnit *> funcUnits;
+
+    typedef std::vector<FuncUnit *>::iterator fuListIterator;
+
+  public:
+
+    /** Constructs a FU pool. */
+    FUPool(std::string name, std::vector<FUDesc *> l);
+    ~FUPool();
+
+    /** Annotates units that provide memory operations. Included only because
+     *  old FU pool provided this function.
+     */
+    void annotateMemoryUnits(unsigned hit_latency);
+
+    /**
+     * Gets a FU providing the requested capability. Will mark the unit as busy,
+     * but leaves the freeing of the unit up to the IEW stage.
+     * @param capability The capability requested.
+     * @return Returns -2 if the FU pool does not have the capability, -1 if
+     * there is no free FU, and the FU's index otherwise.
+     */
+    int getUnit(OpClass capability);
+
+    /** Frees a FU at the end of this cycle. */
+    void freeUnit(int fu_idx);
+
+    /** Frees all FUs on the list. */
+    void processFreeUnits();
+
+    /** Returns the total number of FUs. */
+    int size() { return numFU; }
+
+    /** Debugging function used to dump FU information. */
+    void dump();
+
+    /** Returns the operation execution latency of the given capability. */
+    unsigned getOpLatency(OpClass capability) {
+        return maxOpLatencies[capability];
+    }
+
+    /** Returns the issue latency of the given capability. */
+    unsigned getIssueLatency(OpClass capability) {
+        return maxIssueLatencies[capability];
+    }
+};
+
+#endif // __CPU_O3_FU_POOL_HH__
diff --git a/cpu/o3/iew.cc b/cpu/o3/iew.cc
index 45b5610e7..90d035f71 100644
--- a/cpu/o3/iew.cc
+++ b/cpu/o3/iew.cc
@@ -31,4 +31,4 @@
 #include "cpu/o3/iew_impl.hh"
 #include "cpu/o3/inst_queue.hh"
 
-template class SimpleIEW<AlphaSimpleImpl>;
+template class DefaultIEW<AlphaSimpleImpl>;
diff --git a/cpu/o3/iew.hh b/cpu/o3/iew.hh
index 1e370d4e6..e55837812 100644
--- a/cpu/o3/iew.hh
+++ b/cpu/o3/iew.hh
@@ -26,22 +26,38 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-//Todo: Update with statuses.
-//Need to handle delaying writes to the writeback bus if it's full at the
-//given time.
-
-#ifndef __CPU_O3_CPU_SIMPLE_IEW_HH__
-#define __CPU_O3_CPU_SIMPLE_IEW_HH__
+#ifndef __CPU_O3_IEW_HH__
+#define __CPU_O3_IEW_HH__
 
 #include <queue>
 
-#include "config/full_system.hh"
 #include "base/statistics.hh"
 #include "base/timebuf.hh"
+#include "config/full_system.hh"
 #include "cpu/o3/comm.hh"
-
+#include "cpu/o3/scoreboard.hh"
+#include "cpu/o3/lsq.hh"
+
+class FUPool;
+
+/**
+ * DefaultIEW handles both single threaded and SMT IEW(issue/execute/writeback).
+ * It handles the dispatching of instructions to the LSQ/IQ as part of the issue
+ * stage, and has the IQ try to issue instructions each cycle. The execute
+ * latency is actually tied into the issue latency to allow the IQ to be able to
+ * do back-to-back scheduling without having to speculatively schedule
+ * instructions. This happens by having the IQ have access to the functional
+ * units, and the IQ gets the execution latencies from the FUs when it issues
+ * instructions. Instructions reach the execute stage on the last cycle of
+ * their execution, which is when the IQ knows to wake up any dependent
+ * instructions, allowing back to back scheduling. The execute portion of IEW
+ * separates memory instructions from non-memory instructions, either telling
+ * the LSQ to execute the instruction, or executing the instruction directly.
+ * The writeback portion of IEW completes the instructions by waking up any
+ * dependents, and marking the register ready on the scoreboard.
+ */
 template<class Impl>
-class SimpleIEW
+class DefaultIEW
 {
   private:
     //Typedefs from Impl
@@ -52,7 +68,7 @@ class SimpleIEW
 
     typedef typename CPUPol::IQ IQ;
     typedef typename CPUPol::RenameMap RenameMap;
-    typedef typename CPUPol::LDSTQ LDSTQ;
+    typedef typename CPUPol::LSQ LSQ;
 
     typedef typename CPUPol::TimeStruct TimeStruct;
     typedef typename CPUPol::IEWStruct IEWStruct;
@@ -60,77 +76,214 @@ class SimpleIEW
     typedef typename CPUPol::IssueStruct IssueStruct;
 
     friend class Impl::FullCPU;
+    friend class CPUPol::IQ;
+
   public:
+    /** Overall IEW stage status. Used to determine if the CPU can
+     * deschedule itself due to a lack of activity.
+     */
     enum Status {
+        Active,
+        Inactive
+    };
+
+    /** Status for Issue, Execute, and Writeback stages. */
+    enum StageStatus {
         Running,
         Blocked,
         Idle,
+        StartSquash,
         Squashing,
         Unblocking
     };
 
   private:
+    /** Overall stage status. */
     Status _status;
-    Status _issueStatus;
-    Status _exeStatus;
-    Status _wbStatus;
+    /** Dispatch status. */
+    StageStatus dispatchStatus[Impl::MaxThreads];
+    /** Execute status. */
+    StageStatus exeStatus;
+    /** Writeback status. */
+    StageStatus wbStatus;
 
   public:
-    class WritebackEvent : public Event {
+    /** LdWriteback event for a load completion. */
+    class LdWritebackEvent : public Event {
       private:
+        /** Instruction that is writing back data to the register file. */
         DynInstPtr inst;
-        SimpleIEW<Impl> *iewStage;
+        /** Pointer to IEW stage. */
+        DefaultIEW<Impl> *iewStage;
 
       public:
-        WritebackEvent(DynInstPtr &_inst, SimpleIEW<Impl> *_iew);
+        /** Constructs a load writeback event. */
+        LdWritebackEvent(DynInstPtr &_inst, DefaultIEW<Impl> *_iew);
 
+        /** Processes writeback event. */
         virtual void process();
+        /** Returns the description of the writeback event. */
         virtual const char *description();
     };
 
   public:
-    SimpleIEW(Params &params);
+    /** Constructs a DefaultIEW with the given parameters. */
+    DefaultIEW(Params *params);
 
+    /** Returns the name of the DefaultIEW stage. */
+    std::string name() const;
+
+    /** Registers statistics. */
     void regStats();
 
+    /** Initializes stage; sends back the number of free IQ and LSQ entries. */
+    void initStage();
+
+    /** Sets CPU pointer for IEW, IQ, and LSQ. */
     void setCPU(FullCPU *cpu_ptr);
 
+    /** Sets main time buffer used for backwards communication. */
     void setTimeBuffer(TimeBuffer<TimeStruct> *tb_ptr);
 
+    /** Sets time buffer for getting instructions coming from rename. */
     void setRenameQueue(TimeBuffer<RenameStruct> *rq_ptr);
 
+    /** Sets time buffer to pass on instructions to commit. */
     void setIEWQueue(TimeBuffer<IEWStruct> *iq_ptr);
 
-    void setRenameMap(RenameMap *rm_ptr);
+    /** Sets pointer to list of active threads. */
+    void setActiveThreads(std::list<unsigned> *at_ptr);
 
-    void squash();
+    /** Sets pointer to the scoreboard. */
+    void setScoreboard(Scoreboard *sb_ptr);
 
-    void squashDueToBranch(DynInstPtr &inst);
+    /** Sets page table pointer within LSQ. */
+//    void setPageTable(PageTable *pt_ptr);
 
-    void squashDueToMem(DynInstPtr &inst);
+    /** Squashes instructions in IEW for a specific thread. */
+    void squash(unsigned tid);
 
-    void block();
+    /** Wakes all dependents of a completed instruction. */
+    void wakeDependents(DynInstPtr &inst);
 
-    inline void unblock();
+    /** Tells memory dependence unit that a memory instruction needs to be
+     * rescheduled. It will re-execute once replayMemInst() is called.
+     */
+    void rescheduleMemInst(DynInstPtr &inst);
 
-    void wakeDependents(DynInstPtr &inst);
+    /** Re-executes all rescheduled memory instructions. */
+    void replayMemInst(DynInstPtr &inst);
 
+    /** Sends an instruction to commit through the time buffer. */
     void instToCommit(DynInstPtr &inst);
 
+    /** Inserts unused instructions of a thread into the skid buffer. */
+    void skidInsert(unsigned tid);
+
+    /** Returns the max of the number of entries in all of the skid buffers. */
+    int skidCount();
+
+    /** Returns if all of the skid buffers are empty. */
+    bool skidsEmpty();
+
+    /** Updates overall IEW status based on all of the stages' statuses. */
+    void updateStatus();
+
+    /** Resets entries of the IQ and the LSQ. */
+    void resetEntries();
+
+    /** Tells the CPU to wakeup if it has descheduled itself due to no
+     * activity. Used mainly by the LdWritebackEvent.
+     */
+    void wakeCPU();
+
+    /** Reports to the CPU that there is activity this cycle. */
+    void activityThisCycle();
+
+    /** Tells CPU that the IEW stage is active and running. */
+    inline void activateStage();
+
+    /** Tells CPU that the IEW stage is inactive and idle. */
+    inline void deactivateStage();
+
+//#if !FULL_SYSTEM
+    /** Returns if the LSQ has any stores to writeback. */
+    bool hasStoresToWB() { return ldstQueue.hasStoresToWB(); }
+//#endif
+
   private:
-    void dispatchInsts();
+    /** Sends commit proper information for a squash due to a branch
+     * mispredict.
+     */
+    void squashDueToBranch(DynInstPtr &inst, unsigned thread_id);
+
+    /** Sends commit proper information for a squash due to a memory order
+     * violation.
+     */
+    void squashDueToMemOrder(DynInstPtr &inst, unsigned thread_id);
+
+    /** Sends commit proper information for a squash due to memory becoming
+     * blocked (younger issued instructions must be retried).
+     */
+    void squashDueToMemBlocked(DynInstPtr &inst, unsigned thread_id);
 
+    /** Sets Dispatch to blocked, and signals back to other stages to block. */
+    void block(unsigned thread_id);
+
+    /** Unblocks Dispatch if the skid buffer is empty, and signals back to
+     * other stages to unblock.
+     */
+    void unblock(unsigned thread_id);
+
+    /** Determines proper actions to take given Dispatch's status. */
+    void dispatch(unsigned tid);
+
+    /** Dispatches instructions to IQ and LSQ. */
+    void dispatchInsts(unsigned tid);
+
+    /** Executes instructions. In the case of memory operations, it informs the
+     * LSQ to execute the instructions. Also handles any redirects that occur
+     * due to the executed instructions.
+     */
     void executeInsts();
 
+    /** Writes back instructions. In our model, the instruction's execute()
+     * function atomically reads registers, executes, and writes registers.
+     * Thus this writeback only wakes up dependent instructions, and informs
+     * the scoreboard of registers becoming ready.
+     */
+    void writebackInsts();
+
+    /** Returns the number of valid, non-squashed instructions coming from
+     * rename to dispatch.
+     */
+    unsigned validInstsFromRename();
+
+    /** Reads the stall signals. */
+    void readStallSignals(unsigned tid);
+
+    /** Checks if any of the stall conditions are currently true. */
+    bool checkStall(unsigned tid);
+
+    /** Processes inputs and changes state accordingly. */
+    void checkSignalsAndUpdate(unsigned tid);
+
+    /** Sorts instructions coming from rename into lists separated by thread. */
+    void sortInsts();
+
   public:
+    /** Ticks IEW stage, causing Dispatch, the IQ, the LSQ, Execute, and
+     * Writeback to run for one cycle.
+     */
     void tick();
 
-    void iew();
-
-    //Interfaces to objects inside and outside of IEW.
-    /** Time buffer interface. */
+  private:
+    /** Pointer to main time buffer used for backwards communication. */
     TimeBuffer<TimeStruct> *timeBuffer;
 
+    /** Wire to write information heading to previous stages. */
+    typename TimeBuffer<TimeStruct>::wire toFetch;
+
     /** Wire to get commit's output from backwards time buffer. */
     typename TimeBuffer<TimeStruct>::wire fromCommit;
 
@@ -158,32 +311,67 @@ class SimpleIEW
     /** Wire to write infromation heading to commit. */
     typename TimeBuffer<IEWStruct>::wire toCommit;
 
-    //Will need internal queue to hold onto instructions coming from
-    //the rename stage in case of a stall.
+    /** Queue of all instructions coming from rename this cycle. */
+    std::queue<DynInstPtr> insts[Impl::MaxThreads];
+
     /** Skid buffer between rename and IEW. */
-    std::queue<RenameStruct> skidBuffer;
+    std::queue<DynInstPtr> skidBuffer[Impl::MaxThreads];
 
-  protected:
+    /** Scoreboard pointer. */
+    Scoreboard* scoreboard;
+
+  public:
     /** Instruction queue. */
     IQ instQueue;
 
-    LDSTQ ldstQueue;
+    /** Load / store queue. */
+    LSQ ldstQueue;
 
-#if !FULL_SYSTEM
-  public:
-    void lsqWriteback();
-#endif
+    /** Pointer to the functional unit pool. */
+    FUPool *fuPool;
 
   private:
-    /** Pointer to rename map.  Might not want this stage to directly
-     *  access this though...
+    /** CPU pointer. */
+    FullCPU *cpu;
+
+    /** Records if IEW has written to the time buffer this cycle, so that the
+     * CPU can deschedule itself if there is no activity.
      */
-    RenameMap *renameMap;
+    bool wroteToTimeBuffer;
 
-    /** CPU interface. */
-    FullCPU *cpu;
+    /** Source of possible stalls. */
+    struct Stalls {
+        bool commit;
+    };
+
+    /** Stages that are telling IEW to stall. */
+    Stalls stalls[Impl::MaxThreads];
+
+    /** Debug function to print instructions that are issued this cycle. */
+    void printAvailableInsts();
+
+  public:
+    /** Records if the LSQ needs to be updated on the next cycle, so that
+     * IEW knows if there will be activity on the next cycle.
+     */
+    bool updateLSQNextCycle;
 
   private:
+    /** Records if there is a fetch redirect on this cycle for each thread. */
+    bool fetchRedirect[Impl::MaxThreads];
+
+    /** Used to track if all instructions have been dispatched this cycle.
+     * If they have not, then blocking must have occurred, and the instructions
+     * would already be added to the skid buffer.
+     * @todo: Fix this hack.
+     */
+    bool dispatchedAllInsts;
+
+    /** Records if the queues have been changed (inserted or issued insts),
+     * so that IEW knows to broadcast the updated amount of free entries.
+     */
+    bool updatedQueues;
+
     /** Commit to IEW delay, in ticks. */
     unsigned commitToIEWDelay;
 
@@ -211,29 +399,63 @@ class SimpleIEW
      */
     unsigned executeWidth;
 
-    /** Number of cycles stage has been squashing.  Used so that the stage
-     *  knows when it can start unblocking, which is when the previous stage
-     *  has received the stall signal and clears up its outputs.
+    /** Index into queue of instructions being written back. */
+    unsigned wbNumInst;
+
+    /** Cycle number within the queue of instructions being written back.
+     * Used in case there are too many instructions writing back at the current
+     * cycle and writebacks need to be scheduled for the future. See comments
+     * in instToCommit().
      */
-    unsigned cyclesSquashing;
+    unsigned wbCycle;
+
+    /** Number of active threads. */
+    unsigned numThreads;
+
+    /** Pointer to list of active threads. */
+    std::list<unsigned> *activeThreads;
+
+    /** Maximum size of the skid buffer. */
+    unsigned skidBufferMax;
 
+    /** Stat for total number of idle cycles. */
     Stats::Scalar<> iewIdleCycles;
+    /** Stat for total number of squashing cycles. */
     Stats::Scalar<> iewSquashCycles;
+    /** Stat for total number of blocking cycles. */
     Stats::Scalar<> iewBlockCycles;
+    /** Stat for total number of unblocking cycles. */
     Stats::Scalar<> iewUnblockCycles;
-//    Stats::Scalar<> iewWBInsts;
+    /** Stat for total number of instructions dispatched. */
     Stats::Scalar<> iewDispatchedInsts;
+    /** Stat for total number of squashed instructions skipped at dispatch. */
     Stats::Scalar<> iewDispSquashedInsts;
+    /** Stat for total number of dispatched load instructions. */
     Stats::Scalar<> iewDispLoadInsts;
+    /** Stat for total number of dispatched store instructions. */
     Stats::Scalar<> iewDispStoreInsts;
+    /** Stat for total number of dispatched non speculative instructions. */
     Stats::Scalar<> iewDispNonSpecInsts;
+    /** Stat for number of times the IQ becomes full. */
     Stats::Scalar<> iewIQFullEvents;
+    /** Stat for number of times the LSQ becomes full. */
+    Stats::Scalar<> iewLSQFullEvents;
+    /** Stat for total number of executed instructions. */
     Stats::Scalar<> iewExecutedInsts;
+    /** Stat for total number of executed load instructions. */
     Stats::Scalar<> iewExecLoadInsts;
+    /** Stat for total number of executed store instructions. */
     Stats::Scalar<> iewExecStoreInsts;
+    /** Stat for total number of squashed instructions skipped at execute. */
     Stats::Scalar<> iewExecSquashedInsts;
+    /** Stat for total number of memory ordering violation events. */
     Stats::Scalar<> memOrderViolationEvents;
+    /** Stat for total number of incorrectly predicted taken branches. */
     Stats::Scalar<> predictedTakenIncorrect;
+    /** Stat for total number of incorrectly predicted not taken branches. */
+    Stats::Scalar<> predictedNotTakenIncorrect;
+    /** Stat for total number of mispredicted branches detected at execute. */
+    Stats::Formula branchMispredicts;
 };
 
-#endif // __CPU_O3_CPU_IEW_HH__
+#endif // __CPU_O3_IEW_HH__
diff --git a/cpu/o3/iew_impl.hh b/cpu/o3/iew_impl.hh
index 85217dd10..21eb7dcf8 100644
--- a/cpu/o3/iew_impl.hh
+++ b/cpu/o3/iew_impl.hh
@@ -29,59 +29,84 @@
 // @todo: Fix the instantaneous communication among all the stages within
 // iew.  There's a clear delay between issue and execute, yet backwards
 // communication happens simultaneously.
-// Update the statuses for each stage.
 
 #include <queue>
 
 #include "base/timebuf.hh"
+#include "cpu/o3/fu_pool.hh"
 #include "cpu/o3/iew.hh"
 
+using namespace std;
+
 template<class Impl>
-SimpleIEW<Impl>::WritebackEvent::WritebackEvent(DynInstPtr &_inst,
-                                                SimpleIEW<Impl> *_iew)
-    : Event(&mainEventQueue, CPU_Tick_Pri), inst(_inst), iewStage(_iew)
+DefaultIEW<Impl>::LdWritebackEvent::LdWritebackEvent(DynInstPtr &_inst,
+                                                     DefaultIEW<Impl> *_iew)
+    : Event(&mainEventQueue), inst(_inst), iewStage(_iew)
 {
     this->setFlags(Event::AutoDelete);
 }
 
 template<class Impl>
 void
-SimpleIEW<Impl>::WritebackEvent::process()
+DefaultIEW<Impl>::LdWritebackEvent::process()
 {
-    DPRINTF(IEW, "IEW: WRITEBACK EVENT!!!!\n");
+    DPRINTF(IEW, "Load writeback event [sn:%lli]\n", inst->seqNum);
+    DPRINTF(Activity, "Activity: Ld Writeback event [sn:%lli]\n", inst->seqNum);
+
+    //iewStage->ldstQueue.removeMSHR(inst->threadNumber,inst->seqNum);
+
+    iewStage->wakeCPU();
+
+    if (inst->isSquashed()) {
+        inst = NULL;
+        return;
+    }
+
+    if (!inst->isExecuted()) {
+        inst->setExecuted();
+
+        // Execute again to copy data to proper place.
+        if (inst->isStore()) {
+            inst->completeAcc();
+        }
+    }
 
     // Need to insert instruction into queue to commit
     iewStage->instToCommit(inst);
-    // Need to execute second half of the instruction, do actual writing to
-    // registers and such
-    inst->execute();
+
+    //wroteToTimeBuffer = true;
+    iewStage->activityThisCycle();
+
+    inst = NULL;
 }
 
 template<class Impl>
 const char *
-SimpleIEW<Impl>::WritebackEvent::description()
+DefaultIEW<Impl>::LdWritebackEvent::description()
 {
-    return "LSQ writeback event";
+    return "Load writeback event";
 }
 
 template<class Impl>
-SimpleIEW<Impl>::SimpleIEW(Params &params)
+DefaultIEW<Impl>::DefaultIEW(Params *params)
     : // Just make this time buffer really big for now
+    // @todo: Make this into a parameter.
       issueToExecQueue(5, 5),
       instQueue(params),
       ldstQueue(params),
-      commitToIEWDelay(params.commitToIEWDelay),
-      renameToIEWDelay(params.renameToIEWDelay),
-      issueToExecuteDelay(params.issueToExecuteDelay),
-      issueReadWidth(params.issueWidth),
-      issueWidth(params.issueWidth),
-      executeWidth(params.executeWidth)
-{
-    DPRINTF(IEW, "IEW: executeIntWidth: %i.\n", params.executeIntWidth);
-    _status = Idle;
-    _issueStatus = Idle;
-    _exeStatus = Idle;
-    _wbStatus = Idle;
+      fuPool(params->fuPool),
+      commitToIEWDelay(params->commitToIEWDelay),
+      renameToIEWDelay(params->renameToIEWDelay),
+      issueToExecuteDelay(params->issueToExecuteDelay),
+      issueReadWidth(params->issueWidth),
+      issueWidth(params->issueWidth),
+      executeWidth(params->executeWidth),
+      numThreads(params->numberOfThreads)
+{
+    DPRINTF(IEW, "executeIntWidth: %i.\n", params->executeIntWidth);
+    _status = Active;
+    exeStatus = Running;
+    wbStatus = Idle;
 
     // Setup wire to read instructions coming from issue.
     fromIssue = issueToExecQueue.getWire(-issueToExecuteDelay);
@@ -89,15 +114,36 @@ SimpleIEW<Impl>::SimpleIEW(Params &params)
     // Instruction queue needs the queue between issue and execute.
     instQueue.setIssueToExecuteQueue(&issueToExecQueue);
 
+    instQueue.setIEW(this);
     ldstQueue.setIEW(this);
+
+    for (int i=0; i < numThreads; i++) {
+        dispatchStatus[i] = Running;
+        stalls[i].commit = false;
+        fetchRedirect[i] = false;
+    }
+
+    updateLSQNextCycle = false;
+
+    // @todo: Make into a parameter
+    skidBufferMax = (3 * (renameToIEWDelay * params->renameWidth)) + issueWidth;
+}
+
+template <class Impl>
+std::string
+DefaultIEW<Impl>::name() const
+{
+    return cpu->name() + ".iew";
 }
 
 template <class Impl>
 void
-SimpleIEW<Impl>::regStats()
+DefaultIEW<Impl>::regStats()
 {
     instQueue.regStats();
 
+    //ldstQueue.regStats();
+
     iewIdleCycles
         .name(name() + ".iewIdleCycles")
         .desc("Number of cycles IEW is idle");
@@ -140,6 +186,10 @@ SimpleIEW<Impl>::regStats()
         .name(name() + ".iewIQFullEvents")
         .desc("Number of times the IQ has become full, causing a stall");
 
+    iewLSQFullEvents
+        .name(name() + ".iewLSQFullEvents")
+        .desc("Number of times the LSQ has become full, causing a stall");
+
     iewExecutedInsts
         .name(name() + ".iewExecutedInsts")
         .desc("Number of executed instructions");
@@ -163,24 +213,51 @@ SimpleIEW<Impl>::regStats()
     predictedTakenIncorrect
         .name(name() + ".predictedTakenIncorrect")
         .desc("Number of branches that were predicted taken incorrectly");
+
+    predictedNotTakenIncorrect
+        .name(name() + ".predictedNotTakenIncorrect")
+        .desc("Number of branches that were predicted not taken incorrectly");
+
+    branchMispredicts
+        .name(name() + ".branchMispredicts")
+        .desc("Number of branch mispredicts detected at execute");
+
+    branchMispredicts = predictedTakenIncorrect + predictedNotTakenIncorrect;
+}
+
+template<class Impl>
+void
+DefaultIEW<Impl>::initStage()
+{
+    for (int tid=0; tid < numThreads; tid++) {
+        toRename->iewInfo[tid].usedIQ = true;
+        toRename->iewInfo[tid].freeIQEntries =
+            instQueue.numFreeEntries(tid);
+
+        toRename->iewInfo[tid].usedLSQ = true;
+        toRename->iewInfo[tid].freeLSQEntries =
+            ldstQueue.numFreeEntries(tid);
+    }
 }
 
 template<class Impl>
 void
-SimpleIEW<Impl>::setCPU(FullCPU *cpu_ptr)
+DefaultIEW<Impl>::setCPU(FullCPU *cpu_ptr)
 {
-    DPRINTF(IEW, "IEW: Setting CPU pointer.\n");
+    DPRINTF(IEW, "Setting CPU pointer.\n");
     cpu = cpu_ptr;
 
     instQueue.setCPU(cpu_ptr);
     ldstQueue.setCPU(cpu_ptr);
+
+    cpu->activateStage(FullCPU::IEWIdx);
 }
 
 template<class Impl>
 void
-SimpleIEW<Impl>::setTimeBuffer(TimeBuffer<TimeStruct> *tb_ptr)
+DefaultIEW<Impl>::setTimeBuffer(TimeBuffer<TimeStruct> *tb_ptr)
 {
-    DPRINTF(IEW, "IEW: Setting time buffer pointer.\n");
+    DPRINTF(IEW, "Setting time buffer pointer.\n");
     timeBuffer = tb_ptr;
 
     // Setup wire to read information from time buffer, from commit.
@@ -189,15 +266,17 @@ SimpleIEW<Impl>::setTimeBuffer(TimeBuffer<TimeStruct> *tb_ptr)
     // Setup wire to write information back to previous stages.
     toRename = timeBuffer->getWire(0);
 
+    toFetch = timeBuffer->getWire(0);
+
     // Instruction queue also needs main time buffer.
     instQueue.setTimeBuffer(tb_ptr);
 }
 
 template<class Impl>
 void
-SimpleIEW<Impl>::setRenameQueue(TimeBuffer<RenameStruct> *rq_ptr)
+DefaultIEW<Impl>::setRenameQueue(TimeBuffer<RenameStruct> *rq_ptr)
 {
-    DPRINTF(IEW, "IEW: Setting rename queue pointer.\n");
+    DPRINTF(IEW, "Setting rename queue pointer.\n");
     renameQueue = rq_ptr;
 
     // Setup wire to read information from rename queue.
@@ -206,9 +285,9 @@ SimpleIEW<Impl>::setRenameQueue(TimeBuffer<RenameStruct> *rq_ptr)
 
 template<class Impl>
 void
-SimpleIEW<Impl>::setIEWQueue(TimeBuffer<IEWStruct> *iq_ptr)
+DefaultIEW<Impl>::setIEWQueue(TimeBuffer<IEWStruct> *iq_ptr)
 {
-    DPRINTF(IEW, "IEW: Setting IEW queue pointer.\n");
+    DPRINTF(IEW, "Setting IEW queue pointer.\n");
     iewQueue = iq_ptr;
 
     // Setup wire to write instructions to commit.
@@ -217,355 +296,900 @@ SimpleIEW<Impl>::setIEWQueue(TimeBuffer<IEWStruct> *iq_ptr)
 
 template<class Impl>
 void
-SimpleIEW<Impl>::setRenameMap(RenameMap *rm_ptr)
+DefaultIEW<Impl>::setActiveThreads(list<unsigned> *at_ptr)
+{
+    DPRINTF(IEW, "Setting active threads list pointer.\n");
+    activeThreads = at_ptr;
+
+    ldstQueue.setActiveThreads(at_ptr);
+    instQueue.setActiveThreads(at_ptr);
+}
+
+template<class Impl>
+void
+DefaultIEW<Impl>::setScoreboard(Scoreboard *sb_ptr)
 {
-    DPRINTF(IEW, "IEW: Setting rename map pointer.\n");
-    renameMap = rm_ptr;
+    DPRINTF(IEW, "Setting scoreboard pointer.\n");
+    scoreboard = sb_ptr;
 }
 
+#if 0
 template<class Impl>
 void
-SimpleIEW<Impl>::squash()
+DefaultIEW<Impl>::setPageTable(PageTable *pt_ptr)
 {
-    DPRINTF(IEW, "IEW: Squashing all instructions.\n");
-    _status = Squashing;
+    ldstQueue.setPageTable(pt_ptr);
+}
+#endif
+
+template<class Impl>
+void
+DefaultIEW<Impl>::squash(unsigned tid)
+{
+    DPRINTF(IEW, "[tid:%i]: Squashing all instructions.\n",
+            tid);
 
     // Tell the IQ to start squashing.
-    instQueue.squash();
+    instQueue.squash(tid);
 
     // Tell the LDSTQ to start squashing.
-    ldstQueue.squash(fromCommit->commitInfo.doneSeqNum);
+    ldstQueue.squash(fromCommit->commitInfo[tid].doneSeqNum,tid);
+
+    updatedQueues = true;
+
+    // Clear the skid buffer in case it has any data in it.
+    while (!skidBuffer[tid].empty()) {
+
+        if (skidBuffer[tid].front()->isLoad() ||
+            skidBuffer[tid].front()->isStore() ) {
+            toRename->iewInfo[tid].dispatchedToLSQ++;
+        }
+
+        toRename->iewInfo[tid].dispatched++;
+
+        skidBuffer[tid].pop();
+    }
+
+    while (!insts[tid].empty()) {
+        if (insts[tid].front()->isLoad() ||
+            insts[tid].front()->isStore() ) {
+            toRename->iewInfo[tid].dispatchedToLSQ++;
+        }
+
+        toRename->iewInfo[tid].dispatched++;
+
+        insts[tid].pop();
+    }
 }
 
 template<class Impl>
 void
-SimpleIEW<Impl>::squashDueToBranch(DynInstPtr &inst)
+DefaultIEW<Impl>::squashDueToBranch(DynInstPtr &inst, unsigned tid)
 {
-    DPRINTF(IEW, "IEW: Squashing from a specific instruction, PC: %#x.\n",
-            inst->PC);
-    // Perhaps leave the squashing up to the ROB stage to tell it when to
-    // squash?
-    _status = Squashing;
+    DPRINTF(IEW, "[tid:%i]: Squashing from a specific instruction, PC: %#x "
+            "[sn:%i].\n", tid, inst->readPC(), inst->seqNum);
 
     // Tell rename to squash through the time buffer.
-    toCommit->squash = true;
-    // Also send PC update information back to prior stages.
-    toCommit->squashedSeqNum = inst->seqNum;
-    toCommit->mispredPC = inst->readPC();
-    toCommit->nextPC = inst->readNextPC();
-    toCommit->branchMispredict = true;
+    toCommit->squash[tid] = true;
+    toCommit->squashedSeqNum[tid] = inst->seqNum;
+    toCommit->mispredPC[tid] = inst->readPC();
+    toCommit->nextPC[tid] = inst->readNextPC();
+    toCommit->branchMispredict[tid] = true;
     // Prediction was incorrect, so send back inverse.
-    toCommit->branchTaken = inst->readNextPC() !=
+    toCommit->branchTaken[tid] = inst->readNextPC() !=
         (inst->readPC() + sizeof(TheISA::MachInst));
+
+    toCommit->includeSquashInst[tid] = false;
+    //toCommit->iewSquashNum[tid] = inst->seqNum;
+
+    wroteToTimeBuffer = true;
 }
 
 template<class Impl>
 void
-SimpleIEW<Impl>::squashDueToMem(DynInstPtr &inst)
+DefaultIEW<Impl>::squashDueToMemOrder(DynInstPtr &inst, unsigned tid)
 {
-    DPRINTF(IEW, "IEW: Squashing from a specific instruction, PC: %#x.\n",
-            inst->PC);
-    // Perhaps leave the squashing up to the ROB stage to tell it when to
-    // squash?
-    _status = Squashing;
+    DPRINTF(IEW, "[tid:%i]: Squashing from a specific instruction, "
+            "PC: %#x [sn:%i].\n", tid, inst->readPC(), inst->seqNum);
 
     // Tell rename to squash through the time buffer.
-    toCommit->squash = true;
-    // Also send PC update information back to prior stages.
-    toCommit->squashedSeqNum = inst->seqNum;
-    toCommit->nextPC = inst->readNextPC();
+    toCommit->squash[tid] = true;
+    toCommit->squashedSeqNum[tid] = inst->seqNum;
+    toCommit->nextPC[tid] = inst->readNextPC();
+
+    toCommit->includeSquashInst[tid] = false;
+    //toCommit->iewSquashNum[tid] = inst->seqNum;
+
+    wroteToTimeBuffer = true;
 }
 
 template<class Impl>
 void
-SimpleIEW<Impl>::block()
+DefaultIEW<Impl>::squashDueToMemBlocked(DynInstPtr &inst, unsigned tid)
 {
-    DPRINTF(IEW, "IEW: Blocking.\n");
-    // Set the status to Blocked.
-    _status = Blocked;
+    DPRINTF(IEW, "[tid:%i]: Memory blocked, squashing load and younger insts, "
+            "PC: %#x [sn:%i].\n", tid, inst->readPC(), inst->seqNum);
+
+    toCommit->squash[tid] = true;
+    toCommit->squashedSeqNum[tid] = inst->seqNum;
+    toCommit->nextPC[tid] = inst->readPC();
+
+    toCommit->includeSquashInst[tid] = true;
+
+    ldstQueue.setLoadBlockedHandled(tid);
+
+    wroteToTimeBuffer = true;
+}
+
+template<class Impl>
+void
+DefaultIEW<Impl>::block(unsigned tid)
+{
+    DPRINTF(IEW, "[tid:%u]: Blocking.\n", tid);
+
+    if (dispatchStatus[tid] != Blocked &&
+        dispatchStatus[tid] != Unblocking) {
+        toRename->iewBlock[tid] = true;
+        wroteToTimeBuffer = true;
+    }
 
     // Add the current inputs to the skid buffer so they can be
     // reprocessed when this stage unblocks.
-    skidBuffer.push(*fromRename);
+    skidInsert(tid);
 
-    // Note that this stage only signals previous stages to stall when
-    // it is the cause of the stall originates at this stage.  Otherwise
-    // the previous stages are expected to check all possible stall signals.
+    // Set the status to Blocked.
+    dispatchStatus[tid] = Blocked;
 }
 
 template<class Impl>
-inline void
-SimpleIEW<Impl>::unblock()
+void
+DefaultIEW<Impl>::unblock(unsigned tid)
 {
-    // Check if there's information in the skid buffer.  If there is, then
-    // set status to unblocking, otherwise set it directly to running.
-    DPRINTF(IEW, "IEW: Reading instructions out of the skid "
-            "buffer.\n");
-    // Remove the now processed instructions from the skid buffer.
-    skidBuffer.pop();
-
-    // If there's still information in the skid buffer, then
-    // continue to tell previous stages to stall.  They will be
-    // able to restart once the skid buffer is empty.
-    if (!skidBuffer.empty()) {
-        toRename->iewInfo.stall = true;
-    } else {
-        DPRINTF(IEW, "IEW: Stage is done unblocking.\n");
-        _status = Running;
+    DPRINTF(IEW, "[tid:%i]: Reading instructions out of the skid "
+            "buffer %u.\n",tid, tid);
+
+    // If the skid buffer is empty, signal back to previous stages to unblock.
+    // Also switch status to running.
+    if (skidBuffer[tid].empty()) {
+        toRename->iewUnblock[tid] = true;
+        wroteToTimeBuffer = true;
+        DPRINTF(IEW, "[tid:%i]: Done unblocking.\n",tid);
+        dispatchStatus[tid] = Running;
     }
 }
 
 template<class Impl>
 void
-SimpleIEW<Impl>::wakeDependents(DynInstPtr &inst)
+DefaultIEW<Impl>::wakeDependents(DynInstPtr &inst)
 {
     instQueue.wakeDependents(inst);
 }
 
+template<class Impl>
+void
+DefaultIEW<Impl>::rescheduleMemInst(DynInstPtr &inst)
+{
+    instQueue.rescheduleMemInst(inst);
+}
 
 template<class Impl>
 void
-SimpleIEW<Impl>::instToCommit(DynInstPtr &inst)
+DefaultIEW<Impl>::replayMemInst(DynInstPtr &inst)
 {
+    instQueue.replayMemInst(inst);
+}
+
+template<class Impl>
+void
+DefaultIEW<Impl>::instToCommit(DynInstPtr &inst)
+{
+    // First check the time slot that this instruction will write
+    // to.  If there are free write ports at the time, then go ahead
+    // and write the instruction to that time.  If there are not,
+    // keep looking back to see where's the first time there's a
+    // free slot.  What happens if you run out of free spaces?
+    // For now naively assume that all instructions take one cycle.
+    // Otherwise would have to look into the time buffer based on the
+    // latency of the instruction.
+    while ((*iewQueue)[wbCycle].insts[wbNumInst]) {
+        ++wbNumInst;
+        if (wbNumInst == issueWidth) {
+            ++wbCycle;
+            wbNumInst = 0;
+        }
+
+        assert(wbCycle < 5);
+    }
 
+    // Add finished instruction to queue to commit.
+    (*iewQueue)[wbCycle].insts[wbNumInst] = inst;
+    (*iewQueue)[wbCycle].size++;
 }
 
 template <class Impl>
+unsigned
+DefaultIEW<Impl>::validInstsFromRename()
+{
+    unsigned inst_count = 0;
+
+    for (int i=0; i<fromRename->size; i++) {
+        if (!fromRename->insts[i]->squashed)
+            inst_count++;
+    }
+
+    return inst_count;
+}
+
+template<class Impl>
 void
-SimpleIEW<Impl>::dispatchInsts()
-{
-    ////////////////////////////////////////
-    // DISPATCH/ISSUE stage
-    ////////////////////////////////////////
-
-    //Put into its own function?
-    //Add instructions to IQ if there are any instructions there
-
-    // Check if there are any instructions coming from rename, and we're.
-    // not squashing.
-    if (fromRename->size > 0) {
-        int insts_to_add = fromRename->size;
-
-        // Loop through the instructions, putting them in the instruction
-        // queue.
-        for (int inst_num = 0; inst_num < insts_to_add; ++inst_num)
-        {
-            DynInstPtr inst = fromRename->insts[inst_num];
-
-            // Make sure there's a valid instruction there.
-            assert(inst);
-
-            DPRINTF(IEW, "IEW: Issue: Adding PC %#x to IQ.\n",
-                    inst->readPC());
-
-            // Be sure to mark these instructions as ready so that the
-            // commit stage can go ahead and execute them, and mark
-            // them as issued so the IQ doesn't reprocess them.
-            if (inst->isSquashed()) {
-                ++iewDispSquashedInsts;
-                continue;
-            } else if (instQueue.isFull()) {
-                DPRINTF(IEW, "IEW: Issue: IQ has become full.\n");
-                // Call function to start blocking.
-                block();
-                // Tell previous stage to stall.
-                toRename->iewInfo.stall = true;
-
-                ++iewIQFullEvents;
-                break;
-            } else if (inst->isLoad()) {
-                DPRINTF(IEW, "IEW: Issue: Memory instruction "
-                        "encountered, adding to LDSTQ.\n");
-
-                // Reserve a spot in the load store queue for this
-                // memory access.
-                ldstQueue.insertLoad(inst);
-
-                ++iewDispLoadInsts;
-            } else if (inst->isStore()) {
-                ldstQueue.insertStore(inst);
+DefaultIEW<Impl>::skidInsert(unsigned tid)
+{
+    DynInstPtr inst = NULL;
 
-                ++iewDispStoreInsts;
-            } else if (inst->isNonSpeculative()) {
-                DPRINTF(IEW, "IEW: Issue: Nonspeculative instruction "
-                        "encountered, skipping.\n");
+    while (!insts[tid].empty()) {
+        inst = insts[tid].front();
 
-                // Same hack as with stores.
-                inst->setCanCommit();
+        insts[tid].pop();
+
+        DPRINTF(IEW, "[tid:%i]: Inserting [sn:%lli] PC:%#x into "
+                "dispatch skidBuffer %i\n",tid, inst->seqNum,
+                inst->readPC(),tid);
+        // FIFO push keeps the instructions in the order they were queued.
+        skidBuffer[tid].push(inst);
+    }
+    // Sanity check: the skid buffer must never outgrow skidBufferMax.
+    assert(skidBuffer[tid].size() <= skidBufferMax &&
+           "Skidbuffer Exceeded Max Size");
+}
+
+template<class Impl>
+int
+DefaultIEW<Impl>::skidCount()
+{
+    int max=0;
+
+    list<unsigned>::iterator threads = (*activeThreads).begin();
+
+    while (threads != (*activeThreads).end()) {
+        unsigned thread_count = skidBuffer[*threads++].size();
+        if (max < thread_count)
+            max = thread_count;
+    }
+
+    return max;
+}
+
+template<class Impl>
+bool
+DefaultIEW<Impl>::skidsEmpty()
+{
+    list<unsigned>::iterator threads = (*activeThreads).begin();
+
+    while (threads != (*activeThreads).end()) {
+        if (!skidBuffer[*threads++].empty())
+            return false;
+    }
+
+    return true;
+}
+
+template <class Impl>
+void
+DefaultIEW<Impl>::updateStatus()
+{
+    bool any_unblocking = false;
+
+    list<unsigned>::iterator threads = (*activeThreads).begin();
+
+    threads = (*activeThreads).begin();
+
+    while (threads != (*activeThreads).end()) {
+        unsigned tid = *threads++;
+
+        if (dispatchStatus[tid] == Unblocking) {
+            any_unblocking = true;
+            break;
+        }
+    }
+
+    // If there are no ready instructions waiting to be scheduled by the IQ,
+    // and there's no stores waiting to write back, and dispatch is not
+    // unblocking, then there is no internal activity for the IEW stage.
+    if (_status == Active && !instQueue.hasReadyInsts() &&
+        !ldstQueue.willWB() && !any_unblocking) {
+        DPRINTF(IEW, "IEW switching to idle\n");
+
+        deactivateStage();
+
+        _status = Inactive;
+    } else if (_status == Inactive && (instQueue.hasReadyInsts() ||
+                                       ldstQueue.willWB() ||
+                                       any_unblocking)) {
+        // Otherwise there is internal activity.  Set to active.
+        DPRINTF(IEW, "IEW switching to active\n");
+
+        activateStage();
+
+        _status = Active;
+    }
+}
+
+template <class Impl>
+void
+DefaultIEW<Impl>::resetEntries()
+{
+    instQueue.resetEntries();
+    ldstQueue.resetEntries();
+}
+
+template <class Impl>
+void
+DefaultIEW<Impl>::readStallSignals(unsigned tid)
+{
+    if (fromCommit->commitBlock[tid]) {
+        stalls[tid].commit = true;
+    }
+
+    if (fromCommit->commitUnblock[tid]) {
+        assert(stalls[tid].commit);
+        stalls[tid].commit = false;
+    }
+}
+
+template <class Impl>
+bool
+DefaultIEW<Impl>::checkStall(unsigned tid)
+{
+    bool ret_val(false);
+
+    if (stalls[tid].commit) {
+        DPRINTF(IEW,"[tid:%i]: Stall from Commit stage detected.\n",tid);
+        ret_val = true;
+    } else if (instQueue.isFull(tid)) {
+        DPRINTF(IEW,"[tid:%i]: Stall: IQ  is full.\n",tid);
+        ret_val = true;
+    } else if (ldstQueue.isFull(tid)) {
+        DPRINTF(IEW,"[tid:%i]: Stall: LSQ is full\n",tid);
+
+        if (ldstQueue.numLoads(tid) > 0 ) {
+
+            DPRINTF(IEW,"[tid:%i]: LSQ oldest load: [sn:%i] \n",
+                    tid,ldstQueue.getLoadHeadSeqNum(tid));
+        }
+
+        if (ldstQueue.numStores(tid) > 0) {
+
+            DPRINTF(IEW,"[tid:%i]: LSQ oldest store: [sn:%i] \n",
+                    tid,ldstQueue.getStoreHeadSeqNum(tid));
+        }
+
+        ret_val = true;
+    } else if (ldstQueue.isStalled(tid)) {
+        DPRINTF(IEW,"[tid:%i]: Stall: LSQ stall detected.\n",tid);
+        ret_val = true;
+    }
+
+    return ret_val;
+}
+
+template <class Impl>
+void
+DefaultIEW<Impl>::checkSignalsAndUpdate(unsigned tid)
+{
+    // Per-cycle update of one thread's dispatch status, in priority order:
+    //  1. squash from commit    -> squash(), release rename, go to Squashing
+    //  2. ROB still squashing   -> stay in Squashing
+    //  3. any stall condition   -> block(), go to Blocked
+    //  4. was Blocked, no stall -> go to Unblocking and try to unblock()
+    //  5. was Squashing, squash over -> go back to Running
+
+    readStallSignals(tid);
+
+    if (fromCommit->commitInfo[tid].squash) {
+        squash(tid);
+        // If rename was told to block earlier, tell it to unblock now.
+        if (dispatchStatus[tid] == Blocked ||
+            dispatchStatus[tid] == Unblocking) {
+            toRename->iewUnblock[tid] = true;
+            wroteToTimeBuffer = true;
+        }
+
+        dispatchStatus[tid] = Squashing;
+
+        fetchRedirect[tid] = false;
+        return;
+    }
+
+    if (fromCommit->commitInfo[tid].robSquashing) {
+        DPRINTF(IEW, "[tid:%i]: ROB is still squashing.\n", tid);
+
+        dispatchStatus[tid] = Squashing;
+
+        return;
+    }
+
+    if (checkStall(tid)) {
+        block(tid);
+        dispatchStatus[tid] = Blocked;
+        return;
+    }
+
+    if (dispatchStatus[tid] == Blocked) {
+        // Status from previous cycle was blocked, but there are no more stall
+        // conditions.  Switch over to unblocking.
+        DPRINTF(IEW, "[tid:%i]: Done blocking, switching to unblocking.\n",
+                tid);
+
+        dispatchStatus[tid] = Unblocking;
+        // unblock() moves on to Running only if the skid buffer is empty.
+        unblock(tid);
+
+        return;
+    }
+
+    if (dispatchStatus[tid] == Squashing) {
+        // No squash, ROB-squashing or stall signal was seen for this thread
+        // this cycle (each returns early above), so resume running.
+        DPRINTF(IEW, "[tid:%i]: Done squashing, switching to running.\n",
+                tid);
+
+        dispatchStatus[tid] = Running;
+
+        return;
+    }
+}
+
+template <class Impl>
+void
+DefaultIEW<Impl>::sortInsts()
+{
+    int insts_from_rename = fromRename->size;
+
+    for (int i = 0; i < numThreads; i++)
+        assert(insts[i].empty());
+
+    for (int i = 0; i < insts_from_rename; ++i) {
+        insts[fromRename->insts[i]->threadNumber].push(fromRename->insts[i]);
+    }
+}
+
+template <class Impl>
+void
+DefaultIEW<Impl>::wakeCPU()
+{
+    cpu->wakeCPU();
+}
+
+template <class Impl>
+void
+DefaultIEW<Impl>::activityThisCycle()
+{
+    DPRINTF(Activity, "Activity this cycle.\n");
+    cpu->activityThisCycle();
+}
+
+template <class Impl>
+inline void
+DefaultIEW<Impl>::activateStage()
+{
+    DPRINTF(Activity, "Activating stage.\n");
+    cpu->activateStage(FullCPU::IEWIdx);
+}
+
+template <class Impl>
+inline void
+DefaultIEW<Impl>::deactivateStage()
+{
+    DPRINTF(Activity, "Deactivating stage.\n");
+    cpu->deactivateStage(FullCPU::IEWIdx);
+}
+
+template<class Impl>
+void
+DefaultIEW<Impl>::dispatch(unsigned tid)
+{
+    // If status is Running or idle,
+    //     call dispatchInsts()
+    // If status is Unblocking,
+    //     buffer any instructions coming from rename
+    //     continue trying to empty skid buffer
+    //     check if stall conditions have passed
+
+    if (dispatchStatus[tid] == Blocked) {
+        ++iewBlockCycles;
+
+    } else if (dispatchStatus[tid] == Squashing) {
+        ++iewSquashCycles;
+    }
+
+    // Dispatch should try to dispatch as many instructions as its bandwidth
+    // will allow, as long as it is not currently blocked.
+    if (dispatchStatus[tid] == Running ||
+        dispatchStatus[tid] == Idle) {
+        DPRINTF(IEW, "[tid:%i] Not blocked, so attempting to run "
+                "dispatch.\n", tid);
+
+        dispatchInsts(tid);
+    } else if (dispatchStatus[tid] == Unblocking) {
+        // Make sure that the skid buffer has something in it if the
+        // status is unblocking.
+        assert(!skidsEmpty());
+
+        // If the status was unblocking, then instructions from the skid
+        // buffer were used.  Remove those instructions and handle
+        // the rest of unblocking.
+        dispatchInsts(tid);
+
+        ++iewUnblockCycles;
+
+        if (validInstsFromRename() && dispatchedAllInsts) {
+            // Add the current inputs to the skid buffer so they can be
+            // reprocessed when this stage unblocks.
+            skidInsert(tid);
+        }
+
+        unblock(tid);
+    }
+}
+
+template <class Impl>
+void
+DefaultIEW<Impl>::dispatchInsts(unsigned tid)
+{
+    dispatchedAllInsts = true;
+
+    // Obtain instructions from skid buffer if unblocking, or queue from rename
+    // otherwise.
+    std::queue<DynInstPtr> &insts_to_dispatch =
+        dispatchStatus[tid] == Unblocking ?
+        skidBuffer[tid] : insts[tid];
+
+    int insts_to_add = insts_to_dispatch.size();
+
+    DynInstPtr inst;
+    bool add_to_iq = false;
+    int dis_num_inst = 0;
+
+    // Dispatch from the front of the source queue until either the queue
+    // or the per-cycle width (issueReadWidth) is exhausted.
+    for ( ; dis_num_inst < insts_to_add &&
+              dis_num_inst < issueReadWidth;
+          ++dis_num_inst)
+    {
+        inst = insts_to_dispatch.front();
+
+        if (dispatchStatus[tid] == Unblocking) {
+            DPRINTF(IEW, "[tid:%i]: Issue: Examining instruction from skid "
+                    "buffer\n", tid);
+        }
+
+        // Make sure there's a valid instruction there.
+        assert(inst);
 
-                // Specificall insert it as nonspeculative.
+        DPRINTF(IEW, "[tid:%i]: Issue: Adding PC %#x [sn:%lli] [tid:%i] to "
+                "IQ.\n",
+                tid, inst->readPC(), inst->seqNum, inst->threadNumber);
+
+        // Be sure to mark these instructions as ready so that the
+        // commit stage can go ahead and execute them, and mark
+        // them as issued so the IQ doesn't reprocess them.
+        // -------------
+        // @TODO: What happens if the ldstqueue is full?
+        //        Do we process the other instructions?
+
+        // Check for squashed instructions.
+        if (inst->isSquashed()) {
+            DPRINTF(IEW, "[tid:%i]: Issue: Squashed instruction encountered, "
+                    "not adding to IQ.\n", tid);
+
+            ++iewDispSquashedInsts;
+
+            insts_to_dispatch.pop();
+
+            //Tell Rename That An Instruction has been processed
+            if (inst->isLoad() || inst->isStore()) {
+                toRename->iewInfo[tid].dispatchedToLSQ++;
+            }
+            toRename->iewInfo[tid].dispatched++;
+
+            continue;
+        }
+
+        // Check for full conditions.
+        if (instQueue.isFull(tid)) {
+            DPRINTF(IEW, "[tid:%i]: Issue: IQ has become full.\n", tid);
+
+            // Call function to start blocking.
+            block(tid);
+
+            // Set unblock to false. Special case where we are using
+            // skidbuffer (unblocking) instructions but then we still
+            // get full in the IQ.
+            toRename->iewUnblock[tid] = false;
+
+            dispatchedAllInsts = false;
+
+            ++iewIQFullEvents;
+            break;
+        } else if (ldstQueue.isFull(tid)) {
+            DPRINTF(IEW, "[tid:%i]: Issue: LSQ has become full.\n",tid);
+
+            // Call function to start blocking.
+            block(tid);
+
+            // Set unblock to false. Special case where we are using
+            // skidbuffer (unblocking) instructions but then we still
+            // get full in the LSQ.
+            toRename->iewUnblock[tid] = false;
+
+            dispatchedAllInsts = false;
+
+            ++iewLSQFullEvents;
+            break;
+        }
+
+        // Otherwise issue the instruction just fine.
+        if (inst->isLoad()) {
+            DPRINTF(IEW, "[tid:%i]: Issue: Memory instruction "
+                    "encountered, adding to LSQ.\n", tid);
+
+            // Reserve a spot in the load store queue for this
+            // memory access.
+            ldstQueue.insertLoad(inst);
+
+            ++iewDispLoadInsts;
+
+            add_to_iq = true;
+
+            toRename->iewInfo[tid].dispatchedToLSQ++;
+        } else if (inst->isStore()) {
+            DPRINTF(IEW, "[tid:%i]: Issue: Memory instruction "
+                    "encountered, adding to LSQ.\n", tid);
+
+            ldstQueue.insertStore(inst);
+
+            ++iewDispStoreInsts;
+
+            if (inst->isNonSpeculative()) {
+                inst->setCanCommit();
                 instQueue.insertNonSpec(inst);
+                add_to_iq = false;
 
                 ++iewDispNonSpecInsts;
+            } else {
+                add_to_iq = true;
+            }
 
-                continue;
-            } else if (inst->isNop()) {
-                DPRINTF(IEW, "IEW: Issue: Nop instruction encountered "
-                        ", skipping.\n");
+            toRename->iewInfo[tid].dispatchedToLSQ++;
+#if FULL_SYSTEM
+        } else if (inst->isMemBarrier() || inst->isWriteBarrier()) {
+            inst->setCanCommit();
+            instQueue.insertBarrier(inst);
+            add_to_iq = false;
+#endif
+        } else if (inst->isNonSpeculative()) {
+            DPRINTF(IEW, "[tid:%i]: Issue: Nonspeculative instruction "
+                    "encountered, skipping.\n", tid);
 
-                inst->setIssued();
-                inst->setExecuted();
-                inst->setCanCommit();
+            // Same hack as with stores.
+            inst->setCanCommit();
 
-                instQueue.advanceTail(inst);
+            // Specifically insert it as nonspeculative.
+            instQueue.insertNonSpec(inst);
 
-                continue;
-            } else if (inst->isExecuted()) {
-                assert(0 && "Instruction shouldn't be executed.\n");
-                DPRINTF(IEW, "IEW: Issue: Executed branch encountered, "
-                        "skipping.\n");
+            ++iewDispNonSpecInsts;
 
-                inst->setIssued();
-                inst->setCanCommit();
+            add_to_iq = false;
+        } else if (inst->isNop()) {
+            DPRINTF(IEW, "[tid:%i]: Issue: Nop instruction encountered, "
+                    "skipping.\n", tid);
 
-                instQueue.advanceTail(inst);
+            inst->setIssued();
+            inst->setExecuted();
+            inst->setCanCommit();
 
-                continue;
-            }
+            instQueue.advanceTail(inst);
+
+            add_to_iq = false;
+        } else if (inst->isExecuted()) {
+            assert(0 && "Instruction shouldn't be executed.\n");
+            DPRINTF(IEW, "Issue: Executed branch encountered, "
+                    "skipping.\n");
 
-            // If the instruction queue is not full, then add the
-            // instruction.
-            instQueue.insert(fromRename->insts[inst_num]);
+            inst->setIssued();
+            inst->setCanCommit();
 
-            ++iewDispatchedInsts;
+            instQueue.advanceTail(inst);
+
+            add_to_iq = false;
+        } else {
+            add_to_iq = true;
         }
+
+        // Only the cases above that left add_to_iq set take the normal
+        // IQ insert; the others were already handled by their own branch.
+        if (add_to_iq) {
+            instQueue.insert(inst);
+        }
+
+        insts_to_dispatch.pop();
+
+        toRename->iewInfo[tid].dispatched++;
+
+        ++iewDispatchedInsts;
+    }
+
+    if (!insts_to_dispatch.empty()) {
+        DPRINTF(IEW,"[tid:%i]: Issue: Bandwidth Full. Blocking.\n", tid);
+        block(tid);
+        toRename->iewUnblock[tid] = false;
     }
+
+    if (dispatchStatus[tid] == Idle && dis_num_inst) {
+        dispatchStatus[tid] = Running;
+
+        updatedQueues = true;
+    }
+
+    dis_num_inst = 0;
 }
 
 template <class Impl>
 void
-SimpleIEW<Impl>::executeInsts()
+DefaultIEW<Impl>::printAvailableInsts()
 {
-    ////////////////////////////////////////
-    //EXECUTE/WRITEBACK stage
-    ////////////////////////////////////////
+    int inst = 0;
+
+    cout << "Available Instructions: ";
+
+    while (inst < issueWidth && fromIssue->insts[inst]) {
+        // Bounded like executeInsts(); three entries per output row.
+        if (inst%3==0) cout << "\n\t";
+
+        cout << "PC: " << fromIssue->insts[inst]->readPC()
+             << " TN: " << fromIssue->insts[inst]->threadNumber
+             << " SN: " << fromIssue->insts[inst]->seqNum << " | ";
 
-    //Put into its own function?
-    //Similarly should probably have separate execution for int vs FP.
-    // Above comment is handled by the issue queue only issuing a valid
-    // mix of int/fp instructions.
-    //Actually okay to just have one execution, buuuuuut will need
-    //somewhere that defines the execution latency of all instructions.
-    // @todo: Move to the FU pool used in the current full cpu.
+        inst++;
 
-    int fu_usage = 0;
-    bool fetch_redirect = false;
-    int inst_slot = 0;
-    int time_slot = 0;
+    }
+
+    cout << "\n";
+}
+
+template <class Impl>
+void
+DefaultIEW<Impl>::executeInsts()
+{
+    //bool fetch_redirect[(*activeThreads).size()];
+    wbNumInst = 0;
+    wbCycle = 0;
+
+    list<unsigned>::iterator threads = (*activeThreads).begin();
+
+    while (threads != (*activeThreads).end()) {
+        unsigned tid = *threads++;
+        fetchRedirect[tid] = false;
+    }
+
+#if 0
+    printAvailableInsts();
+#endif
 
     // Execute/writeback any instructions that are available.
-    for (int inst_num = 0;
-         fu_usage < executeWidth && /* Haven't exceeded available FU's. */
-             inst_num < issueWidth &&
-             fromIssue->insts[inst_num];
+    int inst_num = 0;
+    for ( ; inst_num < issueWidth &&  /* Haven't exceeded issue bandwidth */
+              fromIssue->insts[inst_num];
          ++inst_num) {
 
-        DPRINTF(IEW, "IEW: Execute: Executing instructions from IQ.\n");
+        DPRINTF(IEW, "Execute: Executing instructions from IQ.\n");
 
         // Get instruction from issue's queue.
         DynInstPtr inst = fromIssue->insts[inst_num];
 
-        DPRINTF(IEW, "IEW: Execute: Processing PC %#x.\n", inst->readPC());
+        DPRINTF(IEW, "Execute: Processing PC %#x, [tid:%i] [sn:%i].\n",
+                inst->readPC(), inst->threadNumber,inst->seqNum);
 
         // Check if the instruction is squashed; if so then skip it
         // and don't count it towards the FU usage.
         if (inst->isSquashed()) {
-            DPRINTF(IEW, "IEW: Execute: Instruction was squashed.\n");
+            DPRINTF(IEW, "Execute: Instruction was squashed.\n");
 
             // Consider this instruction executed so that commit can go
             // ahead and retire the instruction.
             inst->setExecuted();
 
-            toCommit->insts[inst_num] = inst;
+            // Not sure if I should set this here or just let commit try to
+            // commit any squashed instructions.  I like the latter a bit more.
+            inst->setCanCommit();
 
             ++iewExecSquashedInsts;
 
             continue;
         }
 
-        inst->setExecuted();
-
-        // If an instruction is executed, then count it towards FU usage.
-        ++fu_usage;
+        Fault fault = NoFault;
 
         // Execute instruction.
         // Note that if the instruction faults, it will be handled
         // at the commit stage.
-        if (inst->isMemRef()) {
-            DPRINTF(IEW, "IEW: Execute: Calculating address for memory "
+        if (inst->isMemRef() &&
+            (!inst->isDataPrefetch() && !inst->isInstPrefetch())) {
+            DPRINTF(IEW, "Execute: Calculating address for memory "
                     "reference.\n");
 
             // Tell the LDSTQ to execute this instruction (if it is a load).
             if (inst->isLoad()) {
-                ldstQueue.executeLoad(inst);
+                // Loads will mark themselves as executed, and their writeback
+                // event adds the instruction to the queue to commit
+                fault = ldstQueue.executeLoad(inst);
 
                 ++iewExecLoadInsts;
             } else if (inst->isStore()) {
                 ldstQueue.executeStore(inst);
 
                 ++iewExecStoreInsts;
+
+                // If the store had a fault then it may not have a mem req
+                if (inst->req && !(inst->req->flags & LOCKED)) {
+                    inst->setExecuted();
+
+                    instToCommit(inst);
+                }
+                // Store conditionals will mark themselves as executed, and
+                // their writeback event will add the instruction to the queue
+                // to commit.
             } else {
-                panic("IEW: Unexpected memory type!\n");
+                panic("Unexpected memory type!\n");
             }
 
         } else {
             inst->execute();
 
             ++iewExecutedInsts;
-        }
 
-        // First check the time slot that this instruction will write
-        // to.  If there are free write ports at the time, then go ahead
-        // and write the instruction to that time.  If there are not,
-        // keep looking back to see where's the first time there's a
-        // free slot.  What happens if you run out of free spaces?
-        // For now naively assume that all instructions take one cycle.
-        // Otherwise would have to look into the time buffer based on the
-        // latency of the instruction.
-        (*iewQueue)[time_slot].insts[inst_slot];
-        while ((*iewQueue)[time_slot].insts[inst_slot]) {
-            if (inst_slot < issueWidth) {
-                ++inst_slot;
-            } else {
-                ++time_slot;
-                inst_slot = 0;
-            }
+            inst->setExecuted();
 
-            assert(time_slot < 5);
+            instToCommit(inst);
         }
 
-        // May actually have to work this out, especially with loads and stores
-
-        // Add finished instruction to queue to commit.
-        (*iewQueue)[time_slot].insts[inst_slot] = inst;
-        (*iewQueue)[time_slot].size++;
-
         // Check if branch was correct.  This check happens after the
         // instruction is added to the queue because even if the branch
         // is mispredicted, the branch instruction itself is still valid.
         // Only handle this if there hasn't already been something that
         // redirects fetch in this group of instructions.
-        if (!fetch_redirect) {
+
+        // This probably needs to prioritize the redirects if a different
+        // scheduler is used.  Currently the scheduler schedules the oldest
+        // instruction first, so the branch resolution order will be correct.
+        unsigned tid = inst->threadNumber;
+
+        if (!fetchRedirect[tid]) {
+
             if (inst->mispredicted()) {
-                fetch_redirect = true;
+                fetchRedirect[tid] = true;
 
-                DPRINTF(IEW, "IEW: Execute: Branch mispredict detected.\n");
-                DPRINTF(IEW, "IEW: Execute: Redirecting fetch to PC: %#x.\n",
+                DPRINTF(IEW, "Execute: Branch mispredict detected.\n");
+                DPRINTF(IEW, "Execute: Redirecting fetch to PC: %#x.\n",
                         inst->nextPC);
 
                 // If incorrect, then signal the ROB that it must be squashed.
-                squashDueToBranch(inst);
+                squashDueToBranch(inst, tid);
 
                 if (inst->predTaken()) {
                     predictedTakenIncorrect++;
+                } else {
+                    predictedNotTakenIncorrect++;
                 }
-            } else if (ldstQueue.violation()) {
-                fetch_redirect = true;
+            } else if (ldstQueue.violation(tid)) {
+                fetchRedirect[tid] = true;
 
-                // Get the DynInst that caused the violation.
-                DynInstPtr violator = ldstQueue.getMemDepViolator();
+                // Get the DynInst that caused the violation.  Note that this
+                // clears the violation signal.
+                DynInstPtr violator;
+                violator = ldstQueue.getMemDepViolator(tid);
 
-                DPRINTF(IEW, "IEW: LDSTQ detected a violation.  Violator PC: "
+                DPRINTF(IEW, "LDSTQ detected a violation.  Violator PC: "
                         "%#x, inst PC: %#x.  Addr is: %#x.\n",
                         violator->readPC(), inst->readPC(), inst->physEffAddr);
 
@@ -573,164 +1197,196 @@ SimpleIEW<Impl>::executeInsts()
                 instQueue.violation(inst, violator);
 
                 // Squash.
-                squashDueToMem(inst);
+                squashDueToMemOrder(inst,tid);
 
                 ++memOrderViolationEvents;
+            } else if (ldstQueue.loadBlocked(tid) &&
+                       !ldstQueue.isLoadBlockedHandled(tid)) {
+                fetchRedirect[tid] = true;
+
+                DPRINTF(IEW, "Load operation couldn't execute because the "
+                        "memory system is blocked.  PC: %#x [sn:%lli]\n",
+                        inst->readPC(), inst->seqNum);
+
+                squashDueToMemBlocked(inst, tid);
             }
         }
     }
+
+    if (inst_num) {
+        if (exeStatus == Idle) {
+            exeStatus = Running;
+        }
+
+        updatedQueues = true;
+
+        cpu->activityThisCycle();
+    }
+
+    // Need to reset this in case a writeback event needs to write into the
+    // iew queue.  That way the writeback event will write into the correct
+    // spot in the queue.
+    wbNumInst = 0;
 }
 
-template<class Impl>
+template <class Impl>
 void
-SimpleIEW<Impl>::tick()
+DefaultIEW<Impl>::writebackInsts()
 {
-    // Considering putting all the state-determining stuff in this section.
+    // Walk the instructions queued in toCommit for this cycle, stopping
+    // at issueWidth entries or at the first empty slot, whichever comes
+    // first.  For each one that has executed and is not squashed, wake
+    // its dependents in the instruction queue and set every renamed
+    // destination register in the scoreboard.  This is a separate pass
+    // from executeInsts() because, once instructions have real
+    // latencies, writeback cannot be folded into the execute loop.
+    for (int inst_num = 0; inst_num < issueWidth &&
+             toCommit->insts[inst_num]; inst_num++) {
+        DynInstPtr inst = toCommit->insts[inst_num];
 
+        DPRINTF(IEW, "Sending instructions to commit, PC %#x.\n",
+                inst->readPC());
+
+        // Some instructions will be sent to commit without having
+        // executed because they need commit to handle them.
+        // E.g. Uncached loads have not actually executed when they
+        // are first sent to commit.  Instead commit must tell the LSQ
+        // when it's ready to execute the uncached load.
+        if (!inst->isSquashed() && inst->isExecuted()) {
+            instQueue.wakeDependents(inst);
+
+            for (int i = 0; i < inst->numDestRegs(); i++) {
+                //mark as Ready
+                DPRINTF(IEW,"Setting Destination Register %i\n",
+                        inst->renamedDestRegIdx(i));
+                scoreboard->setReg(inst->renamedDestRegIdx(i));
+            }
+        }
+    }
+}
+
+template<class Impl>
+void
+DefaultIEW<Impl>::tick()
+{
     // Try to fill up issue queue with as many instructions as bandwidth
     // allows.
-    // Decode should try to execute as many instructions as its bandwidth
-    // will allow, as long as it is not currently blocked.
+    wbNumInst = 0;
+    wbCycle = 0;
 
-    // Check if the stage is in a running status.
-    if (_status != Blocked && _status != Squashing) {
-        DPRINTF(IEW, "IEW: Status is not blocked, attempting to run "
-                     "stage.\n");
-        iew();
+    wroteToTimeBuffer = false;
+    updatedQueues = false;
 
-        // If it's currently unblocking, check to see if it should switch
-        // to running.
-        if (_status == Unblocking) {
-            unblock();
+    sortInsts();
 
-            ++iewUnblockCycles;
-        }
-    } else if (_status == Squashing) {
+    list<unsigned>::iterator threads = (*activeThreads).begin();
 
-        DPRINTF(IEW, "IEW: Still squashing.\n");
+    // Check stall and squash signals.
+    while (threads != (*activeThreads).end()) {
+        unsigned tid = *threads++;
 
-        // Check if stage should remain squashing.  Stop squashing if the
-        // squash signal clears.
-        if (!fromCommit->commitInfo.squash &&
-            !fromCommit->commitInfo.robSquashing) {
-            DPRINTF(IEW, "IEW: Done squashing, changing status to "
-                    "running.\n");
+        DPRINTF(IEW,"Issue: Processing [tid:%i]\n",tid);
 
-            _status = Running;
-            instQueue.stopSquash();
-        } else {
-            instQueue.doSquash();
-        }
+        checkSignalsAndUpdate(tid);
+        dispatch(tid);
 
-        ++iewSquashCycles;
-    } else if (_status == Blocked) {
-        // Continue to tell previous stage to stall.
-        toRename->iewInfo.stall = true;
-
-        // Check if possible stall conditions have cleared.
-        if (!fromCommit->commitInfo.stall &&
-            !instQueue.isFull()) {
-            DPRINTF(IEW, "IEW: Stall signals cleared, going to unblock.\n");
-            _status = Unblocking;
-        }
+    }
 
-        // If there's still instructions coming from rename, continue to
-        // put them on the skid buffer.
-        if (fromRename->size == 0) {
-            block();
-        }
+    if (exeStatus != Squashing) {
+        executeInsts();
 
-        if (fromCommit->commitInfo.squash ||
-            fromCommit->commitInfo.robSquashing) {
-            squash();
-        }
+        writebackInsts();
 
-        ++iewBlockCycles;
+        // Have the instruction queue try to schedule any ready instructions.
+        // (In actuality, this scheduling is for instructions that will
+        // be executed next cycle.)
+        instQueue.scheduleReadyInsts();
+
+        // Also should advance its own time buffers if the stage ran.
+        // Not the best place for it, but this works (hopefully).
+        issueToExecQueue.advance();
     }
 
-    // @todo: Maybe put these at the beginning, so if it's idle it can
-    // return early.
-    // Write back number of free IQ entries here.
-    toRename->iewInfo.freeIQEntries = instQueue.numFreeEntries();
+    bool broadcast_free_entries = false;
+
+    if (updatedQueues || exeStatus == Running || updateLSQNextCycle) {
+        exeStatus = Idle;
+        updateLSQNextCycle = false;
+
+        broadcast_free_entries = true;
+    }
 
+    // Writeback any stores using any leftover bandwidth.
     ldstQueue.writebackStores();
 
+    // Free function units marked as being freed this cycle.
+    fuPool->processFreeUnits();
+
     // Check the committed load/store signals to see if there's a load
     // or store to commit.  Also check if it's being told to execute a
     // nonspeculative instruction.
     // This is pretty inefficient...
-    if (!fromCommit->commitInfo.squash &&
-        !fromCommit->commitInfo.robSquashing) {
-        ldstQueue.commitStores(fromCommit->commitInfo.doneSeqNum);
-        ldstQueue.commitLoads(fromCommit->commitInfo.doneSeqNum);
-    }
 
-    if (fromCommit->commitInfo.nonSpecSeqNum != 0) {
-        instQueue.scheduleNonSpec(fromCommit->commitInfo.nonSpecSeqNum);
-    }
+    threads = (*activeThreads).begin();
+    while (threads != (*activeThreads).end()) {
+        unsigned tid = (*threads++);
 
-    DPRINTF(IEW, "IEW: IQ has %i free entries.\n",
-            instQueue.numFreeEntries());
-}
+        DPRINTF(IEW,"Processing [tid:%i]\n",tid);
 
-template<class Impl>
-void
-SimpleIEW<Impl>::iew()
-{
-    // Might want to put all state checks in the tick() function.
-    // Check if being told to stall from commit.
-    if (fromCommit->commitInfo.stall) {
-        block();
-        return;
-    } else if (fromCommit->commitInfo.squash ||
-               fromCommit->commitInfo.robSquashing) {
-        // Also check if commit is telling this stage to squash.
-        squash();
-        return;
-    }
+        if (fromCommit->commitInfo[tid].doneSeqNum != 0 &&
+            !fromCommit->commitInfo[tid].squash &&
+            !fromCommit->commitInfo[tid].robSquashing) {
 
-    dispatchInsts();
+            ldstQueue.commitStores(fromCommit->commitInfo[tid].doneSeqNum,tid);
 
-    // Have the instruction queue try to schedule any ready instructions.
-    instQueue.scheduleReadyInsts();
+            ldstQueue.commitLoads(fromCommit->commitInfo[tid].doneSeqNum,tid);
 
-    executeInsts();
+            updateLSQNextCycle = true;
+            instQueue.commit(fromCommit->commitInfo[tid].doneSeqNum,tid);
+        }
 
-    // Loop through the head of the time buffer and wake any dependents.
-    // These instructions are about to write back.  In the simple model
-    // this loop can really happen within the previous loop, but when
-    // instructions have actual latencies, this loop must be separate.
-    // Also mark scoreboard that this instruction is finally complete.
-    // Either have IEW have direct access to rename map, or have this as
-    // part of backwards communication.
-    for (int inst_num = 0; inst_num < issueWidth &&
-             toCommit->insts[inst_num]; inst_num++)
-    {
-        DynInstPtr inst = toCommit->insts[inst_num];
+        if (fromCommit->commitInfo[tid].nonSpecSeqNum != 0) {
 
-        DPRINTF(IEW, "IEW: Sending instructions to commit, PC %#x.\n",
-                inst->readPC());
+            //DPRINTF(IEW,"NonspecInst from thread %i",tid);
+            if (fromCommit->commitInfo[tid].uncached) {
+                instQueue.replayMemInst(fromCommit->commitInfo[tid].uncachedLoad);
+            } else {
+                instQueue.scheduleNonSpec(
+                    fromCommit->commitInfo[tid].nonSpecSeqNum);
+            }
+        }
 
-        if(!inst->isSquashed()) {
-            instQueue.wakeDependents(inst);
+        if (broadcast_free_entries) {
+            toFetch->iewInfo[tid].iqCount =
+                instQueue.getCount(tid);
+            toFetch->iewInfo[tid].ldstqCount =
+                ldstQueue.getCount(tid);
 
-            for (int i = 0; i < inst->numDestRegs(); i++)
-            {
-                renameMap->markAsReady(inst->renamedDestRegIdx(i));
-            }
+            toRename->iewInfo[tid].usedIQ = true;
+            toRename->iewInfo[tid].freeIQEntries =
+                instQueue.numFreeEntries(tid);
+            toRename->iewInfo[tid].usedLSQ = true;
+            toRename->iewInfo[tid].freeLSQEntries =
+                ldstQueue.numFreeEntries(tid);
+
+            wroteToTimeBuffer = true;
         }
+
+        DPRINTF(IEW, "[tid:%i], Dispatch dispatched %i instructions.\n",
+                tid, toRename->iewInfo[tid].dispatched);
+
+        //thread_queue.pop();
     }
 
-    // Also should advance its own time buffers if the stage ran.
-    // Not the best place for it, but this works (hopefully).
-    issueToExecQueue.advance();
-}
+    DPRINTF(IEW, "IQ has %i free entries (Can schedule: %i).  "
+            "LSQ has %i free entries.\n",
+            instQueue.numFreeEntries(), instQueue.hasReadyInsts(),
+            ldstQueue.numFreeEntries());
 
-#if !FULL_SYSTEM
-template<class Impl>
-void
-SimpleIEW<Impl>::lsqWriteback()
-{
-    ldstQueue.writebackAllInsts();
+    updateStatus();
+
+    if (wroteToTimeBuffer) {
+        DPRINTF(Activity, "Activity this cycle.\n");
+        cpu->activityThisCycle();
+    }
 }
-#endif
diff --git a/cpu/o3/inst_queue.hh b/cpu/o3/inst_queue.hh
index 43fe96c49..283bbdc22 100644
--- a/cpu/o3/inst_queue.hh
+++ b/cpu/o3/inst_queue.hh
@@ -26,8 +26,8 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __CPU_O3_CPU_INST_QUEUE_HH__
-#define __CPU_O3_CPU_INST_QUEUE_HH__
+#ifndef __CPU_O3_INST_QUEUE_HH__
+#define __CPU_O3_INST_QUEUE_HH__
 
 #include <list>
 #include <map>
@@ -37,8 +37,12 @@
 #include "base/statistics.hh"
 #include "base/timebuf.hh"
 #include "cpu/inst_seq.hh"
+#include "encumbered/cpu/full/op_class.hh"
 #include "sim/host.hh"
 
+class FUPool;
+class MemInterface;
+
 /**
  * A standard instruction queue class.  It holds ready instructions, in
  * order, in seperate priority queues to facilitate the scheduling of
@@ -47,7 +51,14 @@
  * floating point registers have their indices start after the integer
  * registers (ie with 96 int and 96 fp registers, regs 0-95 are integer
  * and 96-191 are fp).  This remains true even for both logical and
- * physical register indices.
+ * physical register indices. The IQ depends on the memory dependence unit to
+ * track when memory operations are ready in terms of ordering; register
+ * dependencies are tracked normally. Right now the IQ also handles the
+ * execution timing; this is mainly to allow back-to-back scheduling without
+ * requiring IEW to be able to peek into the IQ. At the end of the execution
+ * latency, the instruction is put into the queue to execute, where it will
+ * have the execute() function called on it.
+ * @todo: Make IQ able to handle multiple FU pools.
  */
 template <class Impl>
 class InstructionQueue
@@ -58,87 +69,178 @@ class InstructionQueue
     typedef typename Impl::DynInstPtr DynInstPtr;
     typedef typename Impl::Params Params;
 
+    typedef typename Impl::CPUPol::IEW IEW;
     typedef typename Impl::CPUPol::MemDepUnit MemDepUnit;
     typedef typename Impl::CPUPol::IssueStruct IssueStruct;
     typedef typename Impl::CPUPol::TimeStruct TimeStruct;
 
-    // Typedef of iterator through the list of instructions.  Might be
-    // better to untie this from the FullCPU or pass its information to
-    // the stages.
+    // Typedef of iterator through the list of instructions.
     typedef typename std::list<DynInstPtr>::iterator ListIt;
 
-    /**
-     * Struct for comparing entries to be added to the priority queue.  This
-     * gives reverse ordering to the instructions in terms of sequence
-     * numbers: the instructions with smaller sequence numbers (and hence
-     * are older) will be at the top of the priority queue.
-     */
-    struct pqCompare
-    {
-        bool operator() (const DynInstPtr &lhs, const DynInstPtr &rhs) const
-        {
-            return lhs->seqNum > rhs->seqNum;
-        }
-    };
+    friend class Impl::FullCPU;
 
-    /**
-     * Struct for comparing entries to be added to the set.  This gives
-     * standard ordering in terms of sequence numbers.
-     */
-    struct setCompare
-    {
-        bool operator() (const DynInstPtr &lhs, const DynInstPtr &rhs) const
-        {
-            return lhs->seqNum < rhs->seqNum;
-        }
+    /** FU completion event class. */
+    class FUCompletion : public Event {
+      private:
+        /** Executing instruction. */
+        DynInstPtr inst;
+
+        /** Index of the FU used for executing. */
+        int fuIdx;
+
+        /** Pointer back to the instruction queue. */
+        InstructionQueue<Impl> *iqPtr;
+
+      public:
+        /** Construct a FU completion event. */
+        FUCompletion(DynInstPtr &_inst, int fu_idx,
+                     InstructionQueue<Impl> *iq_ptr);
+
+        virtual void process();
+        virtual const char *description();
     };
 
-    typedef std::priority_queue<DynInstPtr, vector<DynInstPtr>, pqCompare>
-    ReadyInstQueue;
+    /** Constructs an IQ. */
+    InstructionQueue(Params *params);
+
+    /** Destructs the IQ. */
+    ~InstructionQueue();
 
-    InstructionQueue(Params &params);
+    /** Returns the name of the IQ. */
+    std::string name() const;
 
+    /** Registers statistics. */
     void regStats();
 
-    void setCPU(FullCPU *cpu);
+    /** Sets CPU pointer. */
+    void setCPU(FullCPU *_cpu) { cpu = _cpu; }
 
+    /** Sets active threads list. */
+    void setActiveThreads(std::list<unsigned> *at_ptr);
+
+    /** Sets the IEW pointer. */
+    void setIEW(IEW *iew_ptr) { iewStage = iew_ptr; }
+
+    /** Sets the time buffer between issue and execute. */
     void setIssueToExecuteQueue(TimeBuffer<IssueStruct> *i2eQueue);
 
+    /** Sets the global time buffer. */
     void setTimeBuffer(TimeBuffer<TimeStruct> *tb_ptr);
 
+    /** Number of entries needed for given amount of threads. */
+    int entryAmount(int num_threads);
+
+    /** Resets max entries for all threads. */
+    void resetEntries();
+
+    /** Returns total number of free entries. */
     unsigned numFreeEntries();
 
+    /** Returns number of free entries for a thread. */
+    unsigned numFreeEntries(unsigned tid);
+
+    /** Returns whether or not the IQ is full. */
     bool isFull();
 
+    /** Returns whether or not the IQ is full for a specific thread. */
+    bool isFull(unsigned tid);
+
+    /** Returns if there are any ready instructions in the IQ. */
+    bool hasReadyInsts();
+
+    /** Inserts a new instruction into the IQ. */
     void insert(DynInstPtr &new_inst);
 
+    /** Inserts a new, non-speculative instruction into the IQ. */
     void insertNonSpec(DynInstPtr &new_inst);
 
+    /** Inserts a memory or write barrier into the IQ to make sure
+     *  loads and stores are ordered properly.
+     */
+    void insertBarrier(DynInstPtr &barr_inst);
+
+    /**
+     * Advances the tail of the IQ, used if an instruction is not added to the
+     * IQ for scheduling.
+     * @todo: Rename this function.
+     */
     void advanceTail(DynInstPtr &inst);
 
+    /** Process FU completion event. */
+    void processFUCompletion(DynInstPtr &inst, int fu_idx);
+
+    /**
+     * Schedules ready instructions, adding the ready ones (oldest first) to
+     * the queue to execute.
+     */
     void scheduleReadyInsts();
 
+    /** Schedules a single specific non-speculative instruction. */
     void scheduleNonSpec(const InstSeqNum &inst);
 
+    /**
+     * Commits all instructions up to and including the given sequence number,
+     * for a specific thread.
+     */
+    void commit(const InstSeqNum &inst, unsigned tid = 0);
+
+    /** Wakes all dependents of a completed instruction. */
     void wakeDependents(DynInstPtr &completed_inst);
 
+    /** Adds a ready memory instruction to the ready list. */
+    void addReadyMemInst(DynInstPtr &ready_inst);
+
+    /**
+     * Reschedules a memory instruction. It will be ready to issue once
+     * replayMemInst() is called.
+     */
+    void rescheduleMemInst(DynInstPtr &resched_inst);
+
+    /** Replays a memory instruction. It must be rescheduled first. */
+    void replayMemInst(DynInstPtr &replay_inst);
+
+    /** Completes a memory operation. */
+    void completeMemInst(DynInstPtr &completed_inst);
+
+    /** Indicates an ordering violation between a store and a load. */
     void violation(DynInstPtr &store, DynInstPtr &faulting_load);
 
-    // Change this to take in the sequence number
-    void squash();
+    /**
+     * Squashes instructions for a thread. Squashing information is obtained
+     * from the time buffer.
+     */
+    void squash(unsigned tid);
+
+    /** Returns the number of used entries for a thread. */
+    unsigned getCount(unsigned tid) { return count[tid]; };
 
-    void doSquash();
+    /** Updates the number of free entries. */
+    void updateFreeEntries(int num) { freeEntries += num; }
 
-    void stopSquash();
+    /** Debug function to print all instructions. */
+    void printInsts();
 
   private:
+    /** Does the actual squashing. */
+    void doSquash(unsigned tid);
+
+    /////////////////////////
+    // Various pointers
+    /////////////////////////
+
     /** Pointer to the CPU. */
     FullCPU *cpu;
 
+    /** Cache interface. */
+    MemInterface *dcacheInterface;
+
+    /** Pointer to IEW stage. */
+    IEW *iewStage;
+
     /** The memory dependence unit, which tracks/predicts memory dependences
      *  between instructions.
      */
-    MemDepUnit memDepUnit;
+    MemDepUnit memDepUnit[Impl::MaxThreads];
 
     /** The queue to the execute stage.  Issued instructions will be written
      *  into it.
@@ -151,36 +253,45 @@ class InstructionQueue
     /** Wire to read information from timebuffer. */
     typename TimeBuffer<TimeStruct>::wire fromCommit;
 
-    enum InstList {
-        Int,
-        Float,
-        Branch,
-        Memory,
-        Misc,
-        Squashed,
-        None
-    };
+    /** Function unit pool. */
+    FUPool *fuPool;
 
-    /** List of ready int instructions.  Used to keep track of the order in
-     *  which instructions should issue.
-     */
-    ReadyInstQueue readyIntInsts;
+    //////////////////////////////////////
+    // Instruction lists, ready queues, and ordering
+    //////////////////////////////////////
 
-    /** List of ready floating point instructions. */
-    ReadyInstQueue readyFloatInsts;
+    /** List of all the instructions in the IQ (some of which may be issued). */
+    std::list<DynInstPtr> instList[Impl::MaxThreads];
 
-    /** List of ready branch instructions. */
-    ReadyInstQueue readyBranchInsts;
+    /**
+     * Ordering functor for the ready queues.  A std::priority_queue keeps
+     * the element that compares greatest on top, so the comparison is
+     * reversed here: the instruction with the smallest sequence number
+     * (the oldest one) ends up at the top of the priority queue.
+     */
+    struct pqCompare {
+        bool operator() (const DynInstPtr &a, const DynInstPtr &b) const
+        {
+            return b->seqNum < a->seqNum;
+        }
+    };
 
-    /** List of ready miscellaneous instructions. */
-    ReadyInstQueue readyMiscInsts;
+    /**
+     * Struct for an IQ entry. It pairs an instruction with an iterator
+     * to the instruction's spot in the IQ.
+     */
+    struct IQEntry {
+        DynInstPtr inst;  // the instruction held by this entry
+        ListIt iqIt;      // iterator to the instruction's spot in the IQ
+    };
 
-    /** List of squashed instructions (which are still valid and in IQ).
-     *  Implemented using a priority queue; the entries must contain both
-     *  the IQ index and sequence number of each instruction so that
-     *  ordering based on sequence numbers can be used.
+    typedef std::priority_queue<DynInstPtr, std::vector<DynInstPtr>, pqCompare>
+    ReadyInstQueue;
+
+    /** List of ready instructions, per op class.  They are separated by op
+     *  class to allow for easy mapping to FUs.
      */
-    ReadyInstQueue squashedInsts;
+    ReadyInstQueue readyInsts[Num_OpClasses];
 
     /** List of non-speculative instructions that will be scheduled
      *  once the IQ gets a signal from commit.  While it's redundant to
@@ -188,37 +299,79 @@ class InstructionQueue
      *  inside of DynInst), when these instructions are woken up only
      *  the sequence number will be available.  Thus it is most efficient to be
      *  able to search by the sequence number alone.
+     *  @todo: Maybe change this to a priority queue per thread.
      */
     std::map<InstSeqNum, DynInstPtr> nonSpecInsts;
 
-    typedef typename std::map<InstSeqNum, DynInstPtr>::iterator non_spec_it_t;
+    typedef typename std::map<InstSeqNum, DynInstPtr>::iterator NonSpecMapIt;
 
-    /** Number of free IQ entries left. */
-    unsigned freeEntries;
+    /** Entry of the age order list: one ready queue and its oldest inst. */
+    struct ListOrderEntry {
+        OpClass queueType;      // op class of the ready queue it stands for
+        InstSeqNum oldestInst;  // seqNum at the top of that queue when added
+    };
 
-    /** The number of entries in the instruction queue. */
-    unsigned numEntries;
+    /** List that contains the age order of the oldest instruction of each
+     *  ready queue.  Used to select the oldest instruction available
+     *  among op classes.
+     */
+    std::list<ListOrderEntry> listOrder;
+
+    typedef typename std::list<ListOrderEntry>::iterator ListOrderIt;
+
+    /** Tracks if each ready queue is on the age order list. */
+    bool queueOnList[Num_OpClasses];
 
-    /** The number of integer instructions that can be issued in one
-     *  cycle.
+    /** Iterators of each ready queue.  Points to their spot in the age order
+     *  list.
      */
-    unsigned intWidth;
+    ListOrderIt readyIt[Num_OpClasses];
 
-    /** The number of floating point instructions that can be issued
-     *  in one cycle.
+    /** Add an op class to the age order list. */
+    void addToOrderList(OpClass op_class);
+
+    /**
+     * Called when the oldest instruction has been removed from a ready queue;
+     * this places that ready queue into the proper spot in the age order list.
      */
-    unsigned floatWidth;
+    void moveToYoungerInst(ListOrderIt age_order_it);
+
+    //////////////////////////////////////
+    // Various parameters
+    //////////////////////////////////////
+
+    /** IQ resource sharing policy, selected through params->smtIQPolicy. */
+    enum IQPolicy {
+        Dynamic,      // each thread's limit is the whole IQ
+        Partitioned,  // the IQ entries are split evenly among the threads
+        Threshold     // each thread is limited to smtIQThreshold% of the IQ
+    };
+
+    /** IQ sharing policy for SMT. */
+    IQPolicy iqPolicy;
+
+    /** Number of Total Threads*/
+    unsigned numThreads;
+
+    /** Pointer to list of active threads. */
+    std::list<unsigned> *activeThreads;
+
+    /** Per Thread IQ count */
+    unsigned count[Impl::MaxThreads];
 
-    /** The number of branches that can be issued in one cycle. */
-    unsigned branchWidth;
+    /** Max IQ Entries Per Thread */
+    unsigned maxEntries[Impl::MaxThreads];
 
-    /** The number of memory instructions that can be issued in one cycle. */
-    unsigned memoryWidth;
+    /** Number of free IQ entries left. */
+    unsigned freeEntries;
+
+    /** The number of entries in the instruction queue. */
+    unsigned numEntries;
 
     /** The total number of instructions that can be issued in one cycle. */
     unsigned totalWidth;
 
-    //The number of physical registers in the CPU.
+    /** The number of physical registers in the CPU. */
     unsigned numPhysRegs;
 
     /** The number of physical integer registers in the CPU. */
@@ -237,15 +390,12 @@ class InstructionQueue
     //////////////////////////////////
 
     /** The sequence number of the squashed instruction. */
-    InstSeqNum squashedSeqNum;
-
-    /** Iterator that points to the youngest instruction in the IQ. */
-    ListIt tail;
+    InstSeqNum squashedSeqNum[Impl::MaxThreads];
 
     /** Iterator that points to the last instruction that has been squashed.
      *  This will not be valid unless the IQ is in the process of squashing.
      */
-    ListIt squashIt;
+    ListIt squashIt[Impl::MaxThreads];
 
     ///////////////////////////////////
     // Dependency graph stuff
@@ -254,6 +404,10 @@ class InstructionQueue
     class DependencyEntry
     {
       public:
+        DependencyEntry()
+            : inst(NULL), next(NULL)
+        { }  // a fresh entry holds no instruction and links to nothing
+
         DynInstPtr inst;
         //Might want to include data about what arch. register the
         //dependence is waiting on.
@@ -288,15 +442,17 @@ class InstructionQueue
      *  is basically a secondary scoreboard, and should pretty much mirror
      *  the scoreboard that exists in the rename map.
      */
-    vector<bool> regScoreboard;
+    std::vector<bool> regScoreboard;
 
+    /** Adds an instruction to the dependency graph, as a consumer. */
     bool addToDependents(DynInstPtr &new_inst);
-    void insertDependency(DynInstPtr &new_inst);
+
+    /** Adds an instruction to the dependency graph, as a producer. */
     void createDependency(DynInstPtr &new_inst);
 
+    /** Moves an instruction to the ready queue if it is ready. */
     void addIfReady(DynInstPtr &inst);
 
-  private:
     /** Debugging function to count how many entries are in the IQ.  It does
      *  a linear walk through the instructions, so do not call this function
      *  during normal execution.
@@ -313,24 +469,42 @@ class InstructionQueue
      */
     void dumpLists();
 
+    /** Debugging function to dump out all instructions that are in the
+     *  IQ.
+     */
+    void dumpInsts();
+
+    /** Stat for number of instructions added. */
     Stats::Scalar<> iqInstsAdded;
+    /** Stat for number of non-speculative instructions added. */
     Stats::Scalar<> iqNonSpecInstsAdded;
 //    Stats::Scalar<> iqIntInstsAdded;
+    /** Stat for number of integer instructions issued. */
     Stats::Scalar<> iqIntInstsIssued;
 //    Stats::Scalar<> iqFloatInstsAdded;
+    /** Stat for number of floating point instructions issued. */
     Stats::Scalar<> iqFloatInstsIssued;
 //    Stats::Scalar<> iqBranchInstsAdded;
+    /** Stat for number of branch instructions issued. */
     Stats::Scalar<> iqBranchInstsIssued;
 //    Stats::Scalar<> iqMemInstsAdded;
+    /** Stat for number of memory instructions issued. */
     Stats::Scalar<> iqMemInstsIssued;
 //    Stats::Scalar<> iqMiscInstsAdded;
+    /** Stat for number of miscellaneous instructions issued. */
     Stats::Scalar<> iqMiscInstsIssued;
+    /** Stat for number of squashed instructions that were ready to issue. */
     Stats::Scalar<> iqSquashedInstsIssued;
-    Stats::Scalar<> iqLoopSquashStalls;
+    /** Stat for number of squashed instructions examined when squashing. */
     Stats::Scalar<> iqSquashedInstsExamined;
+    /** Stat for number of squashed instruction operands examined when
+     * squashing.
+     */
     Stats::Scalar<> iqSquashedOperandsExamined;
+    /** Stat for number of non-speculative instructions removed due to a squash.
+     */
     Stats::Scalar<> iqSquashedNonSpecRemoved;
 
 };
 
-#endif //__CPU_O3_CPU_INST_QUEUE_HH__
+#endif //__CPU_O3_INST_QUEUE_HH__
diff --git a/cpu/o3/inst_queue_impl.hh b/cpu/o3/inst_queue_impl.hh
index 048dc7c00..cfdd25cd5 100644
--- a/cpu/o3/inst_queue_impl.hh
+++ b/cpu/o3/inst_queue_impl.hh
@@ -39,32 +39,63 @@
 
 #include "sim/root.hh"
 
+#include "cpu/o3/fu_pool.hh"
 #include "cpu/o3/inst_queue.hh"
 
-// Either compile error or max int due to sign extension.
-// Hack to avoid compile warnings.
-const InstSeqNum MaxInstSeqNum = std::numeric_limits<InstSeqNum>::max();
+using namespace std;
 
 template <class Impl>
-InstructionQueue<Impl>::InstructionQueue(Params &params)
-    : memDepUnit(params),
-      numEntries(params.numIQEntries),
-      intWidth(params.executeIntWidth),
-      floatWidth(params.executeFloatWidth),
-      branchWidth(params.executeBranchWidth),
-      memoryWidth(params.executeMemoryWidth),
-      totalWidth(params.issueWidth),
-      numPhysIntRegs(params.numPhysIntRegs),
-      numPhysFloatRegs(params.numPhysFloatRegs),
-      commitToIEWDelay(params.commitToIEWDelay)
+InstructionQueue<Impl>::FUCompletion::FUCompletion(DynInstPtr &_inst,
+                                                   int fu_idx,
+                                                   InstructionQueue<Impl> *iq_ptr)
+    : Event(&mainEventQueue, Stat_Event_Pri),
+      inst(_inst), fuIdx(fu_idx), iqPtr(iq_ptr)
 {
+    this->setFlags(Event::AutoDelete);
+}
+
+template <class Impl>
+void
+InstructionQueue<Impl>::FUCompletion::process()
+{
+    iqPtr->processFUCompletion(inst, fuIdx);  // hand the inst back to the IQ
+    inst = NULL;  // clear the event's instruction pointer afterwards
+}
+
+
+template <class Impl>
+const char *
+InstructionQueue<Impl>::FUCompletion::description()
+{
+    return "Functional unit completion event";  // label for this event type
+}
+
+template <class Impl>
+InstructionQueue<Impl>::InstructionQueue(Params *params)
+    : dcacheInterface(params->dcacheInterface),
+      fuPool(params->fuPool),
+      numEntries(params->numIQEntries),
+      totalWidth(params->issueWidth),
+      numPhysIntRegs(params->numPhysIntRegs),
+      numPhysFloatRegs(params->numPhysFloatRegs),
+      commitToIEWDelay(params->commitToIEWDelay)
+{
+    assert(fuPool);
+
+    numThreads = params->numberOfThreads;
+
+    //Initialize thread IQ counts
+    for (int i = 0; i <numThreads; i++) {
+        count[i] = 0;
+    }
+
     // Initialize the number of free IQ entries.
     freeEntries = numEntries;
 
     // Set the number of physical registers as the number of int + float
     numPhysRegs = numPhysIntRegs + numPhysFloatRegs;
 
-    DPRINTF(IQ, "IQ: There are %i physical registers.\n", numPhysRegs);
+    DPRINTF(IQ, "There are %i physical registers.\n", numPhysRegs);
 
     //Create an entry for each physical register within the
     //dependency graph.
@@ -73,6 +104,12 @@ InstructionQueue<Impl>::InstructionQueue(Params &params)
     // Resize the register scoreboard.
     regScoreboard.resize(numPhysRegs);
 
+    //Initialize Mem Dependence Units
+    for (int i = 0; i < numThreads; i++) {
+        memDepUnit[i].init(params,i);
+        memDepUnit[i].setIQ(this);
+    }
+
     // Initialize all the head pointers to point to NULL, and all the
     // entries as unready.
     // Note that in actuality, the registers corresponding to the logical
@@ -80,13 +117,107 @@ InstructionQueue<Impl>::InstructionQueue(Params &params)
     // IQ as the instruction should have been correctly told if those
     // registers are ready in rename.  Thus it can all be initialized as
     // unready.
-    for (int i = 0; i < numPhysRegs; ++i)
-    {
+    for (int i = 0; i < numPhysRegs; ++i) {
         dependGraph[i].next = NULL;
         dependGraph[i].inst = NULL;
         regScoreboard[i] = false;
     }
 
+    for (int i = 0; i < numThreads; ++i) {
+        squashedSeqNum[i] = 0;
+    }
+
+    for (int i = 0; i < Num_OpClasses; ++i) {
+        queueOnList[i] = false;
+        readyIt[i] = listOrder.end();
+    }
+
+    string policy = params->smtIQPolicy;
+
+    //Convert string to lowercase
+    std::transform(policy.begin(), policy.end(), policy.begin(),
+                   (int(*)(int)) tolower);
+
+    //Figure out resource sharing policy
+    if (policy == "dynamic") {
+        iqPolicy = Dynamic;
+
+        //Set Max Entries to Total IQ Capacity
+        for (int i = 0; i < numThreads; i++) {
+            maxEntries[i] = numEntries;
+        }
+
+    } else if (policy == "partitioned") {
+        iqPolicy = Partitioned;
+
+        //@todo: make work if numEntries doesn't divide evenly by numThreads.
+        int part_amt = numEntries / numThreads;
+
+        //Divide IQ up evenly
+        for (int i = 0; i < numThreads; i++) {
+            maxEntries[i] = part_amt;
+        }
+
+        DPRINTF(Fetch, "IQ sharing policy set to Partitioned:"
+                "%i entries per thread.\n",part_amt);
+
+    } else if (policy == "threshold") {
+        iqPolicy = Threshold;
+
+        double threshold =  (double)params->smtIQThreshold / 100;
+
+        int thresholdIQ = (int)((double)threshold * numEntries);
+
+        //Divide up by threshold amount
+        for (int i = 0; i < numThreads; i++) {
+            maxEntries[i] = thresholdIQ;
+        }
+
+        DPRINTF(Fetch, "IQ sharing policy set to Threshold:"
+                "%i entries per thread.\n",thresholdIQ);
+   } else {
+       assert(0 && "Invalid IQ Sharing Policy.Options Are:{Dynamic,"
+              "Partitioned, Threshold}");
+   }
+}
+
+template <class Impl>
+InstructionQueue<Impl>::~InstructionQueue()
+{
+    // Tear down the dependency graph one physical register at a time.
+    for (int reg = 0; reg < numPhysRegs; ++reg) {
+        // Free every node chained behind this register's head entry.
+        DependencyEntry *node = dependGraph[reg].next;
+
+        while (node != NULL) {
+            DependencyEntry *doomed = node;
+
+            DependencyEntry::mem_alloc_counter--;
+            node = doomed->next;
+
+            // Clear the instruction pointer before the node is freed.
+            doomed->inst = NULL;
+            delete doomed;
+        }
+
+        // The head entry is an array element; only reset its fields.
+        if (dependGraph[reg].inst) {
+            dependGraph[reg].inst = NULL;
+        }
+
+        dependGraph[reg].next = NULL;
+    }
+
+    // Every node allocated for the graph must have been released above.
+    assert(DependencyEntry::mem_alloc_counter == 0);
+    delete [] dependGraph;
+}
+
+template <class Impl>
+std::string
+InstructionQueue<Impl>::name() const
+{
+    return cpu->name() + ".iq";  // the CPU's name plus an ".iq" suffix
 }
 
 template <class Impl>
@@ -143,12 +274,6 @@ InstructionQueue<Impl>::regStats()
         .desc("Number of squashed instructions issued")
         .prereq(iqSquashedInstsIssued);
 
-    iqLoopSquashStalls
-        .name(name() + ".iqLoopSquashStalls")
-        .desc("Number of times issue loop had to restart due to squashed "
-              "inst; mainly for profiling")
-        .prereq(iqLoopSquashStalls);
-
     iqSquashedInstsExamined
         .name(name() + ".iqSquashedInstsExamined")
         .desc("Number of squashed instructions iterated over during squash;"
@@ -166,25 +291,25 @@ InstructionQueue<Impl>::regStats()
         .desc("Number of squashed non-spec instructions that were removed")
         .prereq(iqSquashedNonSpecRemoved);
 
-    // Tell mem dependence unit to reg stats as well.
-    memDepUnit.regStats();
+    for ( int i=0; i < numThreads; i++) {
+        // Tell mem dependence unit to reg stats as well.
+        memDepUnit[i].regStats();
+    }
 }
 
 template <class Impl>
 void
-InstructionQueue<Impl>::setCPU(FullCPU *cpu_ptr)
+InstructionQueue<Impl>::setActiveThreads(list<unsigned> *at_ptr)
 {
-    cpu = cpu_ptr;
-
-    tail = cpu->instList.begin();
+    DPRINTF(IQ, "Setting active threads list pointer.\n");
+    activeThreads = at_ptr;
 }
 
 template <class Impl>
 void
-InstructionQueue<Impl>::setIssueToExecuteQueue(
-                        TimeBuffer<IssueStruct> *i2e_ptr)
+InstructionQueue<Impl>::setIssueToExecuteQueue(TimeBuffer<IssueStruct> *i2e_ptr)
 {
-    DPRINTF(IQ, "IQ: Set the issue to execute queue.\n");
+    DPRINTF(IQ, "Set the issue to execute queue.\n");
     issueToExecuteQueue = i2e_ptr;
 }
 
@@ -192,12 +317,44 @@ template <class Impl>
 void
 InstructionQueue<Impl>::setTimeBuffer(TimeBuffer<TimeStruct> *tb_ptr)
 {
-    DPRINTF(IQ, "IQ: Set the time buffer.\n");
+    DPRINTF(IQ, "Set the time buffer.\n");
     timeBuffer = tb_ptr;
 
     fromCommit = timeBuffer->getWire(-commitToIEWDelay);
 }
 
+template <class Impl>
+int
+InstructionQueue<Impl>::entryAmount(int num_threads)
+{
+    // Only the partitioned policy has a fixed per-thread share to report.
+    if (iqPolicy != Partitioned) {
+        return 0;
+    }
+    return numEntries / num_threads;
+}
+
+
+template <class Impl>
+void
+InstructionQueue<Impl>::resetEntries()
+{
+    // Recompute the per-thread entry limits from the active thread list.
+    if (iqPolicy != Dynamic || numThreads > 1) {
+        int active_threads = (*activeThreads).size();
+        list<unsigned>::iterator threads  = (*activeThreads).begin();
+        list<unsigned>::iterator list_end = (*activeThreads).end();
+        // Advance in the loop header so no policy can leave it spinning.
+        for (; threads != list_end; ++threads) {
+            if (iqPolicy == Partitioned) {
+                maxEntries[*threads] = numEntries / active_threads;
+            } else if (iqPolicy == Threshold && active_threads == 1) {
+                maxEntries[*threads] = numEntries;
+            }
+        }
+    }
+}
+
 template <class Impl>
 unsigned
 InstructionQueue<Impl>::numFreeEntries()
@@ -205,6 +362,13 @@ InstructionQueue<Impl>::numFreeEntries()
     return freeEntries;
 }
 
+template <class Impl>
+unsigned
+InstructionQueue<Impl>::numFreeEntries(unsigned tid)
+{
+    return maxEntries[tid] - count[tid];  // limit minus entries in use
+}
+
 // Might want to do something more complex if it knows how many instructions
 // will be issued this cycle.
 template <class Impl>
@@ -218,6 +382,34 @@ InstructionQueue<Impl>::isFull()
     }
 }
 
+template <class Impl>
+bool
+InstructionQueue<Impl>::isFull(unsigned tid)
+{
+    // A thread counts as full once numFreeEntries(tid) reports that none
+    // of its entries remain available.
+    const bool out_of_entries = (numFreeEntries(tid) == 0);
+
+    return out_of_entries;
+}
+
+template <class Impl>
+bool
+InstructionQueue<Impl>::hasReadyInsts()
+{
+    // Ready work exists if the age-order list has an entry or if any of
+    // the per-op-class ready queues is non-empty.
+    bool found_ready = !listOrder.empty();
+    int op_idx = 0;
+
+    while (!found_ready && op_idx < Num_OpClasses) {
+        found_ready = !readyInsts[op_idx].empty();
+        ++op_idx;
+    }
+
+    return found_ready;
+}
+
 template <class Impl>
 void
 InstructionQueue<Impl>::insert(DynInstPtr &new_inst)
@@ -225,7 +417,7 @@ InstructionQueue<Impl>::insert(DynInstPtr &new_inst)
     // Make sure the instruction is valid
     assert(new_inst);
 
-    DPRINTF(IQ, "IQ: Adding instruction PC %#x to the IQ.\n",
+    DPRINTF(IQ, "Adding instruction PC %#x to the IQ.\n",
             new_inst->readPC());
 
     // Check if there are any free entries.  Panic if there are none.
@@ -233,26 +425,14 @@ InstructionQueue<Impl>::insert(DynInstPtr &new_inst)
     // panicing.
     assert(freeEntries != 0);
 
-    // If the IQ currently has nothing in it, then there's a possibility
-    // that the tail iterator is invalid (might have been pointing at an
-    // instruction that was retired).  Reset the tail iterator.
-    if (freeEntries == numEntries) {
-        tail = cpu->instList.begin();
-    }
-
-    // Move the tail iterator.  Instructions may not have been issued
-    // to the IQ, so we may have to increment the iterator more than once.
-    while ((*tail) != new_inst) {
-        tail++;
-
-        // Make sure the tail iterator points at something legal.
-        assert(tail != cpu->instList.end());
-    }
-
+    instList[new_inst->threadNumber].push_back(new_inst);
 
     // Decrease the number of free entries.
     --freeEntries;
 
+    //Mark Instruction as in IQ
+    new_inst->setInIQ();
+
     // Look through its source registers (physical regs), and mark any
     // dependencies.
     addToDependents(new_inst);
@@ -264,9 +444,7 @@ InstructionQueue<Impl>::insert(DynInstPtr &new_inst)
     // If it's a memory instruction, add it to the memory dependency
     // unit.
     if (new_inst->isMemRef()) {
-        memDepUnit.insert(new_inst);
-        // Uh..forgot to look it up and put it on the proper dependency list
-        // if the instruction should not go yet.
+        memDepUnit[new_inst->threadNumber].insert(new_inst);
     } else {
         // If the instruction is ready then add it to the ready list.
         addIfReady(new_inst);
@@ -274,364 +452,327 @@ InstructionQueue<Impl>::insert(DynInstPtr &new_inst)
 
     ++iqInstsAdded;
 
+
+    //Update Thread IQ Count
+    count[new_inst->threadNumber]++;
+
     assert(freeEntries == (numEntries - countInsts()));
 }
 
 template <class Impl>
 void
-InstructionQueue<Impl>::insertNonSpec(DynInstPtr &inst)
+InstructionQueue<Impl>::insertNonSpec(DynInstPtr &new_inst)
 {
-    nonSpecInsts[inst->seqNum] = inst;
-
     // @todo: Clean up this code; can do it by setting inst as unable
     // to issue, then calling normal insert on the inst.
 
     // Make sure the instruction is valid
-    assert(inst);
+    assert(new_inst);
+
+    nonSpecInsts[new_inst->seqNum] = new_inst;
 
-    DPRINTF(IQ, "IQ: Adding instruction PC %#x to the IQ.\n",
-            inst->readPC());
+    DPRINTF(IQ, "Adding instruction PC %#x to the IQ.\n",
+            new_inst->readPC());
 
     // Check if there are any free entries.  Panic if there are none.
     // Might want to have this return a fault in the future instead of
     // panicing.
     assert(freeEntries != 0);
 
-    // If the IQ currently has nothing in it, then there's a possibility
-    // that the tail iterator is invalid (might have been pointing at an
-    // instruction that was retired).  Reset the tail iterator.
-    if (freeEntries == numEntries) {
-        tail = cpu->instList.begin();
-    }
-
-    // Move the tail iterator.  Instructions may not have been issued
-    // to the IQ, so we may have to increment the iterator more than once.
-    while ((*tail) != inst) {
-        tail++;
-
-        // Make sure the tail iterator points at something legal.
-        assert(tail != cpu->instList.end());
-    }
+    instList[new_inst->threadNumber].push_back(new_inst);
 
     // Decrease the number of free entries.
     --freeEntries;
 
+    //Mark Instruction as in IQ
+    new_inst->setInIQ();
+
     // Have this instruction set itself as the producer of its destination
     // register(s).
-    createDependency(inst);
+    createDependency(new_inst);
 
     // If it's a memory instruction, add it to the memory dependency
     // unit.
-    if (inst->isMemRef()) {
-        memDepUnit.insertNonSpec(inst);
+    if (new_inst->isMemRef()) {
+        memDepUnit[new_inst->threadNumber].insertNonSpec(new_inst);
     }
 
     ++iqNonSpecInstsAdded;
+
+    //Update Thread IQ Count
+    count[new_inst->threadNumber]++;
+
+    assert(freeEntries == (numEntries - countInsts()));
 }
 
-// Slightly hack function to advance the tail iterator in the case that
-// the IEW stage issues an instruction that is not added to the IQ.  This
-// is needed in case a long chain of such instructions occurs.
-// I don't think this is used anymore.
 template <class Impl>
 void
-InstructionQueue<Impl>::advanceTail(DynInstPtr &inst)
+InstructionQueue<Impl>::insertBarrier(DynInstPtr &barr_inst)
 {
-    // Make sure the instruction is valid
-    assert(inst);
+    memDepUnit[barr_inst->threadNumber].insertBarrier(barr_inst);
 
-    DPRINTF(IQ, "IQ: Adding instruction PC %#x to the IQ.\n",
-            inst->readPC());
-
-    // Check if there are any free entries.  Panic if there are none.
-    // Might want to have this return a fault in the future instead of
-    // panicing.
-    assert(freeEntries != 0);
-
-    // If the IQ currently has nothing in it, then there's a possibility
-    // that the tail iterator is invalid (might have been pointing at an
-    // instruction that was retired).  Reset the tail iterator.
-    if (freeEntries == numEntries) {
-        tail = cpu->instList.begin();
-    }
-
-    // Move the tail iterator.  Instructions may not have been issued
-    // to the IQ, so we may have to increment the iterator more than once.
-    while ((*tail) != inst) {
-        tail++;
-
-        // Make sure the tail iterator points at something legal.
-        assert(tail != cpu->instList.end());
-    }
-
-    assert(freeEntries <= numEntries);
+    insertNonSpec(barr_inst);
+}
 
+template <class Impl>
+void
+InstructionQueue<Impl>::advanceTail(DynInstPtr &inst)
+{
     // Have this instruction set itself as the producer of its destination
     // register(s).
     createDependency(inst);
 }
 
-// Need to make sure the number of float and integer instructions
-// issued does not exceed the total issue bandwidth.
-// @todo: Figure out a better way to remove the squashed items from the
-// lists.  Checking the top item of each list to see if it's squashed
-// wastes time and forces jumps.
 template <class Impl>
 void
-InstructionQueue<Impl>::scheduleReadyInsts()
+InstructionQueue<Impl>::addToOrderList(OpClass op_class)
 {
-    DPRINTF(IQ, "IQ: Attempting to schedule ready instructions from "
-                "the IQ.\n");
-
-    int int_issued = 0;
-    int float_issued = 0;
-    int branch_issued = 0;
-    int memory_issued = 0;
-    int squashed_issued = 0;
-    int total_issued = 0;
-
-    IssueStruct *i2e_info = issueToExecuteQueue->access(0);
-
-    bool insts_available = !readyBranchInsts.empty() ||
-        !readyIntInsts.empty() ||
-        !readyFloatInsts.empty() ||
-        !memDepUnit.empty() ||
-        !readyMiscInsts.empty() ||
-        !squashedInsts.empty();
-
-    // Note: Requires a globally defined constant.
-    InstSeqNum oldest_inst = MaxInstSeqNum;
-    InstList list_with_oldest = None;
-
-    // Temporary values.
-    DynInstPtr int_head_inst;
-    DynInstPtr float_head_inst;
-    DynInstPtr branch_head_inst;
-    DynInstPtr mem_head_inst;
-    DynInstPtr misc_head_inst;
-    DynInstPtr squashed_head_inst;
-
-    // Somewhat nasty code to look at all of the lists where issuable
-    // instructions are located, and choose the oldest instruction among
-    // those lists.  Consider a rewrite in the future.
-    while (insts_available && total_issued < totalWidth)
-    {
-        // Set this to false.  Each if-block is required to set it to true
-        // if there were instructions available this check.  This will cause
-        // this loop to run once more than necessary, but avoids extra calls.
-        insts_available = false;
+    assert(!readyInsts[op_class].empty());
 
-        oldest_inst = MaxInstSeqNum;
+    ListOrderEntry queue_entry;
 
-        list_with_oldest = None;
+    queue_entry.queueType = op_class;
 
-        if (!readyIntInsts.empty() &&
-            int_issued < intWidth) {
+    queue_entry.oldestInst = readyInsts[op_class].top()->seqNum;
 
-            insts_available = true;
+    ListOrderIt list_it = listOrder.begin();
+    ListOrderIt list_end_it = listOrder.end();
 
-            int_head_inst = readyIntInsts.top();
-
-            if (int_head_inst->isSquashed()) {
-                readyIntInsts.pop();
+    while (list_it != list_end_it) {
+        if ((*list_it).oldestInst > queue_entry.oldestInst) {
+            break;
+        }
 
-                ++iqLoopSquashStalls;
+        list_it++;
+    }
 
-                continue;
-            }
+    readyIt[op_class] = listOrder.insert(list_it, queue_entry);
+    queueOnList[op_class] = true;
+}
 
-            oldest_inst = int_head_inst->seqNum;
+template <class Impl>
+void
+InstructionQueue<Impl>::moveToYoungerInst(ListOrderIt list_order_it)
+{
+    // Get iterator of next item on the list
+    // Delete the original iterator
+    // Determine if the next item is either the end of the list or younger
+    // than the new instruction.  If so, then add in a new iterator right here.
+    // If not, then move along.
+    ListOrderEntry queue_entry;
+    OpClass op_class = (*list_order_it).queueType;
+    ListOrderIt next_it = list_order_it;
+
+    ++next_it;
+
+    queue_entry.queueType = op_class;
+    queue_entry.oldestInst = readyInsts[op_class].top()->seqNum;
+
+    while (next_it != listOrder.end() &&
+           (*next_it).oldestInst < queue_entry.oldestInst) {
+        ++next_it;
+    }
 
-            list_with_oldest = Int;
-        }
+    readyIt[op_class] = listOrder.insert(next_it, queue_entry);
+}
 
-        if (!readyFloatInsts.empty() &&
-            float_issued < floatWidth) {
+template <class Impl>
+void
+InstructionQueue<Impl>::processFUCompletion(DynInstPtr &inst, int fu_idx)
+{
+    // The CPU could have been sleeping until this op completed (*extremely*
+    // long latency op).  Wake it if it was.  This may be overkill.
+    iewStage->wakeCPU();
 
-            insts_available = true;
+    fuPool->freeUnit(fu_idx);
 
-            float_head_inst = readyFloatInsts.top();
+    int &size = issueToExecuteQueue->access(0)->size;
 
-            if (float_head_inst->isSquashed()) {
-                readyFloatInsts.pop();
+    issueToExecuteQueue->access(0)->insts[size++] = inst;
+}
 
-                ++iqLoopSquashStalls;
+// @todo: Figure out a better way to remove the squashed items from the
+// lists.  Checking the top item of each list to see if it's squashed
+// wastes time and forces jumps.
+template <class Impl>
+void
+InstructionQueue<Impl>::scheduleReadyInsts()
+{
+    DPRINTF(IQ, "Attempting to schedule ready instructions from "
+            "the IQ.\n");
 
-                continue;
-            } else if (float_head_inst->seqNum < oldest_inst) {
-                oldest_inst = float_head_inst->seqNum;
+    IssueStruct *i2e_info = issueToExecuteQueue->access(0);
 
-                list_with_oldest = Float;
+    // Will need to reorder the list if either a queue is not on the list,
+    // or it has an older instruction than last time.
+    for (int i = 0; i < Num_OpClasses; ++i) {
+        if (!readyInsts[i].empty()) {
+            if (!queueOnList[i]) {
+                addToOrderList(OpClass(i));
+            } else if (readyInsts[i].top()->seqNum  <
+                       (*readyIt[i]).oldestInst) {
+                listOrder.erase(readyIt[i]);
+                addToOrderList(OpClass(i));
             }
         }
+    }
 
-        if (!readyBranchInsts.empty() &&
-            branch_issued < branchWidth) {
+    // Have iterator to head of the list
+    // While I haven't exceeded bandwidth or reached the end of the list,
+    // Try to get a FU that can do what this op needs.
+    // If successful, change the oldestInst to the new top of the list, put
+    // the queue in the proper place in the list.
+    // Increment the iterator.
+    // This will avoid trying to schedule a certain op class if there are no
+    // FUs that handle it.
+    ListOrderIt order_it = listOrder.begin();
+    ListOrderIt order_end_it = listOrder.end();
+    int total_issued = 0;
+    int exec_queue_slot = i2e_info->size;
 
-            insts_available = true;
+    while (exec_queue_slot < totalWidth && order_it != order_end_it) {
+        OpClass op_class = (*order_it).queueType;
 
-            branch_head_inst = readyBranchInsts.top();
+        assert(!readyInsts[op_class].empty());
 
-            if (branch_head_inst->isSquashed()) {
-                readyBranchInsts.pop();
+        DynInstPtr issuing_inst = readyInsts[op_class].top();
 
-                ++iqLoopSquashStalls;
+        assert(issuing_inst->seqNum == (*order_it).oldestInst);
 
-                continue;
-            } else if (branch_head_inst->seqNum < oldest_inst) {
-                oldest_inst = branch_head_inst->seqNum;
+        if (issuing_inst->isSquashed()) {
+            readyInsts[op_class].pop();
 
-                list_with_oldest = Branch;
+            if (!readyInsts[op_class].empty()) {
+                moveToYoungerInst(order_it);
+            } else {
+                readyIt[op_class] = listOrder.end();
+                queueOnList[op_class] = false;
             }
 
-        }
+            listOrder.erase(order_it++);
 
-        if (!memDepUnit.empty() &&
-            memory_issued < memoryWidth) {
+            ++iqSquashedInstsIssued;
 
-            insts_available = true;
-
-            mem_head_inst = memDepUnit.top();
-
-            if (mem_head_inst->isSquashed()) {
-                memDepUnit.pop();
-
-                ++iqLoopSquashStalls;
-
-                continue;
-            } else if (mem_head_inst->seqNum < oldest_inst) {
-                oldest_inst = mem_head_inst->seqNum;
-
-                list_with_oldest = Memory;
-            }
+            continue;
         }
 
-        if (!readyMiscInsts.empty()) {
-
-            insts_available = true;
+        int idx = fuPool->getUnit(op_class);
 
-            misc_head_inst = readyMiscInsts.top();
+        if (idx == -2) {
+            assert(op_class == No_OpClass);
 
-            if (misc_head_inst->isSquashed()) {
-                readyMiscInsts.pop();
+            i2e_info->insts[exec_queue_slot++] = issuing_inst;
+            i2e_info->size++;
 
-                ++iqLoopSquashStalls;
+            DPRINTF(IQ, "Thread %i: Issuing instruction PC that needs no FU"
+                    " %#x [sn:%lli]\n",
+                    issuing_inst->threadNumber, issuing_inst->readPC(),
+                    issuing_inst->seqNum);
 
-                continue;
-            } else if (misc_head_inst->seqNum < oldest_inst) {
-                oldest_inst = misc_head_inst->seqNum;
+            readyInsts[op_class].pop();
 
-                list_with_oldest = Misc;
+            if (!readyInsts[op_class].empty()) {
+                moveToYoungerInst(order_it);
+            } else {
+                readyIt[op_class] = listOrder.end();
+                queueOnList[op_class] = false;
             }
-        }
-
-        if (!squashedInsts.empty()) {
 
-            insts_available = true;
-
-            squashed_head_inst = squashedInsts.top();
+            issuing_inst->setIssued();
+            ++total_issued;
 
-            if (squashed_head_inst->seqNum < oldest_inst) {
-                list_with_oldest = Squashed;
+            if (!issuing_inst->isMemRef()) {
+                // Memory instructions can not be freed from the IQ until they
+                // complete.
+                ++freeEntries;
+                count[issuing_inst->threadNumber]--;
+                issuing_inst->removeInIQ();
+            } else {
+                memDepUnit[issuing_inst->threadNumber].issue(issuing_inst);
             }
 
-        }
-
-        DynInstPtr issuing_inst = NULL;
-
-        switch (list_with_oldest) {
-          case None:
-            DPRINTF(IQ, "IQ: Not able to schedule any instructions. Issuing "
-                    "inst is %#x.\n", issuing_inst);
-            break;
-
-          case Int:
-            issuing_inst = int_head_inst;
-            readyIntInsts.pop();
-            ++int_issued;
-            DPRINTF(IQ, "IQ: Issuing integer instruction PC %#x.\n",
-                    issuing_inst->readPC());
-            break;
+            listOrder.erase(order_it++);
 
-          case Float:
-            issuing_inst = float_head_inst;
-            readyFloatInsts.pop();
-            ++float_issued;
-            DPRINTF(IQ, "IQ: Issuing float instruction PC %#x.\n",
-                    issuing_inst->readPC());
-            break;
+        } else if (idx != -1) {
+            int op_latency = fuPool->getOpLatency(op_class);
 
-          case Branch:
-            issuing_inst = branch_head_inst;
-            readyBranchInsts.pop();
-            ++branch_issued;
-            DPRINTF(IQ, "IQ: Issuing branch instruction PC %#x.\n",
-                    issuing_inst->readPC());
-            break;
+            if (op_latency == 1) {
+                i2e_info->insts[exec_queue_slot++] = issuing_inst;
+                i2e_info->size++;
 
-          case Memory:
-            issuing_inst = mem_head_inst;
+                // Add the FU onto the list of FU's to be freed next cycle.
+                fuPool->freeUnit(idx);
+            } else {
+                int issue_latency = fuPool->getIssueLatency(op_class);
 
-            memDepUnit.pop();
-            ++memory_issued;
-            DPRINTF(IQ, "IQ: Issuing memory instruction PC %#x.\n",
-                    issuing_inst->readPC());
-            break;
+                if (issue_latency > 1) {
+                    // Generate completion event for the FU
+                    FUCompletion *execution = new FUCompletion(issuing_inst,
+                                                               idx, this);
 
-          case Misc:
-            issuing_inst = misc_head_inst;
-            readyMiscInsts.pop();
+                    execution->schedule(curTick + issue_latency - 1);
+                } else {
+                    i2e_info->insts[exec_queue_slot++] = issuing_inst;
+                    i2e_info->size++;
 
-            ++iqMiscInstsIssued;
+                    // Add the FU onto the list of FU's to be freed next cycle.
+                    fuPool->freeUnit(idx);
+                }
+            }
 
-            DPRINTF(IQ, "IQ: Issuing a miscellaneous instruction PC %#x.\n",
-                    issuing_inst->readPC());
-            break;
+            DPRINTF(IQ, "Thread %i: Issuing instruction PC %#x "
+                    "[sn:%lli]\n",
+                    issuing_inst->threadNumber, issuing_inst->readPC(),
+                    issuing_inst->seqNum);
 
-          case Squashed:
-            assert(0 && "Squashed insts should not issue any more!");
-            squashedInsts.pop();
-            // Set the squashed instruction as able to commit so that commit
-            // can just drop it from the ROB.  This is a bit faked.
-            ++squashed_issued;
-            ++freeEntries;
+            readyInsts[op_class].pop();
 
-            DPRINTF(IQ, "IQ: Issuing squashed instruction PC %#x.\n",
-                    squashed_head_inst->readPC());
-            break;
-        }
-
-        if (list_with_oldest != None && list_with_oldest != Squashed) {
-            i2e_info->insts[total_issued] = issuing_inst;
-            i2e_info->size++;
+            if (!readyInsts[op_class].empty()) {
+                moveToYoungerInst(order_it);
+            } else {
+                readyIt[op_class] = listOrder.end();
+                queueOnList[op_class] = false;
+            }
 
             issuing_inst->setIssued();
-
-            ++freeEntries;
             ++total_issued;
-        }
 
-        assert(freeEntries == (numEntries - countInsts()));
+            if (!issuing_inst->isMemRef()) {
+                // Memory instructions can not be freed from the IQ until they
+                // complete.
+                ++freeEntries;
+                count[issuing_inst->threadNumber]--;
+                issuing_inst->removeInIQ();
+            } else {
+                memDepUnit[issuing_inst->threadNumber].issue(issuing_inst);
+            }
+
+            listOrder.erase(order_it++);
+        } else {
+            ++order_it;
+        }
     }
 
-    iqIntInstsIssued += int_issued;
-    iqFloatInstsIssued += float_issued;
-    iqBranchInstsIssued += branch_issued;
-    iqMemInstsIssued += memory_issued;
-    iqSquashedInstsIssued += squashed_issued;
+    if (total_issued) {
+        cpu->activityThisCycle();
+    } else {
+        DPRINTF(IQ, "Not able to schedule any instructions.\n");
+    }
 }
 
 template <class Impl>
 void
 InstructionQueue<Impl>::scheduleNonSpec(const InstSeqNum &inst)
 {
-    DPRINTF(IQ, "IQ: Marking nonspeculative instruction with sequence "
-            "number %i as ready to execute.\n", inst);
+    DPRINTF(IQ, "Marking nonspeculative instruction [sn:%lli] as ready "
+            "to execute.\n", inst);
 
-    non_spec_it_t inst_it = nonSpecInsts.find(inst);
+    NonSpecMapIt inst_it = nonSpecInsts.find(inst);
 
     assert(inst_it != nonSpecInsts.end());
 
+    unsigned tid = (*inst_it).second->threadNumber;
+
     // Mark this instruction as ready to issue.
     (*inst_it).second->setCanIssue();
 
@@ -639,27 +780,58 @@ InstructionQueue<Impl>::scheduleNonSpec(const InstSeqNum &inst)
     if (!(*inst_it).second->isMemRef()) {
         addIfReady((*inst_it).second);
     } else {
-        memDepUnit.nonSpecInstReady((*inst_it).second);
+        memDepUnit[tid].nonSpecInstReady((*inst_it).second);
     }
 
+    (*inst_it).second = NULL;
+
     nonSpecInsts.erase(inst_it);
 }
 
+template <class Impl>
+void
+InstructionQueue<Impl>::commit(const InstSeqNum &inst, unsigned tid)
+{
+    // @todo: Does this need to go through each thread?
+    DPRINTF(IQ, "[tid:%i]: Committing instructions older than [sn:%i]\n",
+            tid,inst);
+
+    ListIt iq_it = instList[tid].begin();
+
+    while (iq_it != instList[tid].end() &&
+           (*iq_it)->seqNum <= inst) {
+        ++iq_it;
+        instList[tid].pop_front();
+    }
+
+    assert(freeEntries == (numEntries - countInsts()));
+}
+
 template <class Impl>
 void
 InstructionQueue<Impl>::wakeDependents(DynInstPtr &completed_inst)
 {
-    DPRINTF(IQ, "IQ: Waking dependents of completed instruction.\n");
-    //Look at the physical destination register of the DynInst
-    //and look it up on the dependency graph.  Then mark as ready
-    //any instructions within the instruction queue.
+    DPRINTF(IQ, "Waking dependents of completed instruction.\n");
+
+    assert(!completed_inst->isSquashed());
+    // Look at the physical destination register of the DynInst
+    // and look it up on the dependency graph.  Then mark as ready
+    // any instructions within the instruction queue.
     DependencyEntry *curr;
+    DependencyEntry *prev;
 
     // Tell the memory dependence unit to wake any dependents on this
-    // instruction if it is a memory instruction.
-
+    // instruction if it is a memory instruction.  Also complete the memory
+    // instruction at this point since we know it executed fine.
+    // @todo: Might want to rename "completeMemInst" to
+    // something that indicates that it won't need to be replayed, and call
+    // this earlier.  Might not be a big deal.
     if (completed_inst->isMemRef()) {
-        memDepUnit.wakeDependents(completed_inst);
+        memDepUnit[completed_inst->threadNumber].wakeDependents(completed_inst);
+        completeMemInst(completed_inst);
+    } else if (completed_inst->isMemBarrier() ||
+               completed_inst->isWriteBarrier()) {
+        memDepUnit[completed_inst->threadNumber].completeBarrier(completed_inst);
     }
 
     for (int dest_reg_idx = 0;
@@ -676,17 +848,17 @@ InstructionQueue<Impl>::wakeDependents(DynInstPtr &completed_inst)
             continue;
         }
 
-        DPRINTF(IQ, "IQ: Waking any dependents on register %i.\n",
+        DPRINTF(IQ, "Waking any dependents on register %i.\n",
                 (int) dest_reg);
 
         //Maybe abstract this part into a function.
         //Go through the dependency chain, marking the registers as ready
         //within the waiting instructions.
-        while (dependGraph[dest_reg].next) {
 
-            curr = dependGraph[dest_reg].next;
+        curr = dependGraph[dest_reg].next;
 
-            DPRINTF(IQ, "IQ: Waking up a dependent instruction, PC%#x.\n",
+        while (curr) {
+            DPRINTF(IQ, "Waking up a dependent instruction, PC%#x.\n",
                     curr->inst->readPC());
 
             // Might want to give more information to the instruction
@@ -697,13 +869,13 @@ InstructionQueue<Impl>::wakeDependents(DynInstPtr &completed_inst)
 
             addIfReady(curr->inst);
 
-            dependGraph[dest_reg].next = curr->next;
-
             DependencyEntry::mem_alloc_counter--;
 
-            curr->inst = NULL;
+            prev = curr;
+            curr = prev->next;
+            prev->inst = NULL;
 
-            delete curr;
+            delete prev;
         }
 
         // Reset the head node now that all of its dependents have been woken
@@ -716,63 +888,116 @@ InstructionQueue<Impl>::wakeDependents(DynInstPtr &completed_inst)
     }
 }
 
+template <class Impl>
+void
+InstructionQueue<Impl>::addReadyMemInst(DynInstPtr &ready_inst)
+{
+    OpClass op_class = ready_inst->opClass();
+
+    readyInsts[op_class].push(ready_inst);
+
+    DPRINTF(IQ, "Instruction is ready to issue, putting it onto "
+            "the ready list, PC %#x opclass:%i [sn:%lli].\n",
+            ready_inst->readPC(), op_class, ready_inst->seqNum);
+}
+
+template <class Impl>
+void
+InstructionQueue<Impl>::rescheduleMemInst(DynInstPtr &resched_inst)
+{
+    memDepUnit[resched_inst->threadNumber].reschedule(resched_inst);
+}
+
+template <class Impl>
+void
+InstructionQueue<Impl>::replayMemInst(DynInstPtr &replay_inst)
+{
+    memDepUnit[replay_inst->threadNumber].replay(replay_inst);
+}
+
+template <class Impl>
+void
+InstructionQueue<Impl>::completeMemInst(DynInstPtr &completed_inst)
+{
+    int tid = completed_inst->threadNumber;
+
+    DPRINTF(IQ, "Completing mem instruction PC:%#x [sn:%lli]\n",
+            completed_inst->readPC(), completed_inst->seqNum);
+
+    ++freeEntries;
+
+    completed_inst->memOpDone = true;
+
+    memDepUnit[tid].completed(completed_inst);
+
+    count[tid]--;
+}
+
 template <class Impl>
 void
 InstructionQueue<Impl>::violation(DynInstPtr &store,
                                   DynInstPtr &faulting_load)
 {
-    memDepUnit.violation(store, faulting_load);
+    memDepUnit[store->threadNumber].violation(store, faulting_load);
 }
 
 template <class Impl>
 void
-InstructionQueue<Impl>::squash()
+InstructionQueue<Impl>::squash(unsigned tid)
 {
-    DPRINTF(IQ, "IQ: Starting to squash instructions in the IQ.\n");
+    DPRINTF(IQ, "[tid:%i]: Starting to squash instructions in "
+            "the IQ.\n", tid);
 
     // Read instruction sequence number of last instruction out of the
     // time buffer.
-    squashedSeqNum = fromCommit->commitInfo.doneSeqNum;
+    squashedSeqNum[tid] = fromCommit->commitInfo[tid].doneSeqNum;
 
     // Setup the squash iterator to point to the tail.
-    squashIt = tail;
+    squashIt[tid] = instList[tid].end();
+    --squashIt[tid];
 
     // Call doSquash if there are insts in the IQ
-    if (freeEntries != numEntries) {
-        doSquash();
+    if (count[tid] > 0) {
+        doSquash(tid);
     }
 
     // Also tell the memory dependence unit to squash.
-    memDepUnit.squash(squashedSeqNum);
+    memDepUnit[tid].squash(squashedSeqNum[tid], tid);
 }
 
 template <class Impl>
 void
-InstructionQueue<Impl>::doSquash()
+InstructionQueue<Impl>::doSquash(unsigned tid)
 {
-    // Make sure the squash iterator isn't pointing to nothing.
-    assert(squashIt != cpu->instList.end());
     // Make sure the squashed sequence number is valid.
-    assert(squashedSeqNum != 0);
+//    assert(squashedSeqNum[tid] != 0);
 
-    DPRINTF(IQ, "IQ: Squashing instructions in the IQ.\n");
+    DPRINTF(IQ, "[tid:%i]: Squashing until sequence number %i!\n",
+            tid, squashedSeqNum[tid]);
 
     // Squash any instructions younger than the squashed sequence number
     // given.
-    while ((*squashIt)->seqNum > squashedSeqNum) {
-        DynInstPtr squashed_inst = (*squashIt);
+    while (squashIt[tid] != instList[tid].end() &&
+           (*squashIt[tid])->seqNum > squashedSeqNum[tid]) {
+
+        DynInstPtr squashed_inst = (*squashIt[tid]);
 
         // Only handle the instruction if it actually is in the IQ and
         // hasn't already been squashed in the IQ.
-        if (!squashed_inst->isIssued() &&
-            !squashed_inst->isSquashedInIQ()) {
+        if (squashed_inst->threadNumber != tid ||
+            squashed_inst->isSquashedInIQ()) {
+            --squashIt[tid];
+            continue;
+        }
+
+        if (!squashed_inst->isIssued() ||
+            (squashed_inst->isMemRef() &&
+             !squashed_inst->memOpDone)) {
 
             // Remove the instruction from the dependency list.
-            // Hack for now: These below don't add themselves to the
-            // dependency list, so don't try to remove them.
-            if (!squashed_inst->isNonSpeculative()/* &&
-                                                     !squashed_inst->isStore()*/
-                ) {
+            if (!squashed_inst->isNonSpeculative() &&
+                !squashed_inst->isMemBarrier() &&
+                !squashed_inst->isWriteBarrier()) {
 
                 for (int src_reg_idx = 0;
                      src_reg_idx < squashed_inst->numSrcRegs();
@@ -787,19 +1012,29 @@ InstructionQueue<Impl>::doSquash()
                     // dependency chain aren't informed that a specific src
                     // register has become ready.  This may not always be true
                     // in the future.
+                    // Instead of doing a linked list traversal, we can just
+                    // remove these squashed instructions either at issue time,
+                    // or when the register is overwritten.  The only downside
+                    // to this is it leaves more room for error.
+
                     if (!squashed_inst->isReadySrcRegIdx(src_reg_idx) &&
                         src_reg < numPhysRegs) {
                         dependGraph[src_reg].remove(squashed_inst);
                     }
 
+
                     ++iqSquashedOperandsExamined;
                 }
 
                 // Might want to remove producers as well.
             } else {
-                nonSpecInsts[squashed_inst->seqNum] = NULL;
+                NonSpecMapIt ns_inst_it =
+                    nonSpecInsts.find(squashed_inst->seqNum);
+                assert(ns_inst_it != nonSpecInsts.end());
+
+                (*ns_inst_it).second = NULL;
 
-                nonSpecInsts.erase(squashed_inst->seqNum);
+                nonSpecInsts.erase(ns_inst_it);
 
                 ++iqSquashedNonSpecRemoved;
             }
@@ -809,37 +1044,30 @@ InstructionQueue<Impl>::doSquash()
             // Mark it as squashed within the IQ.
             squashed_inst->setSquashedInIQ();
 
-//            squashedInsts.push(squashed_inst);
+            // @todo: Remove this hack where several statuses are set so the
+            // inst will flow through the rest of the pipeline.
             squashed_inst->setIssued();
             squashed_inst->setCanCommit();
+            squashed_inst->removeInIQ();
+
+            //Update Thread IQ Count
+            count[squashed_inst->threadNumber]--;
 
             ++freeEntries;
 
-            DPRINTF(IQ, "IQ: Instruction PC %#x squashed.\n",
-                    squashed_inst->readPC());
+            if (numThreads > 1) {
+                DPRINTF(IQ, "[tid:%i]: Instruction [sn:%lli] PC %#x "
+                        "squashed.\n",
+                        tid, squashed_inst->seqNum, squashed_inst->readPC());
+            } else {
+                DPRINTF(IQ, "Instruction [sn:%lli] PC %#x squashed.\n",
+                        squashed_inst->seqNum, squashed_inst->readPC());
+            }
         }
 
-        --squashIt;
+        instList[tid].erase(squashIt[tid]--);
         ++iqSquashedInstsExamined;
     }
-
-    assert(freeEntries <= numEntries);
-
-    if (freeEntries == numEntries) {
-        tail = cpu->instList.end();
-    }
-
-}
-
-template <class Impl>
-void
-InstructionQueue<Impl>::stopSquash()
-{
-    // Clear up the squash variables to ensure that squashing doesn't
-    // get called improperly.
-    squashedSeqNum = 0;
-
-    squashIt = cpu->instList.end();
 }
 
 template <class Impl>
@@ -877,8 +1105,7 @@ InstructionQueue<Impl>::DependencyEntry::remove(DynInstPtr &inst_to_remove)
     }
 
     // Find the instruction to remove within the dependency linked list.
-    while(curr->inst != inst_to_remove)
-    {
+    while (curr->inst != inst_to_remove) {
         prev = curr;
         curr = curr->next;
 
@@ -920,7 +1147,7 @@ InstructionQueue<Impl>::addToDependents(DynInstPtr &new_inst)
             if (src_reg >= numPhysRegs) {
                 continue;
             } else if (regScoreboard[src_reg] == false) {
-                DPRINTF(IQ, "IQ: Instruction PC %#x has src reg %i that "
+                DPRINTF(IQ, "Instruction PC %#x has src reg %i that "
                         "is being added to the dependency chain.\n",
                         new_inst->readPC(), src_reg);
 
@@ -930,7 +1157,7 @@ InstructionQueue<Impl>::addToDependents(DynInstPtr &new_inst)
                 // was added to the dependency graph.
                 return_val = true;
             } else {
-                DPRINTF(IQ, "IQ: Instruction PC %#x has src reg %i that "
+                DPRINTF(IQ, "Instruction PC %#x has src reg %i that "
                         "became ready before it reached the IQ.\n",
                         new_inst->readPC(), src_reg);
                 // Mark a register ready within the instruction.
@@ -966,13 +1193,13 @@ InstructionQueue<Impl>::createDependency(DynInstPtr &new_inst)
             continue;
         }
 
-        dependGraph[dest_reg].inst = new_inst;
-
         if (dependGraph[dest_reg].next) {
             dumpDependGraph();
-            panic("IQ: Dependency graph not empty!");
+            panic("Dependency graph %i not empty!", dest_reg);
         }
 
+        dependGraph[dest_reg].inst = new_inst;
+
         // Mark the scoreboard to say it's not yet ready.
         regScoreboard[dest_reg] = false;
     }
@@ -987,96 +1214,62 @@ InstructionQueue<Impl>::addIfReady(DynInstPtr &inst)
     if (inst->readyToIssue()) {
 
         //Add the instruction to the proper ready list.
-        if (inst->isControl()) {
-
-            DPRINTF(IQ, "IQ: Branch instruction is ready to issue, "
-                    "putting it onto the ready list, PC %#x.\n",
-                    inst->readPC());
-            readyBranchInsts.push(inst);
-
-        } else if (inst->isMemRef()) {
+        if (inst->isMemRef()) {
 
-            DPRINTF(IQ, "IQ: Checking if memory instruction can issue.\n");
+            DPRINTF(IQ, "Checking if memory instruction can issue.\n");
 
             // Message to the mem dependence unit that this instruction has
             // its registers ready.
 
-            memDepUnit.regsReady(inst);
-
-#if 0
-            if (memDepUnit.readyToIssue(inst)) {
-                DPRINTF(IQ, "IQ: Memory instruction is ready to issue, "
-                        "putting it onto the ready list, PC %#x.\n",
-                        inst->readPC());
-                readyMemInsts.push(inst);
-            } else {
-                // Make dependent on the store.
-                // Will need some way to get the store instruction it should
-                // be dependent upon; then when the store issues it can
-                // put the instruction on the ready list.
-                // Yet another tree?
-                assert(0 && "Instruction has no way to actually issue");
-            }
-#endif
+            memDepUnit[inst->threadNumber].regsReady(inst);
 
-        } else if (inst->isInteger()) {
-
-            DPRINTF(IQ, "IQ: Integer instruction is ready to issue, "
-                    "putting it onto the ready list, PC %#x.\n",
-                    inst->readPC());
-            readyIntInsts.push(inst);
-
-        } else if (inst->isFloating()) {
+            return;
+        }
 
-            DPRINTF(IQ, "IQ: Floating instruction is ready to issue, "
-                    "putting it onto the ready list, PC %#x.\n",
-                    inst->readPC());
-            readyFloatInsts.push(inst);
+        OpClass op_class = inst->opClass();
 
-        } else {
-            DPRINTF(IQ, "IQ: Miscellaneous instruction is ready to issue, "
-                    "putting it onto the ready list, PC %#x..\n",
-                    inst->readPC());
+        DPRINTF(IQ, "Instruction is ready to issue, putting it onto "
+                "the ready list, PC %#x opclass:%i [sn:%lli].\n",
+                inst->readPC(), op_class, inst->seqNum);
 
-            readyMiscInsts.push(inst);
-        }
+        readyInsts[op_class].push(inst);
     }
 }
 
-/*
- * Caution, this function must not be called prior to tail being updated at
- * least once, otherwise it will fail the assertion.  This is because
- * instList.begin() actually changes upon the insertion of an element into the
- * list when the list is empty.
- */
 template <class Impl>
 int
 InstructionQueue<Impl>::countInsts()
 {
-    ListIt count_it = cpu->instList.begin();
+    // ksewell: The brute-force walk below works but could use a cleaner,
+    // more intuitive rewrite; it is currently compiled out (#if 0) and the
+    // count is derived from numEntries and freeEntries instead.
+
+#if 0
     int total_insts = 0;
 
-    if (tail == cpu->instList.end())
-        return 0;
+    for (int i = 0; i < numThreads; ++i) {
+        ListIt count_it = instList[i].begin();
+
+        while (count_it != instList[i].end()) {
+            if (!(*count_it)->isSquashed() && !(*count_it)->isSquashedInIQ()) {
+                if (!(*count_it)->isIssued()) {
+                    ++total_insts;
+                } else if ((*count_it)->isMemRef() &&
+                           !(*count_it)->memOpDone) {
+                    // Memory references whose memOpDone flag is not yet set
+                    // still count towards the total instructions.
+                    ++total_insts;
+                }
+            }
 
-    while (count_it != tail) {
-        if (!(*count_it)->isIssued()) {
-            ++total_insts;
+            ++count_it;
         }
-
-        ++count_it;
-
-        assert(count_it != cpu->instList.end());
-    }
-
-    // Need to count the tail iterator as well.
-    if (count_it != cpu->instList.end() &&
-        (*count_it) &&
-        !(*count_it)->isIssued()) {
-        ++total_insts;
     }
 
     return total_insts;
+#else
+    return numEntries - freeEntries;
+#endif
 }
 
 template <class Impl>
@@ -1090,8 +1283,8 @@ InstructionQueue<Impl>::dumpDependGraph()
         curr = &dependGraph[i];
 
         if (curr->inst) {
-            cprintf("dependGraph[%i]: producer: %#x consumer: ", i,
-                    curr->inst->readPC());
+            cprintf("dependGraph[%i]: producer: %#x [sn:%lli] consumer: ",
+                    i, curr->inst->readPC(), curr->inst->seqNum);
         } else {
             cprintf("dependGraph[%i]: No producer. consumer: ", i);
         }
@@ -1099,7 +1292,8 @@ InstructionQueue<Impl>::dumpDependGraph()
         while (curr->next != NULL) {
             curr = curr->next;
 
-            cprintf("%#x ", curr->inst->readPC());
+            cprintf("%#x [sn:%lli] ",
+                    curr->inst->readPC(), curr->inst->seqNum);
         }
 
         cprintf("\n");
@@ -1110,27 +1304,87 @@ template <class Impl>
 void
 InstructionQueue<Impl>::dumpLists()
 {
-    cprintf("Ready integer list size: %i\n", readyIntInsts.size());
-
-    cprintf("Ready float list size: %i\n", readyFloatInsts.size());
-
-    cprintf("Ready branch list size: %i\n", readyBranchInsts.size());
+    for (int i = 0; i < Num_OpClasses; ++i) {
+        cprintf("Ready list %i size: %i\n", i, readyInsts[i].size());
 
-    cprintf("Ready misc list size: %i\n", readyMiscInsts.size());
-
-    cprintf("Squashed list size: %i\n", squashedInsts.size());
+        cprintf("\n");
+    }
 
     cprintf("Non speculative list size: %i\n", nonSpecInsts.size());
 
-    non_spec_it_t non_spec_it = nonSpecInsts.begin();
+    NonSpecMapIt non_spec_it = nonSpecInsts.begin();
+    NonSpecMapIt non_spec_end_it = nonSpecInsts.end();
 
     cprintf("Non speculative list: ");
 
-    while (non_spec_it != nonSpecInsts.end()) {
-        cprintf("%#x ", (*non_spec_it).second->readPC());
+    while (non_spec_it != non_spec_end_it) {
+        cprintf("%#x [sn:%lli]", (*non_spec_it).second->readPC(),
+                (*non_spec_it).second->seqNum);
         ++non_spec_it;
     }
 
     cprintf("\n");
 
+    ListOrderIt list_order_it = listOrder.begin();
+    ListOrderIt list_order_end_it = listOrder.end();
+    int i = 1;
+
+    cprintf("List order: ");
+
+    while (list_order_it != list_order_end_it) {
+        cprintf("%i OpClass:%i [sn:%lli] ", i, (*list_order_it).queueType,
+                (*list_order_it).oldestInst);
+
+        ++list_order_it;
+        ++i;
+    }
+
+    cprintf("\n");
+}
+
+
+template <class Impl>
+void
+InstructionQueue<Impl>::dumpInsts()
+{
+    for (int i = 0; i < numThreads; ++i) {
+        int num = 0;
+        int valid_num = 0;
+        ListIt inst_list_it = instList[i].begin();
+
+        while (inst_list_it != instList[i].end())
+        {
+            cprintf("Instruction:%i\n",
+                    num);
+            if (!(*inst_list_it)->isSquashed()) {
+                if (!(*inst_list_it)->isIssued()) {
+                    ++valid_num;
+                    cprintf("Count:%i\n", valid_num);
+                } else if ((*inst_list_it)->isMemRef() &&
+                           !(*inst_list_it)->memOpDone) {
+                    // Memory references whose memOpDone flag is not yet set
+                    // still count towards the total instructions.
+                    ++valid_num;
+                    cprintf("Count:%i\n", valid_num);
+                }
+            }
+
+            cprintf("PC:%#x\n[sn:%lli]\n[tid:%i]\n"
+                    "Issued:%i\nSquashed:%i\n",
+                    (*inst_list_it)->readPC(),
+                    (*inst_list_it)->seqNum,
+                    (*inst_list_it)->threadNumber,
+                    (*inst_list_it)->isIssued(),
+                    (*inst_list_it)->isSquashed());
+
+            if ((*inst_list_it)->isMemRef()) {
+                cprintf("MemOpDone:%i\n", (*inst_list_it)->memOpDone);
+            }
+
+            cprintf("\n");
+
+            inst_list_it++;
+            ++num;
+        }
+    }
 }
diff --git a/cpu/o3/lsq.cc b/cpu/o3/lsq.cc
new file mode 100644
index 000000000..8991ab8f8
--- /dev/null
+++ b/cpu/o3/lsq.cc
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "cpu/o3/alpha_dyn_inst.hh"
+#include "cpu/o3/alpha_cpu.hh"
+#include "cpu/o3/alpha_impl.hh"
+#include "cpu/o3/lsq_impl.hh"
+
+// Force the instantiation of LSQ for all the implementations we care about.
+template class LSQ<AlphaSimpleImpl>;
+
diff --git a/cpu/o3/lsq.hh b/cpu/o3/lsq.hh
new file mode 100644
index 000000000..c59b5f13b
--- /dev/null
+++ b/cpu/o3/lsq.hh
@@ -0,0 +1,307 @@
+/*
+ * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __CPU_O3_LSQ_HH__
+#define __CPU_O3_LSQ_HH__
+
+#include <map>
+#include <queue>
+
+#include "base/hashmap.hh"
+#include "config/full_system.hh"
+#include "cpu/inst_seq.hh"
+#include "cpu/o3/cpu_policy.hh"
+#include "cpu/o3/lsq_unit.hh"
+#include "mem/mem_interface.hh"
+//#include "mem/page_table.hh"
+#include "sim/sim_object.hh"
+
+template <class Impl>
+class LSQ {
+  public:
+    typedef typename Impl::Params Params;
+    typedef typename Impl::FullCPU FullCPU;
+    typedef typename Impl::DynInstPtr DynInstPtr;
+    typedef typename Impl::CPUPol::IEW IEW;
+    typedef typename Impl::CPUPol::LSQUnit LSQUnit;
+    /** SMT sharing policy for the LQ/SQ entries, set by the constructor. */
+    enum LSQPolicy {
+        Dynamic,
+        Partitioned,
+        Threshold
+    };
+
+    /** Constructs an LSQ with the given parameters. */
+    LSQ(Params *params);
+
+    /** Returns the name of the LSQ. */
+    std::string name() const;
+
+    /** Sets the pointer to the list of active threads. */
+    void setActiveThreads(std::list<unsigned> *at_ptr);
+    /** Sets the CPU pointer. */
+    void setCPU(FullCPU *cpu_ptr);
+    /** Sets the IEW stage pointer. */
+    void setIEW(IEW *iew_ptr);
+    /** Sets the page table pointer (declaration currently disabled). */
+//    void setPageTable(PageTable *pt_ptr);
+
+    /** Per-thread entry count for num_threads (0 unless Partitioned). */
+    int entryAmount(int num_threads);
+    void removeEntries(unsigned tid); // Clears a thread's LQ and SQ.
+    /** Reset the max entries for each thread. */
+    void resetEntries();
+    /** Resizes a thread's LQ and SQ to the same given size. */
+    void resizeEntries(unsigned size, unsigned tid);
+
+    /** Ticks the LSQ. */
+    void tick();
+    /** Ticks a specific LSQ Unit. */
+    void tick(unsigned tid);
+
+    /** Inserts a load into the LSQ. */
+    void insertLoad(DynInstPtr &load_inst);
+    /** Inserts a store into the LSQ. */
+    void insertStore(DynInstPtr &store_inst);
+
+    /** Executes a load. */
+    Fault executeLoad(DynInstPtr &inst);
+    /** Executes the load at load queue index lq_idx of thread tid. */
+    Fault executeLoad(int lq_idx, unsigned tid);
+    /** Executes a store. */
+    Fault executeStore(DynInstPtr &inst);
+
+    /**
+     * Commits loads up until the given sequence number for a specific thread.
+     */
+    void commitLoads(InstSeqNum &youngest_inst, unsigned tid);
+    /**
+     * Commits stores up until the given sequence number for a specific thread.
+     */
+    void commitStores(InstSeqNum &youngest_inst, unsigned tid);
+
+    /**
+     * Attempts to write back stores until all cache ports are used or the
+     * interface becomes blocked.
+     */
+    void writebackStores();
+    /** Same as above, but only for one thread. */
+    void writebackStores(unsigned tid);
+
+    /**
+     * Squash instructions from a thread until the specified sequence number.
+     */
+    void squash(const InstSeqNum &squashed_num, unsigned tid);
+
+    /** Returns whether or not there was a memory ordering violation. */
+    bool violation();
+    /**
+     * Returns whether or not there was a memory ordering violation for a
+     * specific thread.
+     */
+    bool violation(unsigned tid);
+
+    /** Returns if a load is blocked due to the memory system for a specific
+     *  thread.
+     */
+    bool loadBlocked(unsigned tid);
+    /** Forwards to isLoadBlockedHandled() of the thread's LSQ unit. */
+    bool isLoadBlockedHandled(unsigned tid)
+    { return thread[tid].isLoadBlockedHandled(); }
+    /** Forwards to setLoadBlockedHandled() of the thread's LSQ unit. */
+    void setLoadBlockedHandled(unsigned tid)
+    { thread[tid].setLoadBlockedHandled(); }
+
+    /** Gets the instruction that caused the memory ordering violation. */
+    DynInstPtr getMemDepViolator(unsigned tid);
+
+    /** Returns the head index of the load queue for a specific thread. */
+    int getLoadHead(unsigned tid);
+    /** Returns the sequence number of the head of the load queue. */
+    InstSeqNum getLoadHeadSeqNum(unsigned tid)
+    {
+        return thread[tid].getLoadHeadSeqNum();
+    }
+
+    /** Returns the head index of the store queue. */
+    int getStoreHead(unsigned tid);
+    /** Returns the sequence number of the head of the store queue. */
+    InstSeqNum getStoreHeadSeqNum(unsigned tid)
+    {
+        return thread[tid].getStoreHeadSeqNum();
+    }
+
+    /** Returns the number of instructions in all of the queues. */
+    int getCount();
+    /** Returns the number of instructions in the queues of one thread. */
+    int getCount(unsigned tid);
+
+    /** Returns the total number of loads in the load queue. */
+    int numLoads();
+    /** Returns the total number of loads for a single thread. */
+    int numLoads(unsigned tid);
+
+    /** Returns the total number of stores in the store queue. */
+    int numStores();
+    /** Returns the total number of stores for a single thread. */
+    int numStores(unsigned tid);
+
+    /** Returns the total number of loads that are ready. */
+    int numLoadsReady();
+    /** Returns the number of loads that are ready for a single thread. */
+    int numLoadsReady(unsigned tid);
+
+    /** Returns the number of free entries. */
+    unsigned numFreeEntries();
+    /** Returns the number of free entries for a specific thread. */
+    unsigned numFreeEntries(unsigned tid);
+
+    /** Returns if every active thread has a full LQ or a full SQ. */
+    bool isFull();
+    /**
+     * Returns if the LSQ is full for a specific thread (either LQ or SQ is
+     * full).
+     */
+    bool isFull(unsigned tid);
+
+    /** Returns if the LQs of all active threads are full. */
+    bool lqFull();
+    /** Returns if the LQ of a given thread is full. */
+    bool lqFull(unsigned tid);
+
+    /** Returns if the SQs of all active threads are full. */
+    bool sqFull();
+    /** Returns if the SQ of a given thread is full. */
+    bool sqFull(unsigned tid);
+
+    /**
+     * Returns if the LSQ is stalled due to a memory operation that must be
+     * replayed.
+     */
+    bool isStalled();
+    /**
+     * Returns if the LSQ of a specific thread is stalled due to a memory
+     * operation that must be replayed.
+     */
+    bool isStalled(unsigned tid);
+
+    /** Returns whether or not there are any stores to write back to memory. */
+    bool hasStoresToWB();
+    /** Returns whether or not a specific thread has any stores to write back
+     * to memory.
+     */
+    bool hasStoresToWB(unsigned tid);
+    /** Returns the number of stores a specific thread has to write back. */
+    int  numStoresToWB(unsigned tid);
+
+    /** Returns if the LSQ will write back to memory this cycle. */
+    bool willWB();
+    /** Returns if the LSQ of a specific thread will write back to memory this
+     * cycle.
+     */
+    bool willWB(unsigned tid);
+
+    /** Debugging function to print out all instructions. */
+    void dumpInsts();
+    /** Debugging function to print out instructions from a specific thread. */
+    void dumpInsts(unsigned tid);
+
+    /** Executes a read operation, using the load specified at the load index. */
+    template <class T>
+    Fault read(MemReqPtr &req, T &data, int load_idx);
+
+    /** Executes a store operation, using the store specified at the store
+     *   index.
+     */
+    template <class T>
+    Fault write(MemReqPtr &req, T &data, int store_idx);
+
+  private:
+    /** The LSQ policy for SMT mode. */
+    LSQPolicy lsqPolicy;
+
+    /** The LSQ units for individual threads. */
+    LSQUnit thread[Impl::MaxThreads];
+
+    /** The CPU pointer. */
+    FullCPU *cpu;
+
+    /** The IEW stage pointer. */
+    IEW *iewStage;
+
+    /** The pointer to the page table. */
+//    PageTable *pTable;
+
+    /** List of Active Threads in System. */
+    std::list<unsigned> *activeThreads;
+
+    /** Total Size of LQ Entries. */
+    unsigned LQEntries;
+    /** Total Size of SQ Entries. */
+    unsigned SQEntries;
+
+    /** Max LQ Size - Used to Enforce Sharing Policies. */
+    unsigned maxLQEntries;
+
+    /** Max SQ Size - Used to Enforce Sharing Policies. */
+    unsigned maxSQEntries;
+
+    /** Global Load Count. */
+    int loads;
+
+    /** Global Store Count */
+    int stores;
+
+    /** Global Store To WB Count */
+    int storesToWB;
+
+    /** Number of Threads. */
+    unsigned numThreads;
+};
+
+template <class Impl>
+template <class T>
+Fault
+LSQ<Impl>::read(MemReqPtr &req, T &data, int load_idx)
+{
+    // The request names its thread in thread_num; that thread's LSQ unit
+    // performs the read for the load at load_idx.
+    return thread[req->thread_num].read(req, data, load_idx);
+}
+
+template <class Impl>
+template <class T>
+Fault
+LSQ<Impl>::write(MemReqPtr &req, T &data, int store_idx)
+{
+    // The request names its thread in thread_num; that thread's LSQ unit
+    // performs the write for the store at store_idx.
+    return thread[req->thread_num].write(req, data, store_idx);
+}
+
+#endif // __CPU_O3_LSQ_HH__
diff --git a/cpu/o3/lsq_impl.hh b/cpu/o3/lsq_impl.hh
new file mode 100644
index 000000000..523517869
--- /dev/null
+++ b/cpu/o3/lsq_impl.hh
@@ -0,0 +1,645 @@
+/*
+ * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "cpu/o3/lsq.hh"
+
+using namespace std;
+
+template <class Impl>
+LSQ<Impl>::LSQ(Params *params)
+    : LQEntries(params->LQEntries), SQEntries(params->SQEntries),
+      loads(0), stores(0), storesToWB(0),
+      numThreads(params->numberOfThreads)
+{
+    DPRINTF(LSQ, "Creating LSQ object.\n");
+
+    // Builds the per-thread LSQ units. The SMT sharing policy named by
+    // params->smtLSQPolicy (matched case-insensitively) decides how many
+    // LQ/SQ entries each thread's unit is initialized with.
+    string policy = params->smtLSQPolicy;
+
+    //Convert string to lowercase
+    std::transform(policy.begin(), policy.end(), policy.begin(),
+                   (int(*)(int)) tolower);
+
+    //Figure out LSQ sharing policy
+    if (policy == "dynamic") {
+        lsqPolicy = Dynamic;
+
+        maxLQEntries = LQEntries;
+        maxSQEntries = SQEntries;
+
+        DPRINTF(LSQ, "LSQ sharing policy set to Dynamic\n");
+
+    } else if (policy == "partitioned") {
+        lsqPolicy = Partitioned;
+
+        //@todo:make work if part_amt doesnt divide evenly.
+        maxLQEntries = LQEntries / numThreads;
+        maxSQEntries = SQEntries / numThreads;
+
+        DPRINTF(LSQ, "LSQ sharing policy set to Partitioned: "
+                "%i entries per LQ | %i entries per SQ\n",
+                maxLQEntries,maxSQEntries);
+
+    } else if (policy == "threshold") {
+        lsqPolicy = Threshold;
+        // NOTE(review): asserts threshold > queue size; confirm intent.
+        assert(params->smtLSQThreshold > LQEntries);
+        assert(params->smtLSQThreshold > SQEntries);
+
+        //Divide up by threshold amount
+        //@todo: Should threads check the max and the total
+        //amount of the LSQ
+        maxLQEntries  = params->smtLSQThreshold;
+        maxSQEntries  = params->smtLSQThreshold;
+
+        DPRINTF(LSQ, "LSQ sharing policy set to Threshold: "
+                "%i entries per LQ | %i entries per SQ\n",
+                maxLQEntries,maxSQEntries);
+
+    } else {
+        assert(0 && "Invalid LSQ Sharing Policy.Options Are:{Dynamic,"
+                    "Partitioned, Threshold}");
+    }
+
+    //Initialize LSQs; each unit is given one entry more than its maximum.
+    for (int tid=0; tid < numThreads; tid++) {
+        thread[tid].init(params, maxLQEntries+1, maxSQEntries+1, tid);
+    }
+}
+
+
+template<class Impl>
+std::string
+LSQ<Impl>::name() const
+{
+    return iewStage->name() + ".lsq"; // iewStage is assigned by setIEW().
+}
+
+template<class Impl>
+void
+LSQ<Impl>::setActiveThreads(list<unsigned> *at_ptr)
+{
+    activeThreads = at_ptr;
+    assert(activeThreads != 0); // The per-thread loops dereference it.
+}
+
+template<class Impl>
+void
+LSQ<Impl>::setCPU(FullCPU *cpu_ptr)
+{
+    // Remember the CPU here and hand the same pointer to every LSQ unit.
+    cpu = cpu_ptr;
+
+    for (unsigned t = 0; t < numThreads; ++t)
+        thread[t].setCPU(cpu_ptr);
+}
+
+template<class Impl>
+void
+LSQ<Impl>::setIEW(IEW *iew_ptr)
+{
+    // Remember the IEW stage and hand the same pointer to every LSQ unit.
+    iewStage = iew_ptr;
+
+    for (unsigned t = 0; t < numThreads; ++t)
+        thread[t].setIEW(iew_ptr);
+}
+
+#if 0 // Disabled along with the commented-out declaration in lsq.hh.
+template<class Impl>
+void
+LSQ<Impl>::setPageTable(PageTable *pt_ptr)
+{
+    for (int tid=0; tid < numThreads; tid++) {
+        thread[tid].setPageTable(pt_ptr);
+    }
+}
+#endif
+
+template <class Impl>
+int
+LSQ<Impl>::entryAmount(int num_threads)
+{
+    // Only the Partitioned policy yields a per-thread share; the share is
+    // computed from the LQ size alone. Every other policy returns 0.
+    if (lsqPolicy != Partitioned)
+        return 0;
+    return LQEntries / num_threads;
+}
+
+template <class Impl>
+void
+LSQ<Impl>::resetEntries()
+{
+    // Re-derives the per-thread queue size from the current number of
+    // active threads and applies it to every active thread's LQ and SQ.
+    if (lsqPolicy != Dynamic || numThreads > 1) {
+        int active_threads = (*activeThreads).size();
+
+        // Nothing to resize, and the Partitioned split below would divide
+        // by zero, when no thread is active.
+        if (active_threads == 0)
+            return;
+
+        // Partitioned splits the LQ size evenly; every other policy lets a
+        // thread use the full LQ size. The same value sizes the SQ too.
+        int maxEntries = LQEntries;
+        if (lsqPolicy == Partitioned)
+            maxEntries = LQEntries / active_threads;
+
+        list<unsigned>::iterator threads = (*activeThreads).begin();
+        while (threads != (*activeThreads).end())
+            resizeEntries(maxEntries,*threads++);
+    }
+}
+
+template<class Impl>
+void
+LSQ<Impl>::removeEntries(unsigned tid)
+{
+    thread[tid].clearLQ(); // Clear both queues of this thread's LSQ unit:
+    thread[tid].clearSQ(); // the load queue first, then the store queue.
+}
+
+template<class Impl>
+void
+LSQ<Impl>::resizeEntries(unsigned size,unsigned tid)
+{
+    thread[tid].resizeLQ(size); // The same size is applied to both the
+    thread[tid].resizeSQ(size); // load queue and the store queue.
+}
+
+template<class Impl>
+void
+LSQ<Impl>::tick()
+{
+    // Tick the LSQ unit of every thread on the active list.
+    for (list<unsigned>::iterator next = activeThreads->begin();
+         next != activeThreads->end(); ) {
+        // Step the iterator before the unit runs.
+        const unsigned tid = *next++;
+        thread[tid].tick();
+    }
+}
+
+template<class Impl>
+void
+LSQ<Impl>::tick(unsigned tid)
+{
+    thread[tid].tick(); // Ticks only the LSQ unit of thread tid.
+}
+
+template<class Impl>
+void
+LSQ<Impl>::insertLoad(DynInstPtr &load_inst)
+{
+    // Route the load to the LSQ unit of the thread recorded in the
+    // instruction's threadNumber.
+    thread[load_inst->threadNumber].insertLoad(load_inst);
+}
+
+template<class Impl>
+void
+LSQ<Impl>::insertStore(DynInstPtr &store_inst)
+{
+    // Route the store to the LSQ unit of the thread recorded in the
+    // instruction's threadNumber.
+    thread[store_inst->threadNumber].insertStore(store_inst);
+}
+
+template<class Impl>
+Fault
+LSQ<Impl>::executeLoad(DynInstPtr &inst)
+{
+    // The LSQ unit of the instruction's own thread executes the load and
+    // its fault result is returned unchanged.
+    return thread[inst->threadNumber].executeLoad(inst);
+}
+
+template<class Impl>
+Fault
+LSQ<Impl>::executeLoad(int lq_idx, unsigned tid)
+{
+    return thread[tid].executeLoad(lq_idx); // Load picked by LQ index.
+}
+
+template<class Impl>
+Fault
+LSQ<Impl>::executeStore(DynInstPtr &inst)
+{
+    // The LSQ unit of the instruction's own thread executes the store and
+    // its fault result is returned unchanged.
+    return thread[inst->threadNumber].executeStore(inst);
+}
+
+template<class Impl>
+void
+LSQ<Impl>::commitLoads(InstSeqNum &youngest_inst,unsigned tid)
+{
+    thread[tid].commitLoads(youngest_inst); // Delegates to tid's LSQ unit.
+}
+
+template<class Impl>
+void
+LSQ<Impl>::commitStores(InstSeqNum &youngest_inst,unsigned tid)
+{
+    thread[tid].commitStores(youngest_inst); // Delegates to tid's LSQ unit.
+}
+
+template<class Impl>
+void
+LSQ<Impl>::writebackStores()
+{
+    // Let every active thread's LSQ unit write back stores, tracing first.
+    list<unsigned>::iterator it = activeThreads->begin();
+
+    while (it != activeThreads->end()) {
+        const unsigned tid = *it++;
+        const int pending = numStoresToWB(tid);
+        if (pending > 0) {
+            DPRINTF(Writeback,"[tid:%i] Writing back stores. %i stores available"
+                " for Writeback.\n", tid, pending);
+        }
+        thread[tid].writebackStores();
+    }
+}
+
+template<class Impl>
+int
+LSQ<Impl>::numStoresToWB(unsigned tid)
+{
+    return thread[tid].numStoresToWB(); // Stores tid has to write back.
+}
+
+template<class Impl>
+void
+LSQ<Impl>::squash(const InstSeqNum &squashed_num, unsigned tid)
+{
+        thread[tid].squash(squashed_num); // Delegates to tid's LSQ unit.
+}
+
+template<class Impl>
+bool
+LSQ<Impl>::violation()
+{
+    // True as soon as any active thread's LSQ unit reports a memory
+    // ordering violation; false if none of them does.
+    list<unsigned>::iterator it = activeThreads->begin();
+    list<unsigned>::iterator last = activeThreads->end();
+
+    for (; it != last; ++it)
+        if (thread[*it].violation())
+            return true;
+
+    return false;
+}
+
+template<class Impl>
+bool
+LSQ<Impl>::violation(unsigned tid)
+{
+    return thread[tid].violation(); // Asks only thread tid's LSQ unit.
+}
+
+template<class Impl>
+bool
+LSQ<Impl>::loadBlocked(unsigned tid)
+{
+    return thread[tid].loadBlocked(); // Asks only thread tid's LSQ unit.
+}
+
+template<class Impl>
+typename Impl::DynInstPtr
+LSQ<Impl>::getMemDepViolator(unsigned tid)
+{
+    return thread[tid].getMemDepViolator(); // From thread tid's LSQ unit.
+}
+
+template<class Impl>
+int
+LSQ<Impl>::getLoadHead(unsigned tid)
+{
+    return thread[tid].getLoadHead(); // Head index of thread tid's LQ.
+}
+
+template<class Impl>
+int
+LSQ<Impl>::getStoreHead(unsigned tid)
+{
+    return thread[tid].getStoreHead(); // Head index of thread tid's SQ.
+}
+
+template<class Impl>
+int
+LSQ<Impl>::getCount()
+{
+    // Adds up getCount(tid) over every thread on the active list.
+    unsigned sum = 0;
+    list<unsigned>::iterator it = activeThreads->begin();
+    list<unsigned>::iterator last = activeThreads->end();
+
+    for (; it != last; ++it)
+        sum += getCount(*it);
+
+    // The unsigned accumulator is converted to the declared return type.
+    return sum;
+}
+
+template<class Impl>
+int
+LSQ<Impl>::getCount(unsigned tid)
+{
+    return thread[tid].getCount(); // Count reported by tid's LSQ unit.
+}
+
+template<class Impl>
+int
+LSQ<Impl>::numLoads()
+{
+    // Adds up numLoads(tid) over every thread on the active list.
+    unsigned sum = 0;
+    list<unsigned>::iterator it = activeThreads->begin();
+    list<unsigned>::iterator last = activeThreads->end();
+
+    for (; it != last; ++it)
+        sum += numLoads(*it);
+
+    // The unsigned accumulator is converted to the declared return type.
+    return sum;
+}
+
+template<class Impl>
+int
+LSQ<Impl>::numLoads(unsigned tid)
+{
+    return thread[tid].numLoads(); // Loads reported by tid's LSQ unit.
+}
+
+template<class Impl>
+int
+LSQ<Impl>::numStores()
+{
+    // Adds up the store counts of every active thread's LSQ unit.
+    unsigned sum = 0;
+    list<unsigned>::iterator it = activeThreads->begin();
+    list<unsigned>::iterator last = activeThreads->end();
+
+    for (; it != last; ++it)
+        sum += thread[*it].numStores();
+
+    // The unsigned accumulator is converted to the declared return type.
+    return sum;
+}
+
+template<class Impl>
+int
+LSQ<Impl>::numStores(unsigned tid)
+{
+    return thread[tid].numStores(); // Stores reported by tid's LSQ unit.
+}
+
+template<class Impl>
+int
+LSQ<Impl>::numLoadsReady()
+{
+    // Adds up the ready-load counts of every active thread's LSQ unit.
+    unsigned sum = 0;
+    list<unsigned>::iterator it = activeThreads->begin();
+    list<unsigned>::iterator last = activeThreads->end();
+
+    for (; it != last; ++it)
+        sum += thread[*it].numLoadsReady();
+
+    // The unsigned accumulator is converted to the declared return type.
+    return sum;
+}
+
+template<class Impl>
+int
+LSQ<Impl>::numLoadsReady(unsigned tid)
+{
+    return thread[tid].numLoadsReady(); // Ready loads in tid's LSQ unit.
+}
+
+template<class Impl>
+unsigned
+LSQ<Impl>::numFreeEntries()
+{
+    // Adds up the free entries reported by each active thread's LSQ unit.
+    unsigned sum = 0;
+    list<unsigned>::iterator it = activeThreads->begin();
+    list<unsigned>::iterator last = activeThreads->end();
+
+    for (; it != last; ++it)
+        sum += thread[*it].numFreeEntries();
+
+    // An empty active list yields zero.
+    return sum;
+}
+
+template<class Impl>
+unsigned
+LSQ<Impl>::numFreeEntries(unsigned tid)
+{
+    //if( lsqPolicy == Dynamic )
+    //return numFreeEntries();
+    //else
+        return thread[tid].numFreeEntries(); // Per-thread for every policy.
+}
+
+template<class Impl>
+bool
+LSQ<Impl>::isFull()
+{
+    // Full only if every active thread has its LQ or its SQ full; one
+    // thread with neither queue full makes the answer false.
+    for (list<unsigned>::iterator it = activeThreads->begin();
+         it != activeThreads->end(); ++it) {
+        if (!thread[*it].lqFull() && !thread[*it].sqFull())
+            return false;
+    }
+
+    return true;
+}
+
+template<class Impl>
+bool
+LSQ<Impl>::isFull(unsigned tid)
+{
+    //@todo: Change to Calculate All Entries for Dynamic Policy
+    // Under the Dynamic policy the whole-LSQ isFull() answers instead.
+    if (lsqPolicy != Dynamic)
+        return thread[tid].lqFull() || thread[tid].sqFull();
+
+    return isFull();
+}
+
+template<class Impl>
+bool
+LSQ<Impl>::lqFull()
+{
+    // Full only if the LQ of every active thread is full; the first
+    // thread whose LQ is not full makes the answer false.
+    for (list<unsigned>::iterator it = activeThreads->begin();
+         it != activeThreads->end(); ++it) {
+        if (!thread[*it].lqFull())
+            return false;
+    }
+
+    return true;
+}
+
+template<class Impl>
+bool
+LSQ<Impl>::lqFull(unsigned tid)
+{
+    //@todo: Change to Calculate All Entries for Dynamic Policy
+    // Under the Dynamic policy the whole-LSQ lqFull() answers instead.
+    if (lsqPolicy != Dynamic)
+        return thread[tid].lqFull();
+
+    return lqFull();
+}
+
+// True only when every active thread's SQ is full. Queries the per-thread
+// unit directly: sqFull(tid) forwards back here under the Dynamic policy.
+template<class Impl>
+bool
+LSQ<Impl>::sqFull()
+{
+    list<unsigned>::iterator active_threads = (*activeThreads).begin();
+    while (active_threads != (*activeThreads).end()) {
+        unsigned tid = *active_threads++;
+        if (!thread[tid].sqFull())
+            return false;
+    }
+    return true;
+}
+
+template<class Impl>
+bool
+LSQ<Impl>::sqFull(unsigned tid)
+{
+    //@todo: Change to Calculate All Entries for Dynamic Policy
+    // NOTE(review): sqFull() must not call back into this overload.
+    if (lsqPolicy != Dynamic)
+        return thread[tid].sqFull();
+
+    return sqFull();
+}
+
+template<class Impl>
+bool
+LSQ<Impl>::isStalled()
+{
+    // Stalled only if the LSQ unit of every active thread is stalled; the
+    // first unit that is not stalled makes the answer false.
+    for (list<unsigned>::iterator it = activeThreads->begin();
+         it != activeThreads->end(); ++it) {
+        if (!thread[*it].isStalled())
+            return false;
+    }
+
+    return true;
+}
+
+template<class Impl>
+bool
+LSQ<Impl>::isStalled(unsigned tid)
+{
+    // Under the Dynamic policy the whole-LSQ isStalled() answers; every
+    // other policy asks only the LSQ unit of thread tid.
+    return (lsqPolicy == Dynamic) ? isStalled()
+                                  : thread[tid].isStalled();
+}
+
+template<class Impl>
+bool
+LSQ<Impl>::hasStoresToWB()
+{
+    // True if any active thread has stores to write back. One thread
+    // without pending stores must not hide another thread's stores.
+    list<unsigned>::iterator active_threads = (*activeThreads).begin();
+    while (active_threads != (*activeThreads).end()) {
+        unsigned tid = *active_threads++;
+        if (hasStoresToWB(tid))
+            return true;
+    }
+    return false;
+}
+
+
+template<class Impl>
+bool
+LSQ<Impl>::hasStoresToWB(unsigned tid)
+{
+    return thread[tid].hasStoresToWB(); // Asks only thread tid's LSQ unit.
+}
+
+template<class Impl>
+bool
+LSQ<Impl>::willWB()
+{
+    // True if any active thread will write back this cycle. One idle
+    // thread must not hide another thread's writeback.
+    list<unsigned>::iterator active_threads = (*activeThreads).begin();
+    while (active_threads != (*activeThreads).end()) {
+        unsigned tid = *active_threads++;
+        if (willWB(tid))
+            return true;
+    }
+    return false;
+}
+
+template<class Impl>
+bool
+LSQ<Impl>::willWB(unsigned tid)
+{
+    return thread[tid].willWB(); // Asks only thread tid's LSQ unit.
+}
+
+template<class Impl>
+void
+LSQ<Impl>::dumpInsts()
+{
+    // Debug dump of every active thread's LSQ unit, in active-list order.
+    list<unsigned>::iterator it = activeThreads->begin();
+    list<unsigned>::iterator last = activeThreads->end();
+
+    for (; it != last; ++it)
+        thread[*it].dumpInsts();
+}
+
+template<class Impl>
+void
+LSQ<Impl>::dumpInsts(unsigned tid)
+{
+    thread[tid].dumpInsts(); // Dumps only thread tid's LSQ unit.
+}
diff --git a/cpu/o3/lsq_unit.cc b/cpu/o3/lsq_unit.cc
new file mode 100644
index 000000000..dd29007bc
--- /dev/null
+++ b/cpu/o3/lsq_unit.cc
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "cpu/o3/alpha_dyn_inst.hh"
+#include "cpu/o3/alpha_cpu.hh"
+#include "cpu/o3/alpha_impl.hh"
+#include "cpu/o3/lsq_unit_impl.hh"
+
+// Force the instantiation of LSQUnit for all the implementations we care about.
+template class LSQUnit<AlphaSimpleImpl>;
+
diff --git a/cpu/o3/lsq_unit.hh b/cpu/o3/lsq_unit.hh
new file mode 100644
index 000000000..73c485ce9
--- /dev/null
+++ b/cpu/o3/lsq_unit.hh
@@ -0,0 +1,703 @@
+/*
+ * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __CPU_O3_LSQ_UNIT_HH__
+#define __CPU_O3_LSQ_UNIT_HH__
+
+#include <map>
+#include <queue>
+#include <algorithm>
+
+#include "config/full_system.hh"
+#include "base/hashmap.hh"
+#include "cpu/inst_seq.hh"
+#include "mem/mem_interface.hh"
+//#include "mem/page_table.hh"
+#include "sim/sim_object.hh"
+#include "arch/faults.hh"
+
+/**
+ * Class that implements the actual LQ and SQ for each specific thread.
+ * Both are circular queues; load entries are freed upon committing, while
+ * store entries are freed once they writeback. The LSQUnit tracks if there
+ * are memory ordering violations, and also detects partial load to store
+ * forwarding cases (a store only has part of a load's data) that requires
+ * the load to wait until the store writes back. In the former case it
+ * holds onto the instruction until the dependence unit looks at it, and
+ * in the latter it stalls the LSQ until the store writes back. At that
+ * point the load is replayed.
+ */
+template <class Impl>
+class LSQUnit {
+  protected:
+    typedef TheISA::IntReg IntReg;
+  public:
+    typedef typename Impl::Params Params;
+    typedef typename Impl::FullCPU FullCPU;
+    typedef typename Impl::DynInstPtr DynInstPtr;
+    typedef typename Impl::CPUPol::IEW IEW;
+    typedef typename Impl::CPUPol::IssueStruct IssueStruct;
+
+  private:
+    class StoreCompletionEvent : public Event {
+      public:
+        /** Constructs a store completion event. */
+        StoreCompletionEvent(int store_idx, Event *wb_event, LSQUnit *lsq_ptr);
+
+        /** Wakes the CPU, processes wbEvent if set, and completes the store. */
+        void process();
+
+        /** Returns the description of this event. */
+        const char *description();
+
+      private:
+        /** The store index of the store being written back. */
+        int storeIdx;
+        /** The writeback event for the store (may be NULL).  Needed for
+         * store conditionals.
+         */
+        Event *wbEvent;
+        /** The pointer to the LSQ unit that issued the store. */
+        LSQUnit<Impl> *lsqPtr;
+    };
+
+    friend class StoreCompletionEvent;
+
+  public:
+    /** Constructs an LSQ unit. init() must be called prior to use. */
+    LSQUnit();
+
+    /** Initializes the LSQ unit with the specified number of entries. */
+    void init(Params *params, unsigned maxLQEntries,
+              unsigned maxSQEntries, unsigned id);
+
+    /** Returns the name of the LSQ unit. */
+    std::string name() const;
+
+    /** Sets the CPU pointer. */
+    void setCPU(FullCPU *cpu_ptr)
+    { cpu = cpu_ptr; }
+
+    /** Sets the IEW stage pointer. */
+    void setIEW(IEW *iew_ptr)
+    { iewStage = iew_ptr; }
+
+    /** Sets the page table pointer. */
+//    void setPageTable(PageTable *pt_ptr);
+
+    /** Ticks the LSQ unit, which in this case only resets the number of
+     * used cache ports.
+     * @todo: Move the number of used ports up to the LSQ level so it can
+     * be shared by all LSQ units.
+     */
+    void tick() { usedPorts = 0; }
+
+    /** Inserts an instruction. */
+    void insert(DynInstPtr &inst);
+    /** Inserts a load instruction. */
+    void insertLoad(DynInstPtr &load_inst);
+    /** Inserts a store instruction. */
+    void insertStore(DynInstPtr &store_inst);
+
+    /** Executes a load instruction. */
+    Fault executeLoad(DynInstPtr &inst);
+
+    Fault executeLoad(int lq_idx);  // Executes the load held at LQ index lq_idx.
+    /** Executes a store instruction. */
+    Fault executeStore(DynInstPtr &inst);
+
+    /** Commits the head load. */
+    void commitLoad();
+    /** Commits a specific load, given by the sequence number. */
+    void commitLoad(InstSeqNum &inst);
+    /** Commits loads older than a specific sequence number. */
+    void commitLoads(InstSeqNum &youngest_inst);
+
+    /** Commits stores older than a specific sequence number. */
+    void commitStores(InstSeqNum &youngest_inst);
+
+    /** Writes back stores. */
+    void writebackStores();
+
+    // @todo: Include stats in the LSQ unit.
+    //void regStats();
+
+    /** Clears all the entries in the LQ. */
+    void clearLQ();
+
+    /** Clears all the entries in the SQ. */
+    void clearSQ();
+
+    /** Resizes the LQ to a given size (asserts size >= LQEntries). */
+    void resizeLQ(unsigned size);
+
+    /** Resizes the SQ to a given size. */
+    void resizeSQ(unsigned size);
+
+    /** Squashes all instructions younger than a specific sequence number. */
+    void squash(const InstSeqNum &squashed_num);
+
+    /** Returns if there is a memory ordering violation. Value is reset upon
+     * call to getMemDepViolator().
+     */
+    bool violation() { return memDepViolator; }
+
+    /** Returns the memory ordering violator. */
+    DynInstPtr getMemDepViolator();
+
+    /** Returns if a load became blocked due to the memory system.  The flag
+     *  is not cleared by this call; use clearLoadBlocked() to reset it.
+     */
+    bool loadBlocked()
+    { return isLoadBlocked; }
+
+    void clearLoadBlocked()  // Resets the load-blocked flag.
+    { isLoadBlocked = false; }
+
+    bool isLoadBlockedHandled()  // Returns the loadBlockedHandled flag.
+    { return loadBlockedHandled; }
+
+    void setLoadBlockedHandled()  // Marks the blocked load as handled.
+    { loadBlockedHandled = true; }
+
+    /** Returns the number of free entries (min of free LQ/SQ entries, minus 1). */
+    unsigned numFreeEntries();
+
+    /** Returns the number of loads ready to execute. */
+    int numLoadsReady();
+
+    /** Returns the number of loads in the LQ. */
+    int numLoads() { return loads; }
+
+    /** Returns the number of stores in the SQ. */
+    int numStores() { return stores; }
+
+    /** Returns if either the LQ or SQ is full. */
+    bool isFull() { return lqFull() || sqFull(); }
+
+    /** Returns if the LQ is full (one entry is always kept unused). */
+    bool lqFull() { return loads >= (LQEntries - 1); }
+
+    /** Returns if the SQ is full (one entry is always kept unused). */
+    bool sqFull() { return stores >= (SQEntries - 1); }
+
+    /** Debugging function to dump instructions in the LSQ. */
+    void dumpInsts();
+
+    /** Returns the number of instructions in the LSQ. */
+    unsigned getCount() { return loads + stores; }
+
+    /** Returns if there are any stores to writeback. */
+    bool hasStoresToWB() { return storesToWB; }
+
+    /** Returns the number of stores to writeback. */
+    int numStoresToWB() { return storesToWB; }
+
+    /** Returns if the LSQ unit will writeback on this cycle. */
+    bool willWB() { return storeQueue[storeWBIdx].canWB &&
+                        !storeQueue[storeWBIdx].completed &&
+                        !dcacheInterface->isBlocked(); }
+
+  private:
+    /** Completes the store at the specified index. */
+    void completeStore(int store_idx);
+
+    /** Increments the given store index (circular queue). */
+    inline void incrStIdx(int &store_idx);
+    /** Decrements the given store index (circular queue). */
+    inline void decrStIdx(int &store_idx);
+    /** Increments the given load index (circular queue). */
+    inline void incrLdIdx(int &load_idx);
+    /** Decrements the given load index (circular queue). */
+    inline void decrLdIdx(int &load_idx);
+
+  private:
+    /** Pointer to the CPU. */
+    FullCPU *cpu;
+
+    /** Pointer to the IEW stage. */
+    IEW *iewStage;
+
+    /** Pointer to the D-cache. */
+    MemInterface *dcacheInterface;
+
+    /** Pointer to the page table. */
+//    PageTable *pTable;
+
+  public:
+    struct SQEntry {
+        /** Constructs an empty store queue entry. */
+        SQEntry()
+            : inst(NULL), req(NULL), size(0), data(0),
+              canWB(0), committed(0), completed(0)
+        { }
+
+        /** Constructs a store queue entry for a given instruction. */
+        SQEntry(DynInstPtr &_inst)
+            : inst(_inst), req(NULL), size(0), data(0),
+              canWB(0), committed(0), completed(0)
+        { }
+
+        /** The store instruction. */
+        DynInstPtr inst;
+        /** The memory request for the store. */
+        MemReqPtr req;
+        /** The size of the store (0 until write() records the access). */
+        int size;
+        /** The store data. */
+        IntReg data;
+        /** Whether or not the store can writeback. */
+        bool canWB;
+        /** Whether or not the store is committed. */
+        bool committed;
+        /** Whether or not the store is completed. */
+        bool completed;
+    };
+
+    enum Status {
+        Running,
+        Idle,
+        DcacheMissStall,
+        DcacheMissSwitch
+    };
+
+  private:
+    /** The LSQUnit thread id. */
+    unsigned lsqID;
+
+    /** The status of the LSQ unit. */
+    Status _status;
+
+    /** The store queue. */
+    std::vector<SQEntry> storeQueue;
+
+    /** The load queue. */
+    std::vector<DynInstPtr> loadQueue;
+
+    // Consider making these 16 bits
+    /** The number of LQ entries. */
+    unsigned LQEntries;
+    /** The number of SQ entries. */
+    unsigned SQEntries;
+
+    /** The number of load instructions in the LQ. */
+    int loads;
+    /** The number of store instructions in the SQ (excludes those waiting to
+     * writeback).
+     */
+    int stores;
+    /** The number of store instructions in the SQ waiting to writeback. */
+    int storesToWB;
+
+    /** The index of the head instruction in the LQ. */
+    int loadHead;
+    /** The index of the tail instruction in the LQ. */
+    int loadTail;
+
+    /** The index of the head instruction in the SQ. */
+    int storeHead;
+    /** The index of the first instruction that is ready to be written back,
+     * and has not yet been written back.
+     */
+    int storeWBIdx;
+    /** The index of the tail instruction in the SQ. */
+    int storeTail;
+
+    /// @todo Consider moving to a more advanced model with write vs read ports
+    /** The number of cache ports available each cycle. */
+    int cachePorts;
+
+    /** The number of used cache ports in this cycle. */
+    int usedPorts;
+
+    //list<InstSeqNum> mshrSeqNums;
+
+     //Stats::Scalar<> dcacheStallCycles;
+    Counter lastDcacheStall;  // curTick of the latest D-cache miss seen by read().
+
+    /** Wire to read information from the issue stage time queue. */
+    typename TimeBuffer<IssueStruct>::wire fromIssue;
+
+    // Make these per thread?
+    /** Whether or not the LSQ is stalled. */
+    bool stalled;
+    /** The store that causes the stall due to partial store to load
+     * forwarding.
+     */
+    InstSeqNum stallingStoreIsn;
+    /** The LQ index of the load that is stalled on the above store. */
+    int stallingLoadIdx;
+
+    /** Whether or not a load is blocked due to the memory system.  It is
+     *  reported by loadBlocked() and cleared by clearLoadBlocked().
+     */
+    bool isLoadBlocked;
+
+    bool loadBlockedHandled;  // Reset by read() on a new block; set by setLoadBlockedHandled().
+
+    InstSeqNum blockedLoadSeqNum;  // seqNum of the load that read() recorded as blocked.
+
+    /** The oldest faulting load instruction. */
+    DynInstPtr loadFaultInst;
+    /** The oldest faulting store instruction. */
+    DynInstPtr storeFaultInst;
+
+    /** The oldest load that caused a memory ordering violation. */
+    DynInstPtr memDepViolator;
+
+    // Will also need how many read/write ports the Dcache has.  Or keep track
+    // of that in stage that is one level up, and only call executeLoad/Store
+    // the appropriate number of times.
+
+  public:
+    /** Executes the load at the given index, forwarding from the SQ if possible. */
+    template <class T>
+    Fault read(MemReqPtr &req, T &data, int load_idx);
+
+    /** Records the store's request, size and data in the SQ entry at store_idx. */
+    template <class T>
+    Fault write(MemReqPtr &req, T &data, int store_idx);
+
+    /** Returns the index of the head load instruction. */
+    int getLoadHead() { return loadHead; }
+    /** Returns the sequence number of the head load instruction (0 if none). */
+    InstSeqNum getLoadHeadSeqNum()
+    {
+        if (loadQueue[loadHead]) {
+            return loadQueue[loadHead]->seqNum;
+        } else {
+            return 0;
+        }
+
+    }
+
+    /** Returns the index of the head store instruction. */
+    int getStoreHead() { return storeHead; }
+    /** Returns the sequence number of the head store instruction (0 if none). */
+    InstSeqNum getStoreHeadSeqNum()
+    {
+        if (storeQueue[storeHead].inst) {
+            return storeQueue[storeHead].inst->seqNum;
+        } else {
+            return 0;
+        }
+
+    }
+
+    /** Returns whether or not the LSQ unit is stalled. */
+    bool isStalled()  { return stalled; }
+};
+
+template <class Impl>
+template <class T>
+Fault
+LSQUnit<Impl>::read(MemReqPtr &req, T &data, int load_idx)
+{
+    //Depending on issue2execute delay a squashed load could
+    //execute if it is found to be squashed in the same
+    //cycle it is scheduled to execute
+    assert(loadQueue[load_idx]);
+
+    if (loadQueue[load_idx]->isExecuted()) {
+        panic("Should not reach this point with split ops!");
+        memcpy(&data,req->data,req->size);
+
+        return NoFault;
+    }
+
+    // Make sure this isn't an uncacheable access
+    // A bit of a hackish way to get uncached accesses to work only if they're
+    // at the head of the LSQ and are ready to commit (at the head of the ROB
+    // too).
+    // @todo: Fix uncached accesses.
+    if (req->flags & UNCACHEABLE &&
+        (load_idx != loadHead || !loadQueue[load_idx]->reachedCommit)) {
+        iewStage->rescheduleMemInst(loadQueue[load_idx]);
+        return TheISA::genMachineCheckFault();
+    }
+
+    // Check the SQ for any previous stores that might lead to forwarding
+    int store_idx = loadQueue[load_idx]->sqIdx;
+
+    int store_size = 0;
+
+    DPRINTF(LSQUnit, "Read called, load idx: %i, store idx: %i, "
+            "storeHead: %i addr: %#x\n",
+            load_idx, store_idx, storeHead, req->paddr);
+
+#ifdef FULL_SYSTEM  // NOTE(review): if FULL_SYSTEM is always defined as 0/1, this wants '#if' -- TODO confirm
+    if (req->flags & LOCKED) {
+        cpu->lockAddr = req->paddr;
+        cpu->lockFlag = true;
+    }
+#endif
+
+    while (store_idx != -1) {
+        // End once we've reached storeWBIdx (the top of the LSQ)
+        if (store_idx == storeWBIdx) {
+            break;
+        }
+
+        // Move the index to the next older store (decrement with wrap-around)
+        if (--store_idx < 0)
+            store_idx += SQEntries;
+
+        assert(storeQueue[store_idx].inst);
+
+        store_size = storeQueue[store_idx].size;
+
+        if (store_size == 0)
+            continue;
+
+        // Check if the store data is within the lower and upper bounds of
+        // addresses that the request needs.
+        bool store_has_lower_limit =
+            req->vaddr >= storeQueue[store_idx].inst->effAddr;
+        bool store_has_upper_limit =
+            (req->vaddr + req->size) <= (storeQueue[store_idx].inst->effAddr +
+                                         store_size);
+        bool lower_load_has_store_part =
+            req->vaddr < (storeQueue[store_idx].inst->effAddr +
+                           store_size);
+        bool upper_load_has_store_part =
+            (req->vaddr + req->size) > storeQueue[store_idx].inst->effAddr;
+
+        // If the store's data has all of the data needed, we can forward.
+        if (store_has_lower_limit && store_has_upper_limit) {
+
+            int shift_amt = req->vaddr & (store_size - 1);
+            // Assumes byte addressing
+            shift_amt = shift_amt << 3;
+
+            // Cast this to type T?
+            data = storeQueue[store_idx].data >> shift_amt;
+
+            req->cmd = Read;
+            assert(!req->completionEvent);
+            req->completionEvent = NULL;
+            req->time = curTick;
+            assert(!req->data);
+            req->data = new uint8_t[64];
+
+            memcpy(req->data, &data, req->size);
+
+            DPRINTF(LSQUnit, "Forwarding from store idx %i to load to "
+                    "addr %#x, data %#x\n",
+                    store_idx, req->vaddr, *(req->data));
+
+            typename IEW::LdWritebackEvent *wb =
+                new typename IEW::LdWritebackEvent(loadQueue[load_idx],
+                                                   iewStage);
+
+            // We'll say this has a 1 cycle load-store forwarding latency
+            // for now.  NOTE(review): the event is scheduled at curTick.
+            // @todo: Need to make this a parameter.
+            wb->schedule(curTick);
+
+            // Should keep track of stat for forwarded data
+            return NoFault;
+        } else if ((store_has_lower_limit && lower_load_has_store_part) ||
+                   (store_has_upper_limit && upper_load_has_store_part) ||
+                   (lower_load_has_store_part && upper_load_has_store_part)) {
+            // This is the partial store-load forwarding case where a store
+            // has only part of the load's data.
+
+            // If it's already been written back, then don't worry about
+            // stalling on it.
+            if (storeQueue[store_idx].completed) {
+                continue;
+            }
+
+            // Must stall load and force it to retry, so long as it's the oldest
+            // load that needs to do so.
+            if (!stalled ||
+                (stalled &&
+                 loadQueue[load_idx]->seqNum <
+                 loadQueue[stallingLoadIdx]->seqNum)) {
+                stalled = true;
+                stallingStoreIsn = storeQueue[store_idx].inst->seqNum;
+                stallingLoadIdx = load_idx;
+            }
+
+            // Tell IQ/mem dep unit that this instruction will need to be
+            // rescheduled eventually
+            iewStage->rescheduleMemInst(loadQueue[load_idx]);
+
+            // Do not generate a writeback event as this instruction is not
+            // complete.
+
+            DPRINTF(LSQUnit, "Load-store forwarding mis-match. "
+                    "Store idx %i to load addr %#x\n",
+                    store_idx, req->vaddr);
+
+            return NoFault;
+        }
+    }
+
+
+    // If there's no forwarding case, then go access memory
+    DynInstPtr inst = loadQueue[load_idx];
+
+    DPRINTF(LSQUnit, "Doing functional access for inst PC %#x\n",
+            loadQueue[load_idx]->readPC());
+    assert(!req->data);
+    req->data = new uint8_t[64];
+    Fault fault = cpu->read(req, data);
+    memcpy(req->data, &data, sizeof(T));
+
+    ++usedPorts;
+
+    // if we have a cache, do cache access too
+    if (fault == NoFault && dcacheInterface) {
+        if (dcacheInterface->isBlocked()) {
+            // There's an older load that's already going to squash.
+            if (isLoadBlocked && blockedLoadSeqNum < inst->seqNum)
+                return NoFault;
+
+            isLoadBlocked = true;
+            loadBlockedHandled = false;
+            blockedLoadSeqNum = inst->seqNum;
+            // No fault occurred, even though the interface is blocked.
+            return NoFault;
+        }
+        DPRINTF(LSQUnit, "Doing timing access for inst PC %#x\n",
+                loadQueue[load_idx]->readPC());
+        req->cmd = Read;
+        req->completionEvent = NULL;
+        req->time = curTick;
+
+        assert(!req->completionEvent);  // cannot fail: completionEvent was set to NULL just above
+        req->completionEvent =
+            new typename IEW::LdWritebackEvent(loadQueue[load_idx], iewStage);
+        MemAccessResult result = dcacheInterface->access(req);
+
+        assert(dcacheInterface->doEvents());
+
+        // Ugly hack to get an event scheduled *only* if the access is
+        // a miss.  We really should add first-class support for this
+        // at some point.
+        if (result != MA_HIT) {
+            DPRINTF(LSQUnit, "LSQUnit: D-cache miss!\n");
+            DPRINTF(Activity, "Activity: ld accessing mem miss [sn:%lli]\n",
+                    inst->seqNum);
+
+            lastDcacheStall = curTick;
+
+            _status = DcacheMissStall;
+
+        } else {
+            DPRINTF(Activity, "Activity: ld accessing mem hit [sn:%lli]\n",
+                    inst->seqNum);
+
+            DPRINTF(LSQUnit, "LSQUnit: D-cache hit!\n");
+        }
+    }
+#if 0  // Disabled alternative cache-access path; never compiled.
+    // if we have a cache, do cache access too
+    if (dcacheInterface) {
+        if (dcacheInterface->isBlocked()) {
+            isLoadBlocked = true;
+            // No fault occurred, even though the interface is blocked.
+            return NoFault;
+        }
+
+        DPRINTF(LSQUnit, "LSQUnit: D-cache: PC:%#x reading from paddr:%#x "
+                "vaddr:%#x flags:%i\n",
+                inst->readPC(), req->paddr, req->vaddr, req->flags);
+
+        // Setup MemReq pointer
+        req->cmd = Read;
+        req->completionEvent = NULL;
+        req->time = curTick;
+        assert(!req->data);
+        req->data = new uint8_t[64];
+
+        assert(!req->completionEvent);
+        req->completionEvent =
+            new typename IEW::LdWritebackEvent(loadQueue[load_idx], iewStage);
+
+        // Do Cache Access
+        MemAccessResult result = dcacheInterface->access(req);
+
+        // Ugly hack to get an event scheduled *only* if the access is
+        // a miss.  We really should add first-class support for this
+        // at some point.
+        // @todo: Probably should support having no events
+        if (result != MA_HIT) {
+            DPRINTF(LSQUnit, "LSQUnit: D-cache miss!\n");
+            DPRINTF(Activity, "Activity: ld accessing mem miss [sn:%lli]\n",
+                    inst->seqNum);
+
+            lastDcacheStall = curTick;
+
+            _status = DcacheMissStall;
+
+        } else {
+            DPRINTF(Activity, "Activity: ld accessing mem hit [sn:%lli]\n",
+                    inst->seqNum);
+
+            DPRINTF(LSQUnit, "LSQUnit: D-cache hit!\n");
+        }
+    } else {
+        fatal("Must use D-cache with new memory system");
+    }
+#endif
+
+    return fault;
+}
+
+template <class Impl>
+template <class T>
+Fault
+LSQUnit<Impl>::write(MemReqPtr &req, T &data, int store_idx)
+{
+    // Everything below operates on the SQ slot named by store_idx, which
+    // must already hold a store instruction.
+    SQEntry &entry = storeQueue[store_idx];
+    assert(entry.inst);
+
+    DPRINTF(LSQUnit, "Doing write to store idx %i, addr %#x data %#x"
+            " | storeHead:%i [sn:%i]\n",
+            store_idx, req->paddr, data, storeHead,
+            entry.inst->seqNum);
+
+    // Disabled handling of LOCKED requests:
+    //   if (req->flags & LOCKED)
+    //       req->result = (req->flags & UNCACHEABLE) ? 2 : 1;
+
+    // Latch the request, the access size and the value to be stored.
+    entry.req = req;
+    entry.size = sizeof(T);
+    entry.data = data;
+
+    // Nothing is sent to memory from here -- the data is only written to
+    // the store queue -- so no fault can happen.
+    return NoFault;
+}
+
+#endif // __CPU_O3_LSQ_UNIT_HH__
diff --git a/cpu/o3/lsq_unit_impl.hh b/cpu/o3/lsq_unit_impl.hh
new file mode 100644
index 000000000..d9a118b0e
--- /dev/null
+++ b/cpu/o3/lsq_unit_impl.hh
@@ -0,0 +1,893 @@
+/*
+ * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "cpu/o3/lsq_unit.hh"
+#include "base/str.hh"
+
+/** Builds the completion event for the store at store_idx of lsq_ptr;
+ *  wb_event may be NULL and is otherwise processed when this event fires. */
+template <class Impl>
+LSQUnit<Impl>::StoreCompletionEvent::StoreCompletionEvent(
+    int store_idx, Event *wb_event, LSQUnit<Impl> *lsq_ptr)
+    : Event(&mainEventQueue), storeIdx(store_idx), wbEvent(wb_event),
+      lsqPtr(lsq_ptr)
+{
+    // Tag the event with the AutoDelete flag.
+    this->setFlags(Event::AutoDelete);
+}
+
+template <class Impl>
+void
+LSQUnit<Impl>::StoreCompletionEvent::process()
+{
+    DPRINTF(LSQ, "Cache miss complete for store idx:%i\n", storeIdx);
+    DPRINTF(Activity, "Activity: st writeback event idx:%i\n", storeIdx);
+
+    // Wake the CPU, fire the optional writeback, then complete the store.
+    lsqPtr->cpu->wakeCPU();
+    if (wbEvent != NULL) {
+        wbEvent->process();
+    }
+    lsqPtr->completeStore(storeIdx);
+}
+
+template <class Impl>
+const char *
+LSQUnit<Impl>::StoreCompletionEvent::description()
+{
+    return "LSQ store completion event";  // string literal; same text for every event
+}
+
+template <class Impl>
+LSQUnit<Impl>::LSQUnit()
+    : loads(0), stores(0), storesToWB(0),
+      stalled(false), isLoadBlocked(false), loadBlockedHandled(false)
+{   // Counters and flags only; init() sets up the queues, indices and ports.
+}
+
+template<class Impl>
+void
+LSQUnit<Impl>::init(Params *params, unsigned maxLQEntries,
+                    unsigned maxSQEntries, unsigned id)
+{
+    DPRINTF(LSQUnit, "Creating LSQUnit%i object.\n",id);
+
+    lsqID = id;
+
+    // Size both circular queues to the requested capacities.
+    // May want to initialize these entries to NULL
+    LQEntries = maxLQEntries;
+    loadQueue.resize(maxLQEntries);
+    SQEntries = maxSQEntries;
+    storeQueue.resize(maxSQEntries);
+
+    // Both queues start out empty: every index points at slot 0.
+    loadHead = 0;
+    loadTail = 0;
+    storeHead = 0;
+    storeWBIdx = 0;
+    storeTail = 0;
+
+    // Cache access configuration comes from the parameter object.
+    dcacheInterface = params->dcacheInterface;
+    cachePorts = params->cachePorts;
+    usedPorts = 0;
+
+    // No faulting, violating or blocked instruction is being tracked yet.
+    memDepViolator = storeFaultInst = loadFaultInst = NULL;
+    blockedLoadSeqNum = 0;
+}
+
+template<class Impl>
+std::string
+LSQUnit<Impl>::name() const
+{
+    std::string lsq_name = iewStage->name() + ".lsq";
+    // Multi-threaded configurations get a per-thread suffix.
+    if (Impl::MaxThreads != 1)
+        lsq_name = lsq_name + ".thread." + to_string(lsqID);
+    return lsq_name;
+}
+
+template<class Impl>
+void
+LSQUnit<Impl>::clearLQ()
+{
+    loadQueue.clear();  // NOTE(review): LQEntries, loads, loadHead and loadTail are left unchanged
+}
+
+template<class Impl>
+void
+LSQUnit<Impl>::clearSQ()
+{
+    storeQueue.clear();  // NOTE(review): SQEntries, stores and the store indices are left unchanged
+}
+
+#if 0  // Disabled: the pTable member it assigns is commented out in lsq_unit.hh.
+template<class Impl>
+void
+LSQUnit<Impl>::setPageTable(PageTable *pt_ptr)
+{
+    DPRINTF(LSQUnit, "Setting the page table pointer.\n");
+    pTable = pt_ptr;
+}
+#endif
+
+template<class Impl>
+void
+LSQUnit<Impl>::resizeLQ(unsigned size)
+{
+    // Shrinking the load queue is not supported.
+    assert(size >= LQEntries);
+
+    if (size <= LQEntries) {
+        LQEntries = size;
+        return;
+    }
+
+    // Append empty slots, bumping the entry count once per slot added.
+    DynInstPtr empty_slot;
+    for (; size > loadQueue.size(); ++LQEntries)
+        loadQueue.push_back(empty_slot);
+}
+
+template<class Impl>
+void
+LSQUnit<Impl>::resizeSQ(unsigned size)
+{
+    if (size <= SQEntries) {
+        // No growth needed; just record the requested entry count.
+        SQEntries = size;
+        return;
+    }
+
+    SQEntry empty_slot;
+    for (; size > storeQueue.size(); ++SQEntries)
+        storeQueue.push_back(empty_slot);
+}
+
+template <class Impl>
+void
+LSQUnit<Impl>::insert(DynInstPtr &inst)
+{
+    // Only memory references belong in the LSQ, and every one of them has
+    // to be either a load or a store.
+    assert(inst->isMemRef());
+    assert(inst->isLoad() || inst->isStore());
+
+    // Dispatch to the queue that matches the instruction's kind.
+    if (!inst->isLoad()) {
+        insertStore(inst);
+    } else {
+        insertLoad(inst);
+    }
+
+    inst->setInLSQ();
+}
+
+template <class Impl>
+void
+LSQUnit<Impl>::insertLoad(DynInstPtr &load_inst)
+{
+    // The LQ must have room: advancing the tail may not run into the head.
+    assert((loadTail + 1) % LQEntries != loadHead);
+    assert(loads < LQEntries);
+
+    DPRINTF(LSQUnit, "Inserting load PC %#x, idx:%i [sn:%lli]\n",
+            load_inst->readPC(), loadTail, load_inst->seqNum);
+
+    // Record the load's LQ slot together with the SQ tail at insertion
+    // time; an sqIdx of -1 means the SQ held no stores.
+    load_inst->lqIdx = loadTail;
+    load_inst->sqIdx = (stores == 0) ? -1 : storeTail;
+
+    // Place the load in the tail slot, then move the tail past it.
+    loadQueue[loadTail] = load_inst;
+    incrLdIdx(loadTail);
+
+    // One more load is now resident in the LQ.
+    ++loads;
+}
+
+template <class Impl>
+void
+LSQUnit<Impl>::insertStore(DynInstPtr &store_inst)
+{
+    // The SQ must have room: advancing the tail may not run into the head.
+    assert((storeTail + 1) % SQEntries != storeHead);
+    assert(stores < SQEntries);
+
+    DPRINTF(LSQUnit, "Inserting store PC %#x, idx:%i [sn:%lli]\n",
+            store_inst->readPC(), storeTail, store_inst->seqNum);
+
+    // Record the current LQ tail and the store's own SQ slot on the
+    // instruction.
+    store_inst->lqIdx = loadTail;
+    store_inst->sqIdx = storeTail;
+
+    // Fill the tail slot with a fresh entry, then move the tail past it.
+    storeQueue[storeTail] = SQEntry(store_inst);
+    incrStIdx(storeTail);
+    ++stores;
+}
+
+template <class Impl>
+typename Impl::DynInstPtr
+LSQUnit<Impl>::getMemDepViolator()
+{
+    // Hand the recorded violator to the caller and forget it, so that
+    // violation() reports false until another violator is recorded.
+    DynInstPtr violator = memDepViolator;
+    memDepViolator = NULL;
+    return violator;
+}
+
+/**
+ * Reports how many more instructions the LSQ can accept: the free-slot
+ * count of whichever queue (LQ or SQ) is closer to full, minus one.
+ */
+template <class Impl>
+unsigned
+LSQUnit<Impl>::numFreeEntries()
+{
+    const unsigned lq_free = LQEntries - loads;
+    const unsigned sq_free = SQEntries - stores;
+
+    // Both the LQ and SQ entries have an extra dummy entry to differentiate
+    // empty/full conditions, so one slot in the tighter queue is unusable.
+    return std::min(lq_free, sq_free) - 1;
+}
+
+/** Counts the loads between loadHead and loadTail that are readyToIssue(). */
+template <class Impl>
+int
+LSQUnit<Impl>::numLoadsReady()
+{
+    int retval = 0;
+
+    // Walk the circular queue; incrLdIdx() advances the index each pass
+    // (the loop could never reach loadTail without it).
+    for (int load_idx = loadHead; load_idx != loadTail; incrLdIdx(load_idx)) {
+        assert(loadQueue[load_idx]);
+        if (loadQueue[load_idx]->readyToIssue())
+            ++retval;
+    }
+
+    return retval;
+}
+
+#if 0  // Disabled: uses readyLoads/executedLoads, which LSQUnit does not declare.
+template <class Impl>
+Fault
+LSQUnit<Impl>::executeLoad()
+{
+    Fault load_fault = NoFault;
+    DynInstPtr load_inst;
+
+    assert(readyLoads.size() != 0);
+
+    // Execute a ready load.
+    LdMapIt ready_it = readyLoads.begin();
+
+    load_inst = (*ready_it).second;
+
+    // Execute the instruction, which is held in the data portion of the
+    // iterator.
+    load_fault = load_inst->execute();
+
+    // If it executed successfully, then switch it over to the executed
+    // loads list.
+    if (load_fault == NoFault) {
+        executedLoads[load_inst->seqNum] = load_inst;
+
+        readyLoads.erase(ready_it);
+    } else {
+        loadFaultInst = load_inst;
+    }
+
+    return load_fault;
+}
+#endif
+
+template <class Impl>
+Fault
+LSQUnit<Impl>::executeLoad(DynInstPtr &inst)
+{
+    // Execute a specific load; the fault it produces (or NoFault) is returned.
+    Fault load_fault = NoFault;
+
+    DPRINTF(LSQUnit, "Executing load PC %#x, [sn:%lli]\n",
+            inst->readPC(),inst->seqNum);
+
+    // (Disabled check) Make sure it's really in the list.
+    // Normally it should always be in the list.  However,
+    /* due to a syscall it may not be in the list.
+#ifdef DEBUG
+    int i = loadHead;
+    while (1) {
+        if (i == loadTail && !find(inst)) {
+            assert(0 && "Load not in the queue!");
+        } else if (loadQueue[i] == inst) {
+            break;
+        }
+
+        i = i + 1;
+        if (i >= LQEntries) {
+            i = 0;
+        }
+    }
+#endif // DEBUG*/
+
+//    load_fault = inst->initiateAcc();
+    load_fault = inst->execute();
+
+    // If the instruction faulted, then we need to send it along to commit
+    // without the instruction completing.
+    if (load_fault != NoFault) {
+        // Maybe just set it as can commit here, although that might cause
+        // some other problems with sending traps to the ROB too quickly.
+        iewStage->instToCommit(inst);
+        iewStage->activityThisCycle();
+    }
+
+    return load_fault;
+}
+
+/**
+ * Executes the load stored at the given load queue index.  The
+ * instruction's canCommit flag is set for the duration of the call and
+ * cleared again afterwards.
+ *
+ * @param lq_idx Index into loadQueue of the load to execute.
+ * @return The fault returned by executeLoad(DynInstPtr &).
+ */
+template <class Impl>
+Fault
+LSQUnit<Impl>::executeLoad(int lq_idx)
+{
+    // Hack: canCommit is overloaded here to stand in for "this
+    // instruction is at the head of the ROB".  A dedicated piece of state
+    // would be cleaner than reusing the signal for yet another purpose.
+    DynInstPtr &load_inst = loadQueue[lq_idx];
+
+    load_inst->setCanCommit();
+    Fault result = executeLoad(load_inst);
+    load_inst->clearCanCommit();
+
+    return result;
+}
+
+/**
+ * Executes a store: initiates its access, marks non-speculative stores as
+ * able to write back, and scans the loads inserted after it for a memory
+ * ordering violation.
+ *
+ * @param store_inst The store to execute; its sqIdx selects the store
+ *                   queue entry and its lqIdx the first load to check.
+ * @return The fault from initiateAcc(), or the result of
+ *         genMachineCheckFault() when a load between lqIdx and loadTail
+ *         matches the store's address (memDepViolator is then set to
+ *         that load).
+ */
+template <class Impl>
+Fault
+LSQUnit<Impl>::executeStore(DynInstPtr &store_inst)
+{
+    using namespace TheISA;
+    // Make sure that a store exists.
+    assert(stores != 0);
+
+    int store_idx = store_inst->sqIdx;
+
+    DPRINTF(LSQUnit, "Executing store PC %#x [sn:%lli]\n",
+            store_inst->readPC(), store_inst->seqNum);
+
+    // Check the recently completed loads to see if any match this store's
+    // address.  If so, then we have a memory ordering violation.
+    int load_idx = store_inst->lqIdx;
+
+    Fault store_fault = store_inst->initiateAcc();
+//    Fault store_fault = store_inst->execute();
+
+    // Store size should now be available.  Use it to get proper offset for
+    // addr comparisons.
+    int size = storeQueue[store_idx].size;
+
+    // A size of zero is treated as "the store faulted": return whatever
+    // initiateAcc() reported without any further processing.
+    if (size == 0) {
+        DPRINTF(LSQUnit,"Fault on Store PC %#x, [sn:%lli],Size = 0\n",
+                store_inst->readPC(),store_inst->seqNum);
+
+        return store_fault;
+    }
+
+    assert(store_fault == NoFault);
+
+    if (!storeFaultInst) {
+        if (store_fault != NoFault) {
+            // Only reachable when the assert above is compiled out; the
+            // assignment after panic() is never executed.
+            panic("Fault in a store instruction!");
+            storeFaultInst = store_inst;
+        } else if (store_inst->isNonSpeculative()) {
+            // Nonspeculative accesses (namely store conditionals)
+            // need to set themselves as able to writeback if we
+            // haven't had a fault by here.
+            storeQueue[store_idx].canWB = true;
+
+            ++storesToWB;
+        }
+    }
+
+    // Only look for a new violator if none is currently recorded.
+    if (!memDepViolator) {
+        while (load_idx != loadTail) {
+            // Actually should only check loads that have actually executed
+            // Might be safe because effAddr is set to InvalAddr when the
+            // dyn inst is created.
+
+            // Must actually check all addrs in the proper size range
+            // Which is more correct than needs to be.  What if for now we just
+            // assume all loads are quad-word loads, and do the addr based
+            // on that.
+            // @todo: Fix this, magic number being used here
+            // NOTE(review): a shift by 8 compares 256-byte regions, whereas
+            // quad-word (8-byte) granularity would be a shift by 3 --
+            // confirm which is intended.
+            if ((loadQueue[load_idx]->effAddr >> 8) ==
+                (store_inst->effAddr >> 8)) {
+                // A load incorrectly passed this store.  Squash and refetch.
+                // For now return a fault to show that it was unsuccessful.
+                memDepViolator = loadQueue[load_idx];
+
+                return genMachineCheckFault();
+            }
+
+            incrLdIdx(load_idx);
+        }
+
+        // If we've reached this point, there was no violation.
+        memDepViolator = NULL;
+    }
+
+    return store_fault;
+}
+
+/**
+ * Retires the load at the head of the load queue: the slot is cleared,
+ * loadHead advances (with wrap-around) and the load count drops by one.
+ */
+template <class Impl>
+void
+LSQUnit<Impl>::commitLoad()
+{
+    DynInstPtr &head_load = loadQueue[loadHead];
+
+    assert(head_load);
+
+    DPRINTF(LSQUnit, "Committing head load instruction, PC %#x\n",
+            head_load->readPC());
+
+    // Release the queue's reference before moving the head pointer on.
+    head_load = NULL;
+
+    incrLdIdx(loadHead);
+    --loads;
+}
+
+/**
+ * Commits the load with the given sequence number, wherever it sits in
+ * the load queue.  Currently disabled: the function panics on entry, so
+ * the search-and-remove code below the panic() is never executed.
+ *
+ * @param inst Sequence number of the load to commit.
+ */
+template <class Impl>
+void
+LSQUnit<Impl>::commitLoad(InstSeqNum &inst)
+{
+    // Hopefully I don't use this function too much
+    panic("Don't use this function!");
+
+    // Walk the circular buffer from the head until the matching load is
+    // found; reaching the tail first means it was not in the queue.
+    int i = loadHead;
+    while (1) {
+        if (i == loadTail) {
+            assert(0 && "Load not in the queue!");
+        } else if (loadQueue[i]->seqNum == inst) {
+            break;
+        }
+
+        ++i;
+        if (i >= LQEntries) {
+            i = 0;
+        }
+    }
+
+    // Clear the slot in place; loadHead is not moved by this path.
+    loadQueue[i]->removeInLSQ();
+    loadQueue[i] = NULL;
+    --loads;
+}
+
+/**
+ * Commits loads from the head of the load queue for as long as the head
+ * load's sequence number does not exceed youngest_inst.
+ *
+ * @param youngest_inst Highest sequence number that may be committed.
+ */
+template <class Impl>
+void
+LSQUnit<Impl>::commitLoads(InstSeqNum &youngest_inst)
+{
+    assert(loads == 0 || loadQueue[loadHead]);
+
+    for (;;) {
+        // Nothing left in the queue.
+        if (loads == 0) {
+            break;
+        }
+
+        // The head load is younger than the commit point; stop here.
+        if (!(loadQueue[loadHead]->seqNum <= youngest_inst)) {
+            break;
+        }
+
+        commitLoad();
+    }
+}
+
+/**
+ * Marks stores as able to write back.  Walks the store queue from the
+ * head; every store not yet flagged whose sequence number does not exceed
+ * youngest_inst gets canWB set and is counted in storesToWB.  The walk
+ * stops at the first unflagged store that is younger than youngest_inst.
+ *
+ * @param youngest_inst Highest sequence number that may be marked.
+ */
+template <class Impl>
+void
+LSQUnit<Impl>::commitStores(InstSeqNum &youngest_inst)
+{
+    assert(stores == 0 || storeQueue[storeHead].inst);
+
+    for (int sq_idx = storeHead; sq_idx != storeTail; incrStIdx(sq_idx)) {
+        SQEntry &entry = storeQueue[sq_idx];
+
+        assert(entry.inst);
+
+        // Already marked earlier; just move on to the next store.
+        if (entry.canWB) {
+            continue;
+        }
+
+        // First store beyond the commit point ends the walk.
+        if (entry.inst->seqNum > youngest_inst) {
+            break;
+        }
+
+        DPRINTF(LSQUnit, "Marking store as able to write back, PC "
+                "%#x [sn:%lli]\n",
+                entry.inst->readPC(),
+                entry.inst->seqNum);
+
+        // Only the writeback counter changes here; the store count itself
+        // is left untouched.
+        entry.canWB = true;
+        ++storesToWB;
+    }
+}
+
+/**
+ * Writes committed stores back to the data cache.  Starting at
+ * storeWBIdx, processes stores in queue order while there are stores
+ * waiting to write back, the entry is marked canWB, and fewer than
+ * cachePorts ports have been used.  Stops early if the cache interface
+ * reports that it is blocked.  usedPorts is reset to zero on exit.
+ */
+template <class Impl>
+void
+LSQUnit<Impl>::writebackStores()
+{
+    while (storesToWB > 0 &&
+           storeWBIdx != storeTail &&
+           storeQueue[storeWBIdx].inst &&
+           storeQueue[storeWBIdx].canWB &&
+           usedPorts < cachePorts) {
+
+        // Zero-sized stores are completed right away without touching the
+        // cache or consuming a port.
+        if (storeQueue[storeWBIdx].size == 0) {
+            completeStore(storeWBIdx);
+
+            incrStIdx(storeWBIdx);
+
+            continue;
+        }
+
+        if (dcacheInterface && dcacheInterface->isBlocked()) {
+            DPRINTF(LSQUnit, "Unable to write back any more stores, cache"
+                    " is blocked!\n");
+            break;
+        }
+
+        ++usedPorts;
+
+        // NOTE(review): data prefetches consume a port and advance the
+        // writeback index, but completeStore() is not called for them on
+        // this path -- confirm they are completed elsewhere.
+        if (storeQueue[storeWBIdx].inst->isDataPrefetch()) {
+            incrStIdx(storeWBIdx);
+
+            continue;
+        }
+
+        assert(storeQueue[storeWBIdx].req);
+        assert(!storeQueue[storeWBIdx].committed);
+
+        MemReqPtr req = storeQueue[storeWBIdx].req;
+        storeQueue[storeWBIdx].committed = true;
+
+//	Fault fault = cpu->translateDataWriteReq(req);
+        // Turn the request into a write and attach a copy of the store
+        // data to it.
+        // NOTE(review): the buffer is a fixed 64 bytes while req->size
+        // bytes are copied -- assumes req->size never exceeds 64 or the
+        // size of the queue entry's data field; confirm.
+        req->cmd = Write;
+        req->completionEvent = NULL;
+        req->time = curTick;
+        assert(!req->data);
+        req->data = new uint8_t[64];
+        memcpy(req->data, (uint8_t *)&storeQueue[storeWBIdx].data, req->size);
+
+        DPRINTF(LSQUnit, "D-Cache: Writing back store idx:%i PC:%#x "
+                "to Addr:%#x, data:%#x [sn:%lli]\n",
+                storeWBIdx,storeQueue[storeWBIdx].inst->readPC(),
+                req->paddr, *(req->data),
+                storeQueue[storeWBIdx].inst->seqNum);
+
+//        if (fault != NoFault) {
+            //What should we do if there is a fault???
+            //for now panic
+//            panic("Page Table Fault!!!!!\n");
+//        }
+        // Perform the write through the CPU using the access width
+        // recorded for this store.
+        switch(storeQueue[storeWBIdx].size) {
+          case 1:
+            cpu->write(req, (uint8_t &)storeQueue[storeWBIdx].data);
+            break;
+          case 2:
+            cpu->write(req, (uint16_t &)storeQueue[storeWBIdx].data);
+            break;
+          case 4:
+            cpu->write(req, (uint32_t &)storeQueue[storeWBIdx].data);
+            break;
+          case 8:
+            cpu->write(req, (uint64_t &)storeQueue[storeWBIdx].data);
+            break;
+          default:
+            panic("Unexpected store size!\n");
+        }
+
+        if (dcacheInterface) {
+            MemAccessResult result = dcacheInterface->access(req);
+
+            // If a load was stalled waiting on exactly this store, clear
+            // the stall and ask IEW to replay that load.
+            if (isStalled() &&
+                storeQueue[storeWBIdx].inst->seqNum == stallingStoreIsn) {
+                DPRINTF(LSQUnit, "Unstalling, stalling store [sn:%lli] "
+                        "load idx:%i\n",
+                        stallingStoreIsn, stallingLoadIdx);
+                stalled = false;
+                stallingStoreIsn = 0;
+                iewStage->replayMemInst(loadQueue[stallingLoadIdx]);
+            }
+
+            if (result != MA_HIT && dcacheInterface->doEvents()) {
+                // Miss path: the store is not completed here; a
+                // StoreCompletionEvent is attached to the request instead.
+                // For LOCKED requests a writeback event is created and
+                // passed along with it.
+                typename IEW::LdWritebackEvent *wb = NULL;
+                if (req->flags & LOCKED) {
+                    // Stx_C does not generate a system port transaction.
+/*
+                    if (cpu->lockFlag && cpu->lockAddr == req->paddr) {
+                        req->result=1;
+                    } else {
+                        req->result = 0;
+                    }
+*/
+                    wb = new typename IEW::LdWritebackEvent(storeQueue[storeWBIdx].inst,
+                                                            iewStage);
+                }
+
+                DPRINTF(LSQUnit,"D-Cache Write Miss!\n");
+
+                DPRINTF(Activity, "Active st accessing mem miss [sn:%lli]\n",
+                        storeQueue[storeWBIdx].inst->seqNum);
+
+                // Will stores need their own kind of writeback events?
+                // Do stores even need writeback events?
+                assert(!req->completionEvent);
+                req->completionEvent = new
+                    StoreCompletionEvent(storeWBIdx, wb, this);
+
+                lastDcacheStall = curTick;
+
+                _status = DcacheMissStall;
+
+                //mshrSeqNums.push_back(storeQueue[storeWBIdx].inst->seqNum);
+
+                //DPRINTF(LSQUnit, "Added MSHR. count = %i\n",mshrSeqNums.size());
+
+                // Increment stat here or something
+            } else {
+                // Hit path (or events disabled): complete the store now.
+                DPRINTF(LSQUnit,"D-Cache: Write Hit on idx:%i !\n",
+                        storeWBIdx);
+
+                DPRINTF(Activity, "Active st accessing mem hit [sn:%lli]\n",
+                        storeQueue[storeWBIdx].inst->seqNum);
+
+
+                if (req->flags & LOCKED) {
+                    // Stx_C does not generate a system port transaction.
+/*
+                    if (req->flags & UNCACHEABLE) {
+                        req->result = 2;
+                    } else {
+                        if (cpu->lockFlag && cpu->lockAddr == req->paddr) {
+                            req->result=1;
+                        } else {
+                            req->result = 0;
+                        }
+                    }
+*/
+                    // LOCKED stores get a writeback event scheduled for
+                    // the current tick.
+                    typename IEW::LdWritebackEvent *wb =
+                        new typename IEW::LdWritebackEvent(storeQueue[storeWBIdx].inst,
+                                                           iewStage);
+                    wb->schedule(curTick);
+                }
+
+                completeStore(storeWBIdx);
+            }
+
+            incrStIdx(storeWBIdx);
+        } else {
+            panic("Must HAVE DCACHE!!!!!\n");
+        }
+    }
+
+    // Not sure this should set it to 0.
+    usedPorts = 0;
+
+    assert(stores >= 0 && storesToWB >= 0);
+}
+
+/*template <class Impl>
+void
+LSQUnit<Impl>::removeMSHR(InstSeqNum seqNum)
+{
+    list<InstSeqNum>::iterator mshr_it = find(mshrSeqNums.begin(),
+                                              mshrSeqNums.end(),
+                                              seqNum);
+
+    if (mshr_it != mshrSeqNums.end()) {
+        mshrSeqNums.erase(mshr_it);
+        DPRINTF(LSQUnit, "Removing MSHR. count = %i\n",mshrSeqNums.size());
+    }
+}*/
+
+/**
+ * Squashes every load and store whose sequence number is greater than
+ * squashed_num.  Both queues are unwound from the tail towards the head,
+ * pulling the tail index back one entry at a time.  Store squashing stops
+ * at the first store already marked canWB.
+ *
+ * @param squashed_num Sequence number to squash back to; entries with a
+ *                     larger sequence number are removed.
+ */
+template <class Impl>
+void
+LSQUnit<Impl>::squash(const InstSeqNum &squashed_num)
+{
+    DPRINTF(LSQUnit, "Squashing until [sn:%lli]!"
+            "(Loads:%i Stores:%i)\n",squashed_num,loads,stores);
+
+    // Start at the last valid load (one before the tail).
+    int load_idx = loadTail;
+    decrLdIdx(load_idx);
+
+    while (loads != 0 && loadQueue[load_idx]->seqNum > squashed_num) {
+
+        // Clear the smart pointer to make sure it is decremented.
+        DPRINTF(LSQUnit,"Load Instruction PC %#x squashed, "
+                "[sn:%lli]\n",
+                loadQueue[load_idx]->readPC(),
+                loadQueue[load_idx]->seqNum);
+
+        // Squashing the load that caused the stall ends the stall.
+        if (isStalled() && load_idx == stallingLoadIdx) {
+            stalled = false;
+            stallingStoreIsn = 0;
+            stallingLoadIdx = 0;
+        }
+
+        loadQueue[load_idx]->squashed = true;
+        loadQueue[load_idx] = NULL;
+        --loads;
+
+        // Inefficient!
+        loadTail = load_idx;
+
+        decrLdIdx(load_idx);
+    }
+
+    // Forget a blocked load if it is younger than the squash point.
+    if (isLoadBlocked) {
+        if (squashed_num < blockedLoadSeqNum) {
+            isLoadBlocked = false;
+            loadBlockedHandled = false;
+            blockedLoadSeqNum = 0;
+        }
+    }
+
+    // Same unwinding for the store queue.
+    int store_idx = storeTail;
+    decrStIdx(store_idx);
+
+    while (stores != 0 &&
+           storeQueue[store_idx].inst->seqNum > squashed_num) {
+
+        // Stores already marked as able to write back are not squashed;
+        // the walk ends at the first such store.
+        if (storeQueue[store_idx].canWB) {
+            break;
+        }
+
+        // Clear the smart pointer to make sure it is decremented.
+        DPRINTF(LSQUnit,"Store Instruction PC %#x squashed, "
+                "idx:%i [sn:%lli]\n",
+                storeQueue[store_idx].inst->readPC(),
+                store_idx, storeQueue[store_idx].inst->seqNum);
+
+        // I don't think this can happen.  It should have been cleared by the
+        // stalling load.
+        if (isStalled() &&
+            storeQueue[store_idx].inst->seqNum == stallingStoreIsn) {
+            // The two assignments after panic() are never executed.
+            panic("Is stalled should have been cleared by stalling load!\n");
+            stalled = false;
+            stallingStoreIsn = 0;
+        }
+
+        storeQueue[store_idx].inst->squashed = true;
+        storeQueue[store_idx].inst = NULL;
+        storeQueue[store_idx].canWB = 0;
+
+        // A squashed store must not have an outstanding completion event.
+        if (storeQueue[store_idx].req) {
+            assert(!storeQueue[store_idx].req->completionEvent);
+        }
+        storeQueue[store_idx].req = NULL;
+        --stores;
+
+        // Inefficient!
+        storeTail = store_idx;
+
+        decrStIdx(store_idx);
+    }
+}
+
+/**
+ * Debugging aid: prints the number of loads and stores in the queues and
+ * the PC of every instruction from each queue's head up to its tail (or
+ * up to the first empty slot, whichever comes first).
+ */
+template <class Impl>
+void
+LSQUnit<Impl>::dumpInsts()
+{
+    cprintf("Load store queue: Dumping instructions.\n");
+    cprintf("Load queue size: %i\n", loads);
+    cprintf("Load queue: ");
+
+    int load_idx = loadHead;
+
+    while (load_idx != loadTail && loadQueue[load_idx]) {
+        cprintf("%#x ", loadQueue[load_idx]->readPC());
+
+        incrLdIdx(load_idx);
+    }
+
+    // Terminate the load PC list so the store queue section starts on its
+    // own line instead of being appended to the last load PC.
+    cprintf("\n");
+
+    cprintf("Store queue size: %i\n", stores);
+    cprintf("Store queue: ");
+
+    int store_idx = storeHead;
+
+    while (store_idx != storeTail && storeQueue[store_idx].inst) {
+        cprintf("%#x ", storeQueue[store_idx].inst->readPC());
+
+        incrStIdx(store_idx);
+    }
+
+    cprintf("\n");
+}
+
+/**
+ * Marks the store at store_idx as completed, decrements the count of
+ * stores waiting to write back, and, if that store is at the head of the
+ * queue, advances storeHead past it and any directly following completed
+ * stores.  Also releases a load stalled on this store.
+ *
+ * @param store_idx Index into storeQueue of the store that completed.
+ */
+template <class Impl>
+void
+LSQUnit<Impl>::completeStore(int store_idx)
+{
+    assert(storeQueue[store_idx].inst);
+    storeQueue[store_idx].completed = true;
+    --storesToWB;
+    // A bit conservative because a store completion may not free up entries,
+    // but hopefully avoids two store completions in one cycle from making
+    // the CPU tick twice.
+    cpu->activityThisCycle();
+
+    // Entries are only released from the head; a store completing out of
+    // order stays in the queue until everything before it has completed.
+    if (store_idx == storeHead) {
+        do {
+            incrStIdx(storeHead);
+
+            --stores;
+        } while (storeQueue[storeHead].completed &&
+                 storeHead != storeTail);
+
+        iewStage->updateLSQNextCycle = true;
+    }
+
+    DPRINTF(LSQUnit, "Store head idx:%i\n", storeHead);
+
+    // The entry's inst pointer is not cleared above, so it can still be
+    // inspected here to see whether a load was waiting on this store.
+    if (isStalled() &&
+        storeQueue[store_idx].inst->seqNum == stallingStoreIsn) {
+        DPRINTF(LSQUnit, "Unstalling, stalling store [sn:%lli] "
+                "load idx:%i\n",
+                stallingStoreIsn, stallingLoadIdx);
+        stalled = false;
+        stallingStoreIsn = 0;
+        iewStage->replayMemInst(loadQueue[stallingLoadIdx]);
+    }
+}
+
+/**
+ * Advances a store queue index by one slot, wrapping back to 0 once it
+ * reaches SQEntries.
+ */
+template <class Impl>
+inline void
+LSQUnit<Impl>::incrStIdx(int &store_idx)
+{
+    ++store_idx;
+
+    if (store_idx >= SQEntries) {
+        store_idx = 0;
+    }
+}
+
+/**
+ * Moves a store queue index back by one slot; when it drops below 0,
+ * SQEntries is added so it wraps to the end of the queue.
+ */
+template <class Impl>
+inline void
+LSQUnit<Impl>::decrStIdx(int &store_idx)
+{
+    --store_idx;
+
+    if (store_idx < 0) {
+        store_idx += SQEntries;
+    }
+}
+
+/**
+ * Advances a load queue index by one slot, wrapping back to 0 once it
+ * reaches LQEntries.
+ */
+template <class Impl>
+inline void
+LSQUnit<Impl>::incrLdIdx(int &load_idx)
+{
+    ++load_idx;
+
+    if (load_idx >= LQEntries) {
+        load_idx = 0;
+    }
+}
+
+/**
+ * Moves a load queue index back by one slot; when it drops below 0,
+ * LQEntries is added so it wraps to the end of the queue.
+ */
+template <class Impl>
+inline void
+LSQUnit<Impl>::decrLdIdx(int &load_idx)
+{
+    --load_idx;
+
+    if (load_idx < 0) {
+        load_idx += LQEntries;
+    }
+}
diff --git a/cpu/o3/mem_dep_unit.cc b/cpu/o3/mem_dep_unit.cc
index 9c1e7f9d8..ccdd1a515 100644
--- a/cpu/o3/mem_dep_unit.cc
+++ b/cpu/o3/mem_dep_unit.cc
@@ -34,3 +34,13 @@
 // Force instantation of memory dependency unit using store sets and
 // AlphaSimpleImpl.
 template class MemDepUnit<StoreSet, AlphaSimpleImpl>;
+
+// Definitions of the MemDepEntry debugging counters for the
+// <StoreSet, AlphaSimpleImpl> instantiation; each starts at zero.
+// NOTE(review): these explicit specializations come after the explicit
+// instantiation of the class earlier in this file; C++ expects a
+// specialization to be declared before anything that would instantiate
+// it -- confirm the supported compilers accept this order.
+template <>
+int
+MemDepUnit<StoreSet, AlphaSimpleImpl>::MemDepEntry::memdep_count = 0;
+template <>
+int
+MemDepUnit<StoreSet, AlphaSimpleImpl>::MemDepEntry::memdep_insert = 0;
+template <>
+int
+MemDepUnit<StoreSet, AlphaSimpleImpl>::MemDepEntry::memdep_erase = 0;
diff --git a/cpu/o3/mem_dep_unit.hh b/cpu/o3/mem_dep_unit.hh
index ca63577a1..32ce9f768 100644
--- a/cpu/o3/mem_dep_unit.hh
+++ b/cpu/o3/mem_dep_unit.hh
@@ -26,15 +26,29 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __CPU_O3_CPU_MEM_DEP_UNIT_HH__
-#define __CPU_O3_CPU_MEM_DEP_UNIT_HH__
+#ifndef __CPU_O3_MEM_DEP_UNIT_HH__
+#define __CPU_O3_MEM_DEP_UNIT_HH__
 
-#include <map>
+#include <list>
 #include <set>
 
+#include "base/hashmap.hh"
+#include "base/refcnt.hh"
 #include "base/statistics.hh"
 #include "cpu/inst_seq.hh"
 
+/**
+ * Hash functor for instruction sequence numbers.  The sequence number is
+ * truncated to unsigned, two shifted copies of it are XORed together, and
+ * the top bit of the result is cleared.
+ */
+struct SNHash {
+    size_t operator() (const InstSeqNum &seq_num) const {
+        const unsigned truncated = (unsigned)seq_num;
+
+        // Bits 14 and up, mixed with bits 2..17 of the truncated value.
+        const unsigned upper = truncated >> 14;
+        const unsigned lower = (truncated >> 2) & 0xffff;
+
+        return (upper ^ lower) & 0x7FFFFFFF;
+    }
+};
+
+template <class Impl>
+class InstructionQueue;
+
 /**
  * Memory dependency unit class.  This holds the memory dependence predictor.
  * As memory operations are issued to the IQ, they are also issued to this
@@ -52,101 +66,162 @@ class MemDepUnit {
     typedef typename Impl::Params Params;
     typedef typename Impl::DynInstPtr DynInstPtr;
 
-  public:
-    MemDepUnit(Params &params);
+    /** Empty constructor. Must call init() prior to using in this case.
+     *  The barrier state, IQ pointer and thread id are given defined
+     *  starting values here because init() only sets the thread id and
+     *  the predictor, while insert() reads the barrier flags.
+     */
+    MemDepUnit()
+        : loadBarrier(false), loadBarrierSN(0), storeBarrier(false),
+          storeBarrierSN(0), iqPtr(NULL), id(0)
+    { }
+
+    /** Constructs a MemDepUnit with given parameters. */
+    MemDepUnit(Params *params);
 
+    /** Frees up any memory allocated. */
+    ~MemDepUnit();
+
+    /** Returns the name of the memory dependence unit. */
+    std::string name() const;
+
+    /** Initializes the unit with parameters and a thread id. */
+    void init(Params *params, int tid);
+
+    /** Registers statistics. */
     void regStats();
 
+    /** Sets the pointer to the IQ. */
+    void setIQ(InstructionQueue<Impl> *iq_ptr);
+
+    /** Inserts a memory instruction. */
     void insert(DynInstPtr &inst);
 
+    /** Inserts a non-speculative memory instruction. */
     void insertNonSpec(DynInstPtr &inst);
 
-    // Will want to make this operation relatively fast.  Right now it
-    // is somewhat slow.
-    DynInstPtr &top();
-
-    void pop();
+    /** Inserts a barrier instruction. */
+    void insertBarrier(DynInstPtr &barr_inst);
 
+    /** Indicate that an instruction has its registers ready. */
     void regsReady(DynInstPtr &inst);
 
+    /** Indicate that a non-speculative instruction is ready. */
     void nonSpecInstReady(DynInstPtr &inst);
 
-    void issue(DynInstPtr &inst);
+    /** Reschedules an instruction to be re-executed. */
+    void reschedule(DynInstPtr &inst);
+
+    /** Replays all instructions that have been rescheduled by moving them to
+     *  the ready list.
+     */
+    void replay(DynInstPtr &inst);
 
+    /** Completes a memory instruction. */
+    void completed(DynInstPtr &inst);
+
+    /** Completes a barrier instruction. */
+    void completeBarrier(DynInstPtr &inst);
+
+    /** Wakes any dependents of a memory instruction. */
     void wakeDependents(DynInstPtr &inst);
 
-    void squash(const InstSeqNum &squashed_num);
+    /** Squashes all instructions up until a given sequence number for a
+     *  specific thread.
+     */
+    void squash(const InstSeqNum &squashed_num, unsigned tid);
 
+    /** Indicates an ordering violation between a store and a younger load. */
     void violation(DynInstPtr &store_inst, DynInstPtr &violating_load);
 
-    inline bool empty()
-    { return readyInsts.empty(); }
+    /** Issues the given instruction */
+    void issue(DynInstPtr &inst);
+
+    /** Debugging function to dump the lists of instructions. */
+    void dumpLists();
 
   private:
-    typedef typename std::set<InstSeqNum>::iterator sn_it_t;
-    typedef typename std::map<InstSeqNum, DynInstPtr>::iterator dyn_it_t;
-
-    // Forward declarations so that the following two typedefs work.
-    class Dependency;
-    class ltDependency;
-
-    typedef typename std::set<Dependency, ltDependency>::iterator dep_it_t;
-    typedef typename std::map<InstSeqNum, vector<dep_it_t> >::iterator
-    sd_it_t;
-
-    struct Dependency {
-        Dependency(const InstSeqNum &_seqNum)
-            : seqNum(_seqNum), regsReady(0), memDepReady(0)
-        { }
-
-        Dependency(const InstSeqNum &_seqNum, bool _regsReady,
-                   bool _memDepReady)
-            : seqNum(_seqNum), regsReady(_regsReady),
-              memDepReady(_memDepReady)
-        { }
-
-        InstSeqNum seqNum;
-        mutable bool regsReady;
-        mutable bool memDepReady;
-        mutable sd_it_t storeDep;
+    typedef typename std::list<DynInstPtr>::iterator ListIt;
+
+    class MemDepEntry;
+
+    typedef RefCountingPtr<MemDepEntry> MemDepEntryPtr;
+
+    /** Memory dependence entries that track memory operations, marking
+     *  when the instruction is ready to execute and what instructions depend
+     *  upon it.
+     */
+    /** Bookkeeping record for one memory instruction tracked by the unit.
+     *  It holds the instruction, its position in the instruction list,
+     *  the entries that depend on it, and its status flags.  Static
+     *  counters are kept for debugging.
+     */
+    class MemDepEntry : public RefCounted {
+      public:
+        /** Builds an entry for new_inst with every status flag cleared
+         *  and bumps the live-entry counter.
+         */
+        MemDepEntry(DynInstPtr &new_inst)
+            : inst(new_inst), regsReady(false), memDepReady(false),
+              completed(false), squashed(false)
+        {
+            ++memdep_count;
+
+            DPRINTF(MemDepUnit, "Memory dependency entry created.  "
+                    "memdep_count=%i\n", memdep_count);
+        }
+
+        /** Releases the references to all dependent entries and lowers
+         *  the live-entry counter.
+         */
+        ~MemDepEntry()
+        {
+            int dep_idx = 0;
+
+            while (dep_idx < dependInsts.size()) {
+                dependInsts[dep_idx] = NULL;
+                ++dep_idx;
+            }
+
+            --memdep_count;
+
+            DPRINTF(MemDepUnit, "Memory dependency entry deleted.  "
+                    "memdep_count=%i\n", memdep_count);
+        }
+
+        /** Name reported for this entry (a fixed string). */
+        std::string name() const
+        {
+            return "memdepentry";
+        }
+
+        /** Instruction this entry tracks. */
+        DynInstPtr inst;
+
+        /** Position of the instruction within the instruction list. */
+        ListIt listIt;
+
+        /** Entries that depend on this instruction. */
+        std::vector<MemDepEntryPtr> dependInsts;
+
+        /** True once the source registers are ready. */
+        bool regsReady;
+        /** True once all memory dependencies are satisfied. */
+        bool memDepReady;
+        /** True once the instruction has completed. */
+        bool completed;
+        /** True if the instruction has been squashed. */
+        bool squashed;
+
+        /** Debugging counters. */
+        static int memdep_count;
+        static int memdep_insert;
+        static int memdep_erase;
     };
-    struct ltDependency {
-        bool operator() (const Dependency &lhs, const Dependency &rhs)
+    /** Comparison functor for memory dependence entries: returns true
+     *  when the instruction tracked by lhs has a smaller sequence number
+     *  than the one tracked by rhs.
+     */
+    struct ltMemDepEntry {
+        bool operator() (const MemDepEntryPtr &lhs, const MemDepEntryPtr &rhs)
         {
-            return lhs.seqNum < rhs.seqNum;
+            return lhs->inst->seqNum < rhs->inst->seqNum;
         }
     };
 
-    inline void moveToReady(dep_it_t &woken_inst);
+    /** Finds the memory dependence entry in the hash map. */
+    inline MemDepEntryPtr &findInHash(const DynInstPtr &inst);
 
-    /** List of instructions that have passed through rename, yet are still
-     *  waiting on either a memory dependence to resolve or source registers to
-     *  become available before they can issue.
-     */
-    std::set<Dependency, ltDependency> waitingInsts;
+    /** Moves an entry to the ready list. */
+    inline void moveToReady(MemDepEntryPtr &ready_inst_entry);
 
-    /** List of instructions that have all their predicted memory dependences
-     *  resolved and their source registers ready.
-     */
-    std::set<InstSeqNum> readyInsts;
+    typedef m5::hash_map<InstSeqNum, MemDepEntryPtr, SNHash> MemDepHash;
 
-    // Change this to hold a vector of iterators, which will point to the
-    // entry of the waiting instructions.
-    /** List of stores' sequence numbers, each of which has a vector of
-     *  iterators.  The iterators point to the appropriate node within
-     *  waitingInsts that has the depenendent instruction.
-     */
-    std::map<InstSeqNum, vector<dep_it_t> > storeDependents;
+    typedef typename MemDepHash::iterator MemDepHashIt;
+
+    /** A hash map of all memory dependence entries. */
+    MemDepHash memDepHash;
 
-    // For now will implement this as a map...hash table might not be too
-    // bad, or could move to something that mimics the current dependency
-    // graph.
-    std::map<InstSeqNum, DynInstPtr> memInsts;
+    /** A list of all instructions in the memory dependence unit. */
+    std::list<DynInstPtr> instList[Impl::MaxThreads];
 
-    // Iterator pointer to the top instruction which has is ready.
-    // Is set by the top() call.
-    dyn_it_t topInst;
+    /** A list of all instructions that are going to be replayed. */
+    std::list<DynInstPtr> instsToReplay;
 
     /** The memory dependence predictor.  It is accessed upon new
      *  instructions being added to the IQ, and responds by telling
@@ -155,10 +230,25 @@ class MemDepUnit {
      */
     MemDepPred depPred;
 
+    bool loadBarrier;
+    InstSeqNum loadBarrierSN;
+    bool storeBarrier;
+    InstSeqNum storeBarrierSN;
+
+    /** Pointer to the IQ. */
+    InstructionQueue<Impl> *iqPtr;
+
+    /** The thread id of this memory dependence unit. */
+    int id;
+
+    /** Stat for number of inserted loads. */
     Stats::Scalar<> insertedLoads;
+    /** Stat for number of inserted stores. */
     Stats::Scalar<> insertedStores;
+    /** Stat for number of conflicting loads that had to wait for a store. */
     Stats::Scalar<> conflictingLoads;
+    /** Stat for number of conflicting stores that had to wait for a store. */
     Stats::Scalar<> conflictingStores;
 };
 
-#endif // __CPU_O3_CPU_MEM_DEP_UNIT_HH__
+#endif // __CPU_O3_MEM_DEP_UNIT_HH__
diff --git a/cpu/o3/mem_dep_unit_impl.hh b/cpu/o3/mem_dep_unit_impl.hh
index 296db4c4e..771a0505e 100644
--- a/cpu/o3/mem_dep_unit_impl.hh
+++ b/cpu/o3/mem_dep_unit_impl.hh
@@ -28,13 +28,56 @@
 
 #include <map>
 
+#include "cpu/o3/inst_queue.hh"
 #include "cpu/o3/mem_dep_unit.hh"
 
 template <class MemDepPred, class Impl>
-MemDepUnit<MemDepPred, Impl>::MemDepUnit(Params &params)
-    : depPred(params.SSITSize, params.LFSTSize)
+MemDepUnit<MemDepPred, Impl>::MemDepUnit(Params *params)
+    : depPred(params->SSITSize, params->LFSTSize), loadBarrier(false),
+      loadBarrierSN(0), storeBarrier(false), storeBarrierSN(0), iqPtr(NULL)
 {
-    DPRINTF(MemDepUnit, "MemDepUnit: Creating MemDepUnit object.\n");
+    DPRINTF(MemDepUnit, "Creating MemDepUnit object.\n");
+}
+
+/**
+ * Tears the unit down.  For every thread, each instruction still on the
+ * instruction list has its hash entry removed and is then taken off the
+ * list.  Afterwards the MemDepEntry live-object counter must be zero.
+ */
+template <class MemDepPred, class Impl>
+MemDepUnit<MemDepPred, Impl>::~MemDepUnit()
+{
+    for (int tid = 0; tid < Impl::MaxThreads; ++tid) {
+        std::list<DynInstPtr> &thread_insts = instList[tid];
+
+        while (!thread_insts.empty()) {
+            // Every listed instruction must have a matching hash entry.
+            MemDepHashIt entry_it =
+                memDepHash.find(thread_insts.front()->seqNum);
+
+            assert(entry_it != memDepHash.end());
+
+            memDepHash.erase(entry_it);
+
+            thread_insts.pop_front();
+        }
+    }
+
+    assert(MemDepEntry::memdep_count == 0);
+}
+
+/** Returns the fixed name string of the memory dependence unit. */
+template <class MemDepPred, class Impl>
+std::string
+MemDepUnit<MemDepPred, Impl>::name() const
+{
+    return std::string("memdepunit");
+}
+
+/**
+ * Second-stage initialization: records the thread id and initializes the
+ * dependence predictor with the SSIT and LFST sizes from the parameters.
+ *
+ * @param params Parameter object supplying SSITSize and LFSTSize.
+ * @param tid    Thread id stored as this unit's id.
+ */
+template <class MemDepPred, class Impl>
+void
+MemDepUnit<MemDepPred, Impl>::init(Params *params, int tid)
+{
+    DPRINTF(MemDepUnit, "Creating MemDepUnit %i object.\n",tid);
+
+    this->id = tid;
+
+    this->depPred.init(params->SSITSize, params->LFSTSize);
+}
 
 template <class MemDepPred, class Impl>
@@ -58,58 +101,79 @@ MemDepUnit<MemDepPred, Impl>::regStats()
         .desc("Number of conflicting stores.");
 }
 
+/**
+ * Stores the pointer to the instruction queue this unit works with.
+ *
+ * @param iq_ptr Instruction queue pointer to remember.
+ */
+template <class MemDepPred, class Impl>
+void
+MemDepUnit<MemDepPred, Impl>::setIQ(InstructionQueue<Impl> *iq_ptr)
+{
+    this->iqPtr = iq_ptr;
+}
+
 template <class MemDepPred, class Impl>
 void
 MemDepUnit<MemDepPred, Impl>::insert(DynInstPtr &inst)
 {
-    InstSeqNum inst_seq_num = inst->seqNum;
+    unsigned tid = inst->threadNumber;
+
+    MemDepEntryPtr inst_entry = new MemDepEntry(inst);
 
-    Dependency unresolved_dependencies(inst_seq_num);
+    // Add the MemDepEntry to the hash.
+    memDepHash.insert(
+        std::pair<InstSeqNum, MemDepEntryPtr>(inst->seqNum, inst_entry));
+    MemDepEntry::memdep_insert++;
 
-    InstSeqNum producing_store = depPred.checkInst(inst->readPC());
+    // Add the instruction to the instruction list.
+    instList[tid].push_back(inst);
 
-    if (producing_store == 0 ||
-        storeDependents.find(producing_store) == storeDependents.end()) {
+    inst_entry->listIt = --(instList[tid].end());
 
-        DPRINTF(MemDepUnit, "MemDepUnit: No dependency for inst PC "
-                "%#x.\n", inst->readPC());
+    // Check the dependence predictor for any producing stores.
+    InstSeqNum producing_store;
+    if (inst->isLoad() && loadBarrier) {
+        producing_store = loadBarrierSN;
+    } else if (inst->isStore() && storeBarrier) {
+        producing_store = storeBarrierSN;
+    } else {
+        producing_store = depPred.checkInst(inst->readPC());
+    }
 
-        unresolved_dependencies.storeDep = storeDependents.end();
+    MemDepEntryPtr store_entry = NULL;
+
+    // If there is a producing store, try to find the entry.
+    if (producing_store != 0) {
+        MemDepHashIt hash_it = memDepHash.find(producing_store);
+
+        if (hash_it != memDepHash.end()) {
+            store_entry = (*hash_it).second;
+        }
+    }
+
+    // If no store entry, then instruction can issue as soon as the registers
+    // are ready.
+    if (!store_entry) {
+        DPRINTF(MemDepUnit, "No dependency for inst PC "
+                "%#x [sn:%lli].\n", inst->readPC(), inst->seqNum);
+
+        inst_entry->memDepReady = true;
 
         if (inst->readyToIssue()) {
-            readyInsts.insert(inst_seq_num);
-        } else {
-            unresolved_dependencies.memDepReady = true;
+            inst_entry->regsReady = true;
 
-            waitingInsts.insert(unresolved_dependencies);
+            moveToReady(inst_entry);
         }
     } else {
-        DPRINTF(MemDepUnit, "MemDepUnit: Adding to dependency list; "
-                "inst PC %#x is dependent on seq num %i.\n",
+        // Otherwise make the instruction dependent on the store.
+        DPRINTF(MemDepUnit, "Adding to dependency list; "
+                "inst PC %#x is dependent on [sn:%lli].\n",
                 inst->readPC(), producing_store);
 
         if (inst->readyToIssue()) {
-            unresolved_dependencies.regsReady = true;
+            inst_entry->regsReady = true;
         }
 
-        // Find the store that this instruction is dependent on.
-        sd_it_t store_loc = storeDependents.find(producing_store);
-
-        assert(store_loc != storeDependents.end());
-
-        // Record the location of the store that this instruction is
-        // dependent on.
-        unresolved_dependencies.storeDep = store_loc;
-
-        // If it's not already ready, then add it to the renamed
-        // list and the dependencies.
-        dep_it_t inst_loc =
-            (waitingInsts.insert(unresolved_dependencies)).first;
-
         // Add this instruction to the list of dependents.
-        (*store_loc).second.push_back(inst_loc);
+        store_entry->dependInsts.push_back(inst_entry);
 
-        assert(!(*store_loc).second.empty());
+//        inst_entry->producingStore = store_entry;
 
         if (inst->isLoad()) {
             ++conflictingLoads;
@@ -119,277 +183,288 @@ MemDepUnit<MemDepPred, Impl>::insert(DynInstPtr &inst)
     }
 
     if (inst->isStore()) {
-        DPRINTF(MemDepUnit, "MemDepUnit: Inserting store PC %#x.\n",
-                inst->readPC());
-
-        depPred.insertStore(inst->readPC(), inst_seq_num);
-
-        // Make sure this store isn't already in this list.
-        assert(storeDependents.find(inst_seq_num) == storeDependents.end());
-
-        // Put a dependency entry in at the store's sequence number.
-        // Uh, not sure how this works...I want to create an entry but
-        // I don't have anything to put into the value yet.
-        storeDependents[inst_seq_num];
+        DPRINTF(MemDepUnit, "Inserting store PC %#x [sn:%lli].\n",
+                inst->readPC(), inst->seqNum);
 
-        assert(storeDependents.size() != 0);
+        depPred.insertStore(inst->readPC(), inst->seqNum, inst->threadNumber);
 
         ++insertedStores;
-
     } else if (inst->isLoad()) {
         ++insertedLoads;
     } else {
-        panic("MemDepUnit: Unknown type! (most likely a barrier).");
+        panic("Unknown type! (most likely a barrier).");
     }
-
-    memInsts[inst_seq_num] = inst;
 }
 
 template <class MemDepPred, class Impl>
 void
 MemDepUnit<MemDepPred, Impl>::insertNonSpec(DynInstPtr &inst)
 {
-    InstSeqNum inst_seq_num = inst->seqNum;
+    unsigned tid = inst->threadNumber;
 
-    Dependency non_spec_inst(inst_seq_num);
+    MemDepEntryPtr inst_entry = new MemDepEntry(inst);
 
-    non_spec_inst.storeDep = storeDependents.end();
+    // Insert the MemDepEntry into the hash.
+    memDepHash.insert(
+        std::pair<InstSeqNum, MemDepEntryPtr>(inst->seqNum, inst_entry));
+    MemDepEntry::memdep_insert++;
 
-    waitingInsts.insert(non_spec_inst);
+    // Add the instruction to the list.
+    instList[tid].push_back(inst);
+
+    inst_entry->listIt = --(instList[tid].end());
 
     // Might want to turn this part into an inline function or something.
     // It's shared between both insert functions.
     if (inst->isStore()) {
-        DPRINTF(MemDepUnit, "MemDepUnit: Inserting store PC %#x.\n",
-                inst->readPC());
-
-        depPred.insertStore(inst->readPC(), inst_seq_num);
-
-        // Make sure this store isn't already in this list.
-        assert(storeDependents.find(inst_seq_num) == storeDependents.end());
-
-        // Put a dependency entry in at the store's sequence number.
-        // Uh, not sure how this works...I want to create an entry but
-        // I don't have anything to put into the value yet.
-        storeDependents[inst_seq_num];
+        DPRINTF(MemDepUnit, "Inserting store PC %#x [sn:%lli].\n",
+                inst->readPC(), inst->seqNum);
 
-        assert(storeDependents.size() != 0);
+        depPred.insertStore(inst->readPC(), inst->seqNum, inst->threadNumber);
 
         ++insertedStores;
-
     } else if (inst->isLoad()) {
         ++insertedLoads;
     } else {
-        panic("MemDepUnit: Unknown type! (most likely a barrier).");
+        panic("Unknown type! (most likely a barrier).");
     }
-
-    memInsts[inst_seq_num] = inst;
 }
 
 template <class MemDepPred, class Impl>
-typename Impl::DynInstPtr &
-MemDepUnit<MemDepPred, Impl>::top()
+void
+MemDepUnit<MemDepPred, Impl>::insertBarrier(DynInstPtr &barr_inst)
 {
-    topInst = memInsts.find( (*readyInsts.begin()) );
+    InstSeqNum barr_sn = barr_inst->seqNum;
+    if (barr_inst->isMemBarrier()) {
+        loadBarrier = true;
+        loadBarrierSN = barr_sn;
+        storeBarrier = true;
+        storeBarrierSN = barr_sn;
+        DPRINTF(MemDepUnit, "Inserted a memory barrier\n");
+    } else if (barr_inst->isWriteBarrier()) {
+        storeBarrier = true;
+        storeBarrierSN = barr_sn;
+        DPRINTF(MemDepUnit, "Inserted a write barrier\n");
+    }
+
+    unsigned tid = barr_inst->threadNumber;
 
-    DPRINTF(MemDepUnit, "MemDepUnit: Top instruction is PC %#x.\n",
-            (*topInst).second->readPC());
+    MemDepEntryPtr inst_entry = new MemDepEntry(barr_inst);
 
-    return (*topInst).second;
+    // Add the MemDepEntry to the hash.
+    memDepHash.insert(
+        std::pair<InstSeqNum, MemDepEntryPtr>(barr_sn, inst_entry));
+    MemDepEntry::memdep_insert++;
+
+    // Add the instruction to the instruction list.
+    instList[tid].push_back(barr_inst);
+
+    inst_entry->listIt = --(instList[tid].end());
 }
 
 template <class MemDepPred, class Impl>
 void
-MemDepUnit<MemDepPred, Impl>::pop()
+MemDepUnit<MemDepPred, Impl>::regsReady(DynInstPtr &inst)
 {
-    DPRINTF(MemDepUnit, "MemDepUnit: Removing instruction PC %#x.\n",
-            (*topInst).second->readPC());
+    DPRINTF(MemDepUnit, "Marking registers as ready for "
+            "instruction PC %#x [sn:%lli].\n",
+            inst->readPC(), inst->seqNum);
 
-    wakeDependents((*topInst).second);
+    MemDepEntryPtr inst_entry = findInHash(inst);
 
-    issue((*topInst).second);
+    inst_entry->regsReady = true;
 
-    memInsts.erase(topInst);
+    if (inst_entry->memDepReady) {
+        DPRINTF(MemDepUnit, "Instruction has its memory "
+                "dependencies resolved, adding it to the ready list.\n");
 
-    topInst = memInsts.end();
+        moveToReady(inst_entry);
+    } else {
+        DPRINTF(MemDepUnit, "Instruction still waiting on "
+                "memory dependency.\n");
+    }
 }
 
 template <class MemDepPred, class Impl>
 void
-MemDepUnit<MemDepPred, Impl>::regsReady(DynInstPtr &inst)
+MemDepUnit<MemDepPred, Impl>::nonSpecInstReady(DynInstPtr &inst)
 {
-    DPRINTF(MemDepUnit, "MemDepUnit: Marking registers as ready for "
-            "instruction PC %#x.\n",
-            inst->readPC());
+    DPRINTF(MemDepUnit, "Marking non speculative "
+            "instruction PC %#x as ready [sn:%lli].\n",
+            inst->readPC(), inst->seqNum);
 
-    InstSeqNum inst_seq_num = inst->seqNum;
+    MemDepEntryPtr inst_entry = findInHash(inst);
 
-    Dependency inst_to_find(inst_seq_num);
+    moveToReady(inst_entry);
+}
 
-    dep_it_t waiting_inst = waitingInsts.find(inst_to_find);
+template <class MemDepPred, class Impl>
+void
+MemDepUnit<MemDepPred, Impl>::reschedule(DynInstPtr &inst)
+{
+    instsToReplay.push_back(inst);
+}
 
-    assert(waiting_inst != waitingInsts.end());
+template <class MemDepPred, class Impl>
+void
+MemDepUnit<MemDepPred, Impl>::replay(DynInstPtr &inst)
+{
+    DynInstPtr temp_inst;
+    bool found_inst = false;
 
-    if ((*waiting_inst).memDepReady) {
-        DPRINTF(MemDepUnit, "MemDepUnit: Instruction has its memory "
-                "dependencies resolved, adding it to the ready list.\n");
+    while (!instsToReplay.empty()) {
+        temp_inst = instsToReplay.front();
 
-        moveToReady(waiting_inst);
-    } else {
-        DPRINTF(MemDepUnit, "MemDepUnit: Instruction still waiting on "
-                "memory dependency.\n");
+        MemDepEntryPtr inst_entry = findInHash(temp_inst);
+
+        DPRINTF(MemDepUnit, "Replaying mem instruction PC %#x "
+                "[sn:%lli].\n",
+                temp_inst->readPC(), temp_inst->seqNum);
 
-        (*waiting_inst).regsReady = true;
+        moveToReady(inst_entry);
+
+        if (temp_inst == inst) {
+            found_inst = true;
+        }
+
+        instsToReplay.pop_front();
     }
+
+    assert(found_inst);
 }
 
 template <class MemDepPred, class Impl>
 void
-MemDepUnit<MemDepPred, Impl>::nonSpecInstReady(DynInstPtr &inst)
+MemDepUnit<MemDepPred, Impl>::completed(DynInstPtr &inst)
 {
-    DPRINTF(MemDepUnit, "MemDepUnit: Marking non speculative "
-            "instruction PC %#x as ready.\n",
-            inst->readPC());
+    DPRINTF(MemDepUnit, "Completed mem instruction PC %#x "
+            "[sn:%lli].\n",
+            inst->readPC(), inst->seqNum);
+
+    unsigned tid = inst->threadNumber;
+
+    // Remove the instruction from the hash and the list.
+    MemDepHashIt hash_it = memDepHash.find(inst->seqNum);
 
-    InstSeqNum inst_seq_num = inst->seqNum;
+    assert(hash_it != memDepHash.end());
 
-    Dependency inst_to_find(inst_seq_num);
+    instList[tid].erase((*hash_it).second->listIt);
 
-    dep_it_t waiting_inst = waitingInsts.find(inst_to_find);
+//    (*hash_it).second->inst = NULL;
 
-    assert(waiting_inst != waitingInsts.end());
+    (*hash_it).second = NULL;
 
-    moveToReady(waiting_inst);
+    memDepHash.erase(hash_it);
+    MemDepEntry::memdep_erase++;
 }
 
 template <class MemDepPred, class Impl>
 void
-MemDepUnit<MemDepPred, Impl>::issue(DynInstPtr &inst)
+MemDepUnit<MemDepPred, Impl>::completeBarrier(DynInstPtr &inst)
 {
-    assert(readyInsts.find(inst->seqNum) != readyInsts.end());
-
-    DPRINTF(MemDepUnit, "MemDepUnit: Issuing instruction PC %#x.\n",
-            inst->readPC());
-
-    // Remove the instruction from the ready list.
-    readyInsts.erase(inst->seqNum);
-
-    depPred.issued(inst->readPC(), inst->seqNum, inst->isStore());
+    wakeDependents(inst);
+    completed(inst);
+
+    InstSeqNum barr_sn = inst->seqNum;
+
+    if (inst->isMemBarrier()) {
+        assert(loadBarrier && storeBarrier);
+        if (loadBarrierSN == barr_sn)
+            loadBarrier = false;
+        if (storeBarrierSN == barr_sn)
+            storeBarrier = false;
+    } else if (inst->isWriteBarrier()) {
+        assert(storeBarrier);
+        if (storeBarrierSN == barr_sn)
+            storeBarrier = false;
+    }
 }
 
 template <class MemDepPred, class Impl>
 void
 MemDepUnit<MemDepPred, Impl>::wakeDependents(DynInstPtr &inst)
 {
-    // Only stores have dependents.
-    if (!inst->isStore()) {
+    // Only stores and barriers have dependents.
+    if (!inst->isStore() && !inst->isMemBarrier() && !inst->isWriteBarrier()) {
         return;
     }
 
-    // Wake any dependencies.
-    sd_it_t sd_it = storeDependents.find(inst->seqNum);
+    MemDepEntryPtr inst_entry = findInHash(inst);
 
-    // If there's no entry, then return.  Really there should only be
-    // no entry if the instruction is a load.
-    if (sd_it == storeDependents.end()) {
-        DPRINTF(MemDepUnit, "MemDepUnit: Instruction PC %#x, sequence "
-                "number %i has no dependents.\n",
-                inst->readPC(), inst->seqNum);
-
-        return;
-    }
+    for (int i = 0; i < inst_entry->dependInsts.size(); ++i ) {
+        MemDepEntryPtr woken_inst = inst_entry->dependInsts[i];
 
-    for (int i = 0; i < (*sd_it).second.size(); ++i ) {
-        dep_it_t woken_inst = (*sd_it).second[i];
-
-        DPRINTF(MemDepUnit, "MemDepUnit: Waking up a dependent inst, "
-                "sequence number %i.\n",
-                (*woken_inst).seqNum);
-#if 0
-        // Should we have reached instructions that are actually squashed,
-        // there will be no more useful instructions in this dependency
-        // list.  Break out early.
-        if (waitingInsts.find(woken_inst) == waitingInsts.end()) {
-            DPRINTF(MemDepUnit, "MemDepUnit: Dependents on inst PC %#x "
-                    "are squashed, starting at SN %i.  Breaking early.\n",
-                    inst->readPC(), woken_inst);
-            break;
+        if (!woken_inst->inst) {
+            // Potentially removed mem dep entries could be on this list
+//            inst_entry->dependInsts[i] = NULL;
+            continue;
         }
-#endif
 
-        if ((*woken_inst).regsReady) {
+        DPRINTF(MemDepUnit, "Waking up a dependent inst, "
+                "[sn:%lli].\n",
+                woken_inst->inst->seqNum);
+
+        if (woken_inst->regsReady && !woken_inst->squashed) {
             moveToReady(woken_inst);
         } else {
-            (*woken_inst).memDepReady = true;
+            woken_inst->memDepReady = true;
         }
+//        inst_entry->dependInsts[i] = NULL;
     }
 
-    storeDependents.erase(sd_it);
+    inst_entry->dependInsts.clear();
 }
 
 template <class MemDepPred, class Impl>
 void
-MemDepUnit<MemDepPred, Impl>::squash(const InstSeqNum &squashed_num)
+MemDepUnit<MemDepPred, Impl>::squash(const InstSeqNum &squashed_num,
+                                     unsigned tid)
 {
-
-    if (!waitingInsts.empty()) {
-        dep_it_t waiting_it = waitingInsts.end();
-
-        --waiting_it;
-
-        // Remove entries from the renamed list as long as we haven't reached
-        // the end and the entries continue to be younger than the squashed.
-        while (!waitingInsts.empty() &&
-               (*waiting_it).seqNum > squashed_num)
-        {
-            if (!(*waiting_it).memDepReady &&
-                (*waiting_it).storeDep != storeDependents.end()) {
-                sd_it_t sd_it = (*waiting_it).storeDep;
-
-                // Make sure the iterator that the store has pointing
-                // back is actually to this instruction.
-                assert((*sd_it).second.back() == waiting_it);
-
-                // Now remove this from the store's list of dependent
-                // instructions.
-                (*sd_it).second.pop_back();
+    if (!instsToReplay.empty()) {
+        ListIt replay_it = instsToReplay.begin();
+        while (replay_it != instsToReplay.end()) {
+            if ((*replay_it)->threadNumber == tid &&
+                (*replay_it)->seqNum > squashed_num) {
+                instsToReplay.erase(replay_it++);
+            } else {
+                ++replay_it;
             }
-
-            waitingInsts.erase(waiting_it--);
         }
     }
 
-    if (!readyInsts.empty()) {
-        sn_it_t ready_it = readyInsts.end();
+    ListIt squash_it = instList[tid].end();
+    --squash_it;
 
-        --ready_it;
+    MemDepHashIt hash_it;
 
-        // Same for the ready list.
-        while (!readyInsts.empty() &&
-               (*ready_it) > squashed_num)
-        {
-            readyInsts.erase(ready_it--);
-        }
-    }
+    while (!instList[tid].empty() &&
+           (*squash_it)->seqNum > squashed_num) {
 
-    if (!storeDependents.empty()) {
-        sd_it_t dep_it = storeDependents.end();
+        DPRINTF(MemDepUnit, "Squashing inst [sn:%lli]\n",
+                (*squash_it)->seqNum);
 
-        --dep_it;
+        hash_it = memDepHash.find((*squash_it)->seqNum);
 
-        // Same for the dependencies list.
-        while (!storeDependents.empty() &&
-               (*dep_it).first > squashed_num)
-        {
-            // This store's list of dependent instructions should be empty.
-            assert((*dep_it).second.empty());
+        assert(hash_it != memDepHash.end());
 
-            storeDependents.erase(dep_it--);
+        (*hash_it).second->squashed = true;
+/*
+        for (int i = 0; i < (*hash_it).second->dependInsts.size(); ++i) {
+            (*hash_it).second->dependInsts[i] = NULL;
         }
+
+        (*hash_it).second->inst = NULL;
+*/
+        (*hash_it).second = NULL;
+
+        memDepHash.erase(hash_it);
+        MemDepEntry::memdep_erase++;
+
+        instList[tid].erase(squash_it--);
     }
 
     // Tell the dependency predictor to squash as well.
-    depPred.squash(squashed_num);
+    depPred.squash(squashed_num, tid);
 }
 
 template <class MemDepPred, class Impl>
@@ -397,23 +472,72 @@ void
 MemDepUnit<MemDepPred, Impl>::violation(DynInstPtr &store_inst,
                                         DynInstPtr &violating_load)
 {
-    DPRINTF(MemDepUnit, "MemDepUnit: Passing violating PCs to store sets,"
+    DPRINTF(MemDepUnit, "Passing violating PCs to store sets,"
             " load: %#x, store: %#x\n", violating_load->readPC(),
             store_inst->readPC());
     // Tell the memory dependence unit of the violation.
     depPred.violation(violating_load->readPC(), store_inst->readPC());
 }
 
+template <class MemDepPred, class Impl>
+void
+MemDepUnit<MemDepPred, Impl>::issue(DynInstPtr &inst)
+{
+    DPRINTF(MemDepUnit, "Issuing instruction PC %#x [sn:%lli].\n",
+            inst->readPC(), inst->seqNum);
+
+    depPred.issued(inst->readPC(), inst->seqNum, inst->isStore());
+}
+
+template <class MemDepPred, class Impl>
+inline typename MemDepUnit<MemDepPred,Impl>::MemDepEntryPtr &
+MemDepUnit<MemDepPred, Impl>::findInHash(const DynInstPtr &inst)
+{
+    MemDepHashIt hash_it = memDepHash.find(inst->seqNum);
+
+    assert(hash_it != memDepHash.end());
+
+    return (*hash_it).second;
+}
+
 template <class MemDepPred, class Impl>
 inline void
-MemDepUnit<MemDepPred, Impl>::moveToReady(dep_it_t &woken_inst)
+MemDepUnit<MemDepPred, Impl>::moveToReady(MemDepEntryPtr &woken_inst_entry)
+{
+    DPRINTF(MemDepUnit, "Adding instruction [sn:%lli] "
+            "to the ready list.\n", woken_inst_entry->inst->seqNum);
+
+    assert(!woken_inst_entry->squashed);
+
+    iqPtr->addReadyMemInst(woken_inst_entry->inst);
+}
+
+
+template <class MemDepPred, class Impl>
+void
+MemDepUnit<MemDepPred, Impl>::dumpLists()
 {
-    DPRINTF(MemDepUnit, "MemDepUnit: Adding instruction sequence number %i "
-            "to the ready list.\n", (*woken_inst).seqNum);
+    for (unsigned tid=0; tid < Impl::MaxThreads; tid++) {
+        cprintf("Instruction list %i size: %i\n",
+                tid, instList[tid].size());
+
+        ListIt inst_list_it = instList[tid].begin();
+        int num = 0;
+
+        while (inst_list_it != instList[tid].end()) {
+            cprintf("Instruction:%i\nPC:%#x\n[sn:%i]\n[tid:%i]\nIssued:%i\n"
+                    "Squashed:%i\n\n",
+                    num, (*inst_list_it)->readPC(),
+                    (*inst_list_it)->seqNum,
+                    (*inst_list_it)->threadNumber,
+                    (*inst_list_it)->isIssued(),
+                    (*inst_list_it)->isSquashed());
+            inst_list_it++;
+            ++num;
+        }
+    }
 
-    // Add it to the ready list.
-    readyInsts.insert((*woken_inst).seqNum);
+    cprintf("Memory dependence hash size: %i\n", memDepHash.size());
 
-    // Remove it from the waiting instructions.
-    waitingInsts.erase(woken_inst);
+    cprintf("Memory dependence entries: %i\n", MemDepEntry::memdep_count);
 }
diff --git a/cpu/o3/ras.cc b/cpu/o3/ras.cc
index 0a7d6ca63..5e7ef38ae 100644
--- a/cpu/o3/ras.cc
+++ b/cpu/o3/ras.cc
@@ -28,14 +28,17 @@
 
 #include "cpu/o3/ras.hh"
 
-ReturnAddrStack::ReturnAddrStack(unsigned _numEntries)
-    : numEntries(_numEntries), usedEntries(0),
-      tos(0)
+void
+ReturnAddrStack::init(unsigned _numEntries)
 {
-    addrStack = new Addr[numEntries];
+     numEntries  = _numEntries;
+     usedEntries = 0;
+     tos = 0;
+
+     addrStack.resize(numEntries);
 
-    for (int i = 0; i < numEntries; ++i)
-        addrStack[i] = 0;
+     for (int i = 0; i < numEntries; ++i)
+         addrStack[i] = 0;
 }
 
 void
@@ -53,9 +56,6 @@ ReturnAddrStack::push(const Addr &return_addr)
 void
 ReturnAddrStack::pop()
 {
-    // Not sure it's possible to really track usedEntries properly.
-//    assert(usedEntries > 0);
-
     if (usedEntries > 0) {
         --usedEntries;
     }
diff --git a/cpu/o3/ras.hh b/cpu/o3/ras.hh
index 46d98181e..5aa4fc05f 100644
--- a/cpu/o3/ras.hh
+++ b/cpu/o3/ras.hh
@@ -26,43 +26,68 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __CPU_O3_CPU_RAS_HH__
-#define __CPU_O3_CPU_RAS_HH__
+#ifndef __CPU_O3_RAS_HH__
+#define __CPU_O3_RAS_HH__
 
 // For Addr type.
 #include "arch/isa_traits.hh"
+#include <vector>
 
+/** Return address stack class, implements a simple RAS. */
 class ReturnAddrStack
 {
   public:
-    ReturnAddrStack(unsigned numEntries);
+    /** Creates a return address stack, but init() must be called prior to
+     *  use.
+     */
+    ReturnAddrStack() {}
 
+    /** Initializes RAS with a specified number of entries.
+     *  @param numEntries Number of entries in the RAS.
+     */
+    void init(unsigned numEntries);
+
+    /** Returns the top address on the RAS. */
     Addr top()
     { return addrStack[tos]; }
 
+    /** Returns the index of the top of the RAS. */
     unsigned topIdx()
     { return tos; }
 
+    /** Pushes an address onto the RAS. */
     void push(const Addr &return_addr);
 
+    /** Pops the top address from the RAS. */
     void pop();
 
+    /** Changes index to the top of the RAS, and replaces the top address with
+     *  a new target.
+     *  @param top_entry_idx The index of the RAS that will now be the top.
+     *  @param restored_target The new target address of the new top of the RAS.
+     */
     void restore(unsigned top_entry_idx, const Addr &restored_target);
 
   private:
+    /** Increments the top of stack index. */
     inline void incrTos()
     { if (++tos == numEntries) tos = 0; }
 
+    /** Decrements the top of stack index. */
     inline void decrTos()
     { tos = (tos == 0 ? numEntries - 1 : tos - 1); }
 
-    Addr *addrStack;
+    /** The RAS itself. */
+    std::vector<Addr> addrStack;
 
+    /** The number of entries in the RAS. */
     unsigned numEntries;
 
+    /** The number of used entries in the RAS. */
     unsigned usedEntries;
 
+    /** The top of stack index. */
     unsigned tos;
 };
 
-#endif // __CPU_O3_CPU_RAS_HH__
+#endif // __CPU_O3_RAS_HH__
diff --git a/cpu/o3/regfile.hh b/cpu/o3/regfile.hh
index 1e6e10f29..78674c32c 100644
--- a/cpu/o3/regfile.hh
+++ b/cpu/o3/regfile.hh
@@ -26,10 +26,8 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __CPU_O3_CPU_REGFILE_HH__
-#define __CPU_O3_CPU_REGFILE_HH__
-
-// @todo: Destructor
+#ifndef __CPU_O3_REGFILE_HH__
+#define __CPU_O3_REGFILE_HH__
 
 #include "arch/isa_traits.hh"
 #include "arch/faults.hh"
@@ -42,11 +40,14 @@
 
 #endif
 
-// This really only depends on the ISA, and not the Impl.  It might be nicer
-// to see if I can make it depend on nothing...
-// Things that are in the ifdef FULL_SYSTEM are pretty dependent on the ISA,
-// and should go in the AlphaFullCPU.
+#include <vector>
 
+/**
+ * Simple physical register file class.
+ * This really only depends on the ISA, and not the Impl. Things that are
+ * in the ifdef FULL_SYSTEM are pretty dependent on the ISA, and probably
+ * should go in the AlphaFullCPU.
+ */
 template <class Impl>
 class PhysRegFile
 {
@@ -55,19 +56,18 @@ class PhysRegFile
     typedef TheISA::FloatReg FloatReg;
     typedef TheISA::MiscRegFile MiscRegFile;
     typedef TheISA::MiscReg MiscReg;
+    // Note that most of the definitions of the IntReg, FloatReg, etc. exist
+    // within the Impl/ISA class and not within this PhysRegFile class.
 
-    //Note that most of the definitions of the IntReg, FloatReg, etc. exist
-    //within the Impl/ISA class and not within this PhysRegFile class.
-
-    //Will need some way to allow stuff like swap_palshadow to access the
-    //correct registers.  Might require code changes to swap_palshadow and
-    //other execution contexts.
-
-    //Will make these registers public for now, but they probably should
-    //be private eventually with some accessor functions.
+    // Will make these registers public for now, but they probably should
+    // be private eventually with some accessor functions.
   public:
     typedef typename Impl::FullCPU FullCPU;
 
+    /**
+     * Constructs a physical register file with the specified amount of
+     * integer and floating point registers.
+     */
     PhysRegFile(unsigned _numPhysicalIntRegs,
                 unsigned _numPhysicalFloatRegs);
 
@@ -80,6 +80,7 @@ class PhysRegFile
 //    void serialize(std::ostream &os);
 //    void unserialize(Checkpoint *cp, const std::string &section);
 
+    /** Reads an integer register. */
     uint64_t readIntReg(PhysRegIndex reg_idx)
     {
         assert(reg_idx < numPhysicalIntRegs);
@@ -89,6 +90,7 @@ class PhysRegFile
         return intRegFile[reg_idx];
     }
 
+    /** Reads a floating point register (single precision). */
     float readFloatRegSingle(PhysRegIndex reg_idx)
     {
         // Remove the base Float reg dependency.
@@ -102,6 +104,7 @@ class PhysRegFile
         return (float)floatRegFile[reg_idx].d;
     }
 
+    /** Reads a floating point register (double precision). */
     double readFloatRegDouble(PhysRegIndex reg_idx)
     {
         // Remove the base Float reg dependency.
@@ -115,6 +118,7 @@ class PhysRegFile
         return floatRegFile[reg_idx].d;
     }
 
+    /** Reads a floating point register as an integer. */
     uint64_t readFloatRegInt(PhysRegIndex reg_idx)
     {
         // Remove the base Float reg dependency.
@@ -128,6 +132,7 @@ class PhysRegFile
         return floatRegFile[reg_idx].q;
     }
 
+    /** Sets an integer register to the given value. */
     void setIntReg(PhysRegIndex reg_idx, uint64_t val)
     {
         assert(reg_idx < numPhysicalIntRegs);
@@ -135,9 +140,11 @@ class PhysRegFile
         DPRINTF(IEW, "RegFile: Setting int register %i to %lli\n",
                 int(reg_idx), val);
 
-        intRegFile[reg_idx] = val;
+        if (reg_idx != TheISA::ZeroReg)
+            intRegFile[reg_idx] = val;
     }
 
+    /** Sets a single precision floating point register to the given value. */
     void setFloatRegSingle(PhysRegIndex reg_idx, float val)
     {
         // Remove the base Float reg dependency.
@@ -148,9 +155,11 @@ class PhysRegFile
         DPRINTF(IEW, "RegFile: Setting float register %i to %8.8f\n",
                 int(reg_idx), val);
 
-        floatRegFile[reg_idx].d = (double)val;
+        if (reg_idx != TheISA::ZeroReg)
+            floatRegFile[reg_idx].d = (double)val;
     }
 
+    /** Sets a double precision floating point register to the given value. */
     void setFloatRegDouble(PhysRegIndex reg_idx, double val)
     {
         // Remove the base Float reg dependency.
@@ -161,9 +170,11 @@ class PhysRegFile
         DPRINTF(IEW, "RegFile: Setting float register %i to %8.8f\n",
                 int(reg_idx), val);
 
-        floatRegFile[reg_idx].d = val;
+        if (reg_idx != TheISA::ZeroReg)
+            floatRegFile[reg_idx].d = val;
     }
 
+    /** Sets a floating point register to the given integer value. */
     void setFloatRegInt(PhysRegIndex reg_idx, uint64_t val)
     {
         // Remove the base Float reg dependency.
@@ -174,78 +185,68 @@ class PhysRegFile
         DPRINTF(IEW, "RegFile: Setting float register %i to %lli\n",
                 int(reg_idx), val);
 
-        floatRegFile[reg_idx].q = val;
-    }
-
-    uint64_t readPC()
-    {
-        return pc;
+        if (reg_idx != TheISA::ZeroReg)
+            floatRegFile[reg_idx].q = val;
     }
 
-    void setPC(uint64_t val)
+    // Consider moving the accessors below into an implementation-specific
+    // file instead of the general register file, or into a derived class.
+    MiscReg readMiscReg(int misc_reg, unsigned thread_id)
     {
-        pc = val;
+        return miscRegs[thread_id].readReg(misc_reg);
     }
 
-    void setNextPC(uint64_t val)
+    MiscReg readMiscRegWithEffect(int misc_reg, Fault &fault,
+                                  unsigned thread_id)
     {
-        npc = val;
+        return miscRegs[thread_id].readRegWithEffect(misc_reg, fault,
+                                                     cpu->xcProxies[thread_id]);
     }
 
-    //Consider leaving this stuff and below in some implementation specific
-    //file as opposed to the general register file.  Or have a derived class.
-    MiscReg readMiscReg(int misc_reg)
+    Fault setMiscReg(int misc_reg, const MiscReg &val, unsigned thread_id)
     {
-        // Dummy function for now.
-        // @todo: Fix this once proxy XC is used.
-        return 0;
+        return miscRegs[thread_id].setReg(misc_reg, val);
     }
 
-    Fault setMiscReg(int misc_reg, const MiscReg &val)
+    Fault setMiscRegWithEffect(int misc_reg, const MiscReg &val,
+                               unsigned thread_id)
     {
-        // Dummy function for now.
-        // @todo: Fix this once proxy XC is used.
-        return NoFault;
+        return miscRegs[thread_id].setRegWithEffect(misc_reg, val,
+                                                    cpu->xcProxies[thread_id]);
     }
 
 #if FULL_SYSTEM
     int readIntrFlag() { return intrflag; }
+    /** Sets an interrupt flag. */
     void setIntrFlag(int val) { intrflag = val; }
 #endif
 
-    // These should be private eventually, but will be public for now
-    // so that I can hack around the initregs issue.
   public:
     /** (signed) integer register file. */
-    IntReg *intRegFile;
+    std::vector<IntReg> intRegFile;
 
     /** Floating point register file. */
-    FloatReg *floatRegFile;
+    std::vector<FloatReg> floatRegFile;
 
     /** Miscellaneous register file. */
-    MiscRegFile miscRegs;
-
-    /** Program counter. */
-    Addr pc;
-
-    /** Next-cycle program counter. */
-    Addr npc;
+    MiscRegFile miscRegs[Impl::MaxThreads];
 
 #if FULL_SYSTEM
   private:
-    // This is ISA specifc stuff; remove it eventually once ISAImpl is used
-//    IntReg palregs[NumIntRegs];	// PAL shadow registers
     int intrflag;			// interrupt flag
-    bool pal_shadow;		// using pal_shadow registers
 #endif
 
   private:
+    /** CPU pointer. */
     FullCPU *cpu;
 
   public:
+    /** Sets the CPU pointer. */
     void setCPU(FullCPU *cpu_ptr) { cpu = cpu_ptr; }
 
+    /** Number of physical integer registers. */
     unsigned numPhysicalIntRegs;
+    /** Number of physical floating point registers. */
     unsigned numPhysicalFloatRegs;
 };
 
@@ -255,11 +256,11 @@ PhysRegFile<Impl>::PhysRegFile(unsigned _numPhysicalIntRegs,
     : numPhysicalIntRegs(_numPhysicalIntRegs),
       numPhysicalFloatRegs(_numPhysicalFloatRegs)
 {
-    intRegFile = new IntReg[numPhysicalIntRegs];
-    floatRegFile = new FloatReg[numPhysicalFloatRegs];
+    intRegFile.resize(numPhysicalIntRegs);
+    floatRegFile.resize(numPhysicalFloatRegs);
 
-    memset(intRegFile, 0, sizeof(*intRegFile));
-    memset(floatRegFile, 0, sizeof(*floatRegFile));
+    //memset(intRegFile, 0, sizeof(*intRegFile));
+    //memset(floatRegFile, 0, sizeof(*floatRegFile));
 }
 
-#endif // __CPU_O3_CPU_REGFILE_HH__
+#endif // __CPU_O3_REGFILE_HH__
diff --git a/cpu/o3/rename.cc b/cpu/o3/rename.cc
index 6e9ee23da..4dc3bf6b2 100644
--- a/cpu/o3/rename.cc
+++ b/cpu/o3/rename.cc
@@ -30,4 +30,4 @@
 #include "cpu/o3/alpha_impl.hh"
 #include "cpu/o3/rename_impl.hh"
 
-template class SimpleRename<AlphaSimpleImpl>;
+template class DefaultRename<AlphaSimpleImpl>;
diff --git a/cpu/o3/rename.hh b/cpu/o3/rename.hh
index 07b442964..d5beccde9 100644
--- a/cpu/o3/rename.hh
+++ b/cpu/o3/rename.hh
@@ -26,23 +26,27 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-// Todo:
-// Fix up trap and barrier handling.
-// May want to have different statuses to differentiate the different stall
-// conditions.
-
-#ifndef __CPU_O3_CPU_SIMPLE_RENAME_HH__
-#define __CPU_O3_CPU_SIMPLE_RENAME_HH__
+#ifndef __CPU_O3_RENAME_HH__
+#define __CPU_O3_RENAME_HH__
 
 #include <list>
 
 #include "base/statistics.hh"
 #include "base/timebuf.hh"
 
-// Will need rename maps for both the int reg file and fp reg file.
-// Or change rename map class to handle both. (RegFile handles both.)
+/**
+ * DefaultRename handles both single threaded and SMT rename. Its width is
+ * specified by the parameters; each cycle it tries to rename that many
+ * instructions. It holds onto the rename history of all instructions with
+ * destination registers, storing the arch. register, the new physical
+ * register, and the old physical register, to allow for undoing of mappings
+ * if squashing happens, or freeing up registers upon commit. Rename handles
+ * blocking if the ROB, IQ, or LSQ is going to be full. Rename also handles
+ * barriers, and does so by stalling on the instruction until the ROB is
+ * empty and there are no instructions in flight to the ROB.
+ */
 template<class Impl>
-class SimpleRename
+class DefaultRename
 {
   public:
     // Typedefs from the Impl.
@@ -51,25 +55,38 @@ class SimpleRename
     typedef typename Impl::FullCPU FullCPU;
     typedef typename Impl::Params Params;
 
-    typedef typename CPUPol::FetchStruct FetchStruct;
+    // Typedefs from the CPUPol
     typedef typename CPUPol::DecodeStruct DecodeStruct;
     typedef typename CPUPol::RenameStruct RenameStruct;
     typedef typename CPUPol::TimeStruct TimeStruct;
-
-    // Typedefs from the CPUPol
     typedef typename CPUPol::FreeList FreeList;
     typedef typename CPUPol::RenameMap RenameMap;
+    // These are used only for initialization.
+    typedef typename CPUPol::IEW IEW;
+    typedef typename CPUPol::Commit Commit;
 
     // Typedefs from the ISA.
     typedef TheISA::RegIndex RegIndex;
 
+    // A list is used to queue the instructions.  Barrier insts must be
+    // added to the front of the list, which is the only reason for using
+    // a list instead of a queue. (Most other stages use a queue)
+    typedef std::list<DynInstPtr> InstQueue;
+
   public:
-    // Rename will block if ROB becomes full or issue queue becomes full,
-    // or there are no free registers to rename to.
-    // Only case where rename squashes is if IEW squashes.
-    enum Status {
+    /** Overall rename status. Used to determine if the CPU can deschedule
+     * itself due to a lack of activity.
+     */
+    enum RenameStatus {
+        Active,
+        Inactive
+    };
+
+    /** Individual thread status. */
+    enum ThreadStatus {
         Running,
         Idle,
+        StartSquash,
         Squashing,
         Blocked,
         Unblocking,
@@ -77,86 +94,191 @@ class SimpleRename
     };
 
   private:
-    Status _status;
+    /** Rename status. */
+    RenameStatus _status;
+
+    /** Per-thread status. */
+    ThreadStatus renameStatus[Impl::MaxThreads];
 
   public:
-    SimpleRename(Params &params);
+    /** DefaultRename constructor. */
+    DefaultRename(Params *params);
 
+    /** Returns the name of rename. */
+    std::string name() const;
+
+    /** Registers statistics. */
     void regStats();
 
+    /** Sets CPU pointer. */
     void setCPU(FullCPU *cpu_ptr);
 
+    /** Sets the main backwards communication time buffer pointer. */
     void setTimeBuffer(TimeBuffer<TimeStruct> *tb_ptr);
 
+    /** Sets pointer to time buffer used to communicate to the next stage. */
     void setRenameQueue(TimeBuffer<RenameStruct> *rq_ptr);
 
+    /** Sets pointer to time buffer coming from decode. */
     void setDecodeQueue(TimeBuffer<DecodeStruct> *dq_ptr);
 
-    void setRenameMap(RenameMap *rm_ptr);
+    /** Sets pointer to IEW stage. Used only for initialization. */
+    void setIEWStage(IEW *iew_stage)
+    { iew_ptr = iew_stage; }
+
+    /** Sets pointer to commit stage. Used only for initialization. */
+    void setCommitStage(Commit *commit_stage)
+    { commit_ptr = commit_stage; }
+
+  private:
+    /** Pointer to IEW stage. Used only for initialization. */
+    IEW *iew_ptr;
+
+    /** Pointer to commit stage. Used only for initialization. */
+    Commit *commit_ptr;
+
+  public:
+    /** Initializes variables for the stage. */
+    void initStage();
+
+    /** Sets pointer to list of active threads. */
+    void setActiveThreads(std::list<unsigned> *at_ptr);
+
+    /** Sets pointer to rename maps (per-thread structures). */
+    void setRenameMap(RenameMap rm_ptr[Impl::MaxThreads]);
 
+    /** Sets pointer to the free list. */
     void setFreeList(FreeList *fl_ptr);
 
-    void dumpHistory();
+    /** Sets pointer to the scoreboard. */
+    void setScoreboard(Scoreboard *_scoreboard);
 
-    void tick();
+    /** Squashes all instructions in a thread. */
+    void squash(unsigned tid);
 
-    void rename();
+    /** Ticks rename, which processes all input signals and attempts to rename
+     * as many instructions as possible.
+     */
+    void tick();
 
-    void squash();
+    /** Debugging function used to dump history buffer of renamings. */
+    void dumpHistory();
 
   private:
-    void block();
+    /** Determines what to do based on rename's current status.
+     * @param status_change rename() sets this variable if there was a status
+     * change (ie switching from blocking to unblocking).
+     * @param tid Thread id to rename instructions from.
+     */
+    void rename(bool &status_change, unsigned tid);
+
+    /** Renames instructions for the given thread. Also handles serializing
+     * instructions.
+     */
+    void renameInsts(unsigned tid);
+
+    /** Inserts unused instructions from a given thread into the skid buffer,
+     * to be renamed once rename unblocks.
+     */
+    void skidInsert(unsigned tid);
+
+    /** Separates instructions from decode into individual lists of instructions
+     * sorted by thread.
+     */
+    void sortInsts();
+
+    /** Returns if all of the skid buffers are empty. */
+    bool skidsEmpty();
+
+    /** Updates overall rename status based on all of the threads' statuses. */
+    void updateStatus();
+
+    /** Switches rename to blocking, and signals back that rename has become
+     * blocked.
+     * @return Returns true if there is a status change.
+     */
+    bool block(unsigned tid);
+
+    /** Switches rename to unblocking if the skid buffer is empty, and signals
+     * back that rename has unblocked.
+     * @return Returns true if there is a status change.
+     */
+    bool unblock(unsigned tid);
+
+    /** Executes actual squash, removing squashed instructions. */
+    void doSquash(unsigned tid);
 
-    inline void unblock();
+    /** Removes a committed instruction's rename history. */
+    void removeFromHistory(InstSeqNum inst_seq_num, unsigned tid);
 
-    void doSquash();
+    /** Renames the source registers of an instruction. */
+    inline void renameSrcRegs(DynInstPtr &inst, unsigned tid);
 
-    void removeFromHistory(InstSeqNum inst_seq_num);
+    /** Renames the destination registers of an instruction. */
+    inline void renameDestRegs(DynInstPtr &inst, unsigned tid);
 
-    inline void renameSrcRegs(DynInstPtr &inst);
+    /** Calculates the number of free ROB entries for a specific thread. */
+    inline int calcFreeROBEntries(unsigned tid);
 
-    inline void renameDestRegs(DynInstPtr &inst);
+    /** Calculates the number of free IQ entries for a specific thread. */
+    inline int calcFreeIQEntries(unsigned tid);
 
-    inline int calcFreeROBEntries();
+    /** Calculates the number of free LSQ entries for a specific thread. */
+    inline int calcFreeLSQEntries(unsigned tid);
 
-    inline int calcFreeIQEntries();
+    /** Returns the number of valid instructions coming from decode. */
+    unsigned validInsts();
 
-    /** Holds the previous information for each rename.
-     *  Note that often times the inst may have been deleted, so only access
-     *  the pointer for the address and do not dereference it.
+    /** Reads signals telling rename to block/unblock. */
+    void readStallSignals(unsigned tid);
+
+    /** Checks if any stages are telling rename to block. */
+    bool checkStall(unsigned tid);
+
+    void readFreeEntries(unsigned tid);
+
+    bool checkSignalsAndUpdate(unsigned tid);
+
+    /** Either serializes on the next instruction available in the InstQueue,
+     * or records that it must serialize on the next instruction to enter
+     * rename.
+     * @param inst_list The list of younger, unprocessed instructions for the
+     * thread that has the serializeAfter instruction.
+     * @param tid The thread id.
+     */
+    void serializeAfter(InstQueue &inst_list, unsigned tid);
+
+    /** Holds the information for each destination register rename. It holds
+     * the instruction's sequence number, the arch register, the old physical
+     * register for that arch. register, and the new physical register.
      */
     struct RenameHistory {
         RenameHistory(InstSeqNum _instSeqNum, RegIndex _archReg,
                       PhysRegIndex _newPhysReg, PhysRegIndex _prevPhysReg)
             : instSeqNum(_instSeqNum), archReg(_archReg),
-              newPhysReg(_newPhysReg), prevPhysReg(_prevPhysReg),
-              placeHolder(false)
-        {
-        }
-
-        /** Constructor used specifically for cases where a place holder
-         *  rename history entry is being made.
-         */
-        RenameHistory(InstSeqNum _instSeqNum)
-            : instSeqNum(_instSeqNum), archReg(0), newPhysReg(0),
-              prevPhysReg(0), placeHolder(true)
+              newPhysReg(_newPhysReg), prevPhysReg(_prevPhysReg)
         {
         }
 
+        /** The sequence number of the instruction that renamed. */
         InstSeqNum instSeqNum;
+        /** The architectural register index that was renamed. */
         RegIndex archReg;
+        /** The new physical register that the arch. register is renamed to. */
         PhysRegIndex newPhysReg;
+        /** The old physical register that the arch. register was renamed to. */
         PhysRegIndex prevPhysReg;
-        bool placeHolder;
     };
 
-    std::list<RenameHistory> historyBuffer;
+    /** A per-thread list of all destination register renames, used to either
+     * undo rename mappings or free old physical registers.
+     */
+    std::list<RenameHistory> historyBuffer[Impl::MaxThreads];
 
-    /** CPU interface. */
+    /** Pointer to CPU. */
     FullCPU *cpu;
 
-    // Interfaces to objects outside of rename.
-    /** Time buffer interface. */
+    /** Pointer to main time buffer used for backwards communication. */
     TimeBuffer<TimeStruct> *timeBuffer;
 
     /** Wire to get IEW's output from backwards time buffer. */
@@ -166,7 +288,6 @@ class SimpleRename
     typename TimeBuffer<TimeStruct>::wire fromCommit;
 
     /** Wire to write infromation heading to previous stages. */
-    // Might not be the best name as not only decode will read it.
     typename TimeBuffer<TimeStruct>::wire toDecode;
 
     /** Rename instruction queue. */
@@ -181,15 +302,71 @@ class SimpleRename
     /** Wire to get decode's output from decode queue. */
     typename TimeBuffer<DecodeStruct>::wire fromDecode;
 
+    /** Queue of all instructions coming from decode this cycle. */
+    InstQueue insts[Impl::MaxThreads];
+
     /** Skid buffer between rename and decode. */
-    std::queue<DecodeStruct> skidBuffer;
+    InstQueue skidBuffer[Impl::MaxThreads];
 
     /** Rename map interface. */
-    SimpleRenameMap *renameMap;
+    RenameMap *renameMap[Impl::MaxThreads];
 
     /** Free list interface. */
     FreeList *freeList;
 
+    /** Pointer to the list of active threads. */
+    std::list<unsigned> *activeThreads;
+
+    /** Pointer to the scoreboard. */
+    Scoreboard *scoreboard;
+
+    /** Count of instructions in progress that have been sent off to the IQ
+     * and ROB, but are not yet included in their occupancy counts.
+     */
+    int instsInProgress[Impl::MaxThreads];
+
+    /** Variable that tracks if rename has written to the time buffer this
+     * cycle. Used to tell CPU if there is activity this cycle.
+     */
+    bool wroteToTimeBuffer;
+
+    /** Structures whose free entries impact the amount of instructions that
+     * can be renamed.
+     */
+    struct FreeEntries {
+        unsigned iqEntries;
+        unsigned lsqEntries;
+        unsigned robEntries;
+    };
+
+    /** Per-thread tracking of the number of free entries of back-end
+     * structures.
+     */
+    FreeEntries freeEntries[Impl::MaxThreads];
+
+    /** Records if the ROB is empty. In SMT mode the ROB may be dynamically
+     * partitioned between threads, so the ROB must tell rename when it is
+     * empty.
+     */
+    bool emptyROB[Impl::MaxThreads];
+
+    /** Source of possible stalls. */
+    struct Stalls {
+        bool iew;
+        bool commit;
+    };
+
+    /** Tracks which stages are telling rename to stall. */
+    Stalls stalls[Impl::MaxThreads];
+
+    /** The barrier instruction that rename has stalled on. */
+    DynInstPtr barrierInst[Impl::MaxThreads];
+
+    /** Records if rename needs to serialize on the next instruction for any
+     * thread.
+     */
+    bool serializeOnNextInst[Impl::MaxThreads];
+
     /** Delay between iew and rename, in ticks. */
     int iewToRenameDelay;
 
@@ -207,27 +384,68 @@ class SimpleRename
      */
     unsigned commitWidth;
 
-    /** The instruction that rename is currently on.  It needs to have
-     *  persistent state so that when a stall occurs in the middle of a
-     *  group of instructions, it can restart at the proper instruction.
+    /** The index of the instruction in the time buffer to IEW that rename is
+     * currently using.
+     */
+    unsigned toIEWIndex;
+
+    /** Whether or not rename needs to block this cycle. */
+    bool blockThisCycle;
+
+    /** The number of threads active in rename. */
+    unsigned numThreads;
+
+    /** The maximum skid buffer size. */
+    unsigned skidBufferMax;
+
+    /** Enum to record the source of a structure full stall.  Can come from
+     * the ROB, IQ, or LSQ, and it is prioritized in that order.
+     */
+    enum FullSource {
+        ROB,
+        IQ,
+        LSQ,
+        NONE
+    };
+
+    /** Function used to increment the stat that corresponds to the source of
+     * the stall.
      */
-    unsigned numInst;
+    inline void incrFullStat(const FullSource &source);
 
+    /** Stat for total number of cycles spent squashing. */
     Stats::Scalar<> renameSquashCycles;
+    /** Stat for total number of cycles spent idle. */
     Stats::Scalar<> renameIdleCycles;
+    /** Stat for total number of cycles spent blocking. */
     Stats::Scalar<> renameBlockCycles;
+    /** Stat for total number of cycles spent stalling for a barrier. */
+    Stats::Scalar<> renameBarrierCycles;
+    /** Stat for total number of cycles spent running normally. */
+    Stats::Scalar<> renameRunCycles;
+    /** Stat for total number of cycles spent unblocking. */
     Stats::Scalar<> renameUnblockCycles;
+    /** Stat for total number of renamed instructions. */
     Stats::Scalar<> renameRenamedInsts;
+    /** Stat for total number of squashed instructions that rename discards. */
     Stats::Scalar<> renameSquashedInsts;
+    /** Stat for total number of times that the ROB starts a stall in rename. */
     Stats::Scalar<> renameROBFullEvents;
+    /** Stat for total number of times that the IQ starts a stall in rename. */
     Stats::Scalar<> renameIQFullEvents;
+    /** Stat for total number of times that the LSQ starts a stall in rename. */
+    Stats::Scalar<> renameLSQFullEvents;
+    /** Stat for total number of times that rename runs out of free registers
+     * to use to rename. */
     Stats::Scalar<> renameFullRegistersEvents;
+    /** Stat for total number of renamed destination registers. */
     Stats::Scalar<> renameRenamedOperands;
+    /** Stat for total number of source register rename lookups. */
     Stats::Scalar<> renameRenameLookups;
-    Stats::Scalar<> renameHBPlaceHolders;
+    /** Stat for total number of committed renaming mappings. */
     Stats::Scalar<> renameCommittedMaps;
+    /** Stat for total number of mappings that were undone due to a squash. */
     Stats::Scalar<> renameUndoneMaps;
-    Stats::Scalar<> renameValidUndoneMaps;
 };
 
-#endif // __CPU_O3_CPU_SIMPLE_RENAME_HH__
+#endif // __CPU_O3_RENAME_HH__
diff --git a/cpu/o3/rename_impl.hh b/cpu/o3/rename_impl.hh
index 2068b36ab..441118ef1 100644
--- a/cpu/o3/rename_impl.hh
+++ b/cpu/o3/rename_impl.hh
@@ -31,21 +31,51 @@
 #include "config/full_system.hh"
 #include "cpu/o3/rename.hh"
 
+using namespace std;
+
+template <class Impl>
+DefaultRename<Impl>::DefaultRename(Params *params)
+    : iewToRenameDelay(params->iewToRenameDelay),
+      decodeToRenameDelay(params->decodeToRenameDelay),
+      commitToRenameDelay(params->commitToRenameDelay),
+      renameWidth(params->renameWidth),
+      commitWidth(params->commitWidth),
+      numThreads(params->numberOfThreads)
+{
+    _status = Inactive;
+
+    for (int i=0; i< numThreads; i++) {
+        renameStatus[i] = Idle;
+
+        freeEntries[i].iqEntries = 0;
+        freeEntries[i].lsqEntries = 0;
+        freeEntries[i].robEntries = 0;
+
+        stalls[i].iew = false;
+        stalls[i].commit = false;
+        barrierInst[i] = NULL;
+
+        instsInProgress[i] = 0;
+
+        emptyROB[i] = true;
+
+        serializeOnNextInst[i] = false;
+    }
+
+    // @todo: Make into a parameter.
+    skidBufferMax = (2 * (iewToRenameDelay * params->decodeWidth)) + renameWidth;
+}
+
 template <class Impl>
-SimpleRename<Impl>::SimpleRename(Params &params)
-    : iewToRenameDelay(params.iewToRenameDelay),
-      decodeToRenameDelay(params.decodeToRenameDelay),
-      commitToRenameDelay(params.commitToRenameDelay),
-      renameWidth(params.renameWidth),
-      commitWidth(params.commitWidth),
-      numInst(0)
+std::string
+DefaultRename<Impl>::name() const
 {
-    _status = Idle;
+    return cpu->name() + ".rename";
 }
 
 template <class Impl>
 void
-SimpleRename<Impl>::regStats()
+DefaultRename<Impl>::regStats()
 {
     renameSquashCycles
         .name(name() + ".renameSquashCycles")
@@ -59,6 +89,14 @@ SimpleRename<Impl>::regStats()
         .name(name() + ".renameBlockCycles")
         .desc("Number of cycles rename is blocking")
         .prereq(renameBlockCycles);
+    renameBarrierCycles
+        .name(name() + ".renameBarrierCycles")
+        .desc("Number of cycles rename is blocking due to a barrier stall")
+        .prereq(renameBarrierCycles);
+    renameRunCycles
+        .name(name() + ".renameRunCycles")
+        .desc("Number of cycles rename is running")
+        .prereq(renameRunCycles);
     renameUnblockCycles
         .name(name() + ".renameUnblockCycles")
         .desc("Number of cycles rename is unblocking")
@@ -73,12 +111,16 @@ SimpleRename<Impl>::regStats()
         .prereq(renameSquashedInsts);
     renameROBFullEvents
         .name(name() + ".renameROBFullEvents")
-        .desc("Number of times rename has considered the ROB 'full'")
+        .desc("Number of times rename has blocked due to ROB full")
         .prereq(renameROBFullEvents);
     renameIQFullEvents
         .name(name() + ".renameIQFullEvents")
-        .desc("Number of times rename has considered the IQ 'full'")
+        .desc("Number of times rename has blocked due to IQ full")
         .prereq(renameIQFullEvents);
+    renameLSQFullEvents
+        .name(name() + ".renameLSQFullEvents")
+        .desc("Number of times rename has blocked due to LSQ full")
+        .prereq(renameLSQFullEvents);
     renameFullRegistersEvents
         .name(name() + ".renameFullRegisterEvents")
         .desc("Number of times there has been no free registers")
@@ -91,10 +133,6 @@ SimpleRename<Impl>::regStats()
         .name(name() + ".renameRenameLookups")
         .desc("Number of register rename lookups that rename has made")
         .prereq(renameRenameLookups);
-    renameHBPlaceHolders
-        .name(name() + ".renameHBPlaceHolders")
-        .desc("Number of place holders added to the history buffer")
-        .prereq(renameHBPlaceHolders);
     renameCommittedMaps
         .name(name() + ".renameCommittedMaps")
         .desc("Number of HB maps that are committed")
@@ -103,25 +141,21 @@ SimpleRename<Impl>::regStats()
         .name(name() + ".renameUndoneMaps")
         .desc("Number of HB maps that are undone due to squashing")
         .prereq(renameUndoneMaps);
-    renameValidUndoneMaps
-        .name(name() + ".renameValidUndoneMaps")
-        .desc("Number of HB maps that are undone, and are not place holders")
-        .prereq(renameValidUndoneMaps);
 }
 
 template <class Impl>
 void
-SimpleRename<Impl>::setCPU(FullCPU *cpu_ptr)
+DefaultRename<Impl>::setCPU(FullCPU *cpu_ptr)
 {
-    DPRINTF(Rename, "Rename: Setting CPU pointer.\n");
+    DPRINTF(Rename, "Setting CPU pointer.\n");
     cpu = cpu_ptr;
 }
 
 template <class Impl>
 void
-SimpleRename<Impl>::setTimeBuffer(TimeBuffer<TimeStruct> *tb_ptr)
+DefaultRename<Impl>::setTimeBuffer(TimeBuffer<TimeStruct> *tb_ptr)
 {
-    DPRINTF(Rename, "Rename: Setting time buffer pointer.\n");
+    DPRINTF(Rename, "Setting time buffer pointer.\n");
     timeBuffer = tb_ptr;
 
     // Setup wire to read information from time buffer, from IEW stage.
@@ -136,9 +170,9 @@ SimpleRename<Impl>::setTimeBuffer(TimeBuffer<TimeStruct> *tb_ptr)
 
 template <class Impl>
 void
-SimpleRename<Impl>::setRenameQueue(TimeBuffer<RenameStruct> *rq_ptr)
+DefaultRename<Impl>::setRenameQueue(TimeBuffer<RenameStruct> *rq_ptr)
 {
-    DPRINTF(Rename, "Rename: Setting rename queue pointer.\n");
+    DPRINTF(Rename, "Setting rename queue pointer.\n");
     renameQueue = rq_ptr;
 
     // Setup wire to write information to future stages.
@@ -147,9 +181,9 @@ SimpleRename<Impl>::setRenameQueue(TimeBuffer<RenameStruct> *rq_ptr)
 
 template <class Impl>
 void
-SimpleRename<Impl>::setDecodeQueue(TimeBuffer<DecodeStruct> *dq_ptr)
+DefaultRename<Impl>::setDecodeQueue(TimeBuffer<DecodeStruct> *dq_ptr)
 {
-    DPRINTF(Rename, "Rename: Setting decode queue pointer.\n");
+    DPRINTF(Rename, "Setting decode queue pointer.\n");
     decodeQueue = dq_ptr;
 
     // Setup wire to get information from decode.
@@ -158,214 +192,670 @@ SimpleRename<Impl>::setDecodeQueue(TimeBuffer<DecodeStruct> *dq_ptr)
 
 template <class Impl>
 void
-SimpleRename<Impl>::setRenameMap(RenameMap *rm_ptr)
+DefaultRename<Impl>::initStage()
+{
+    for (int tid=0; tid < numThreads; tid++) {
+        freeEntries[tid].iqEntries = iew_ptr->instQueue.numFreeEntries(tid);
+        freeEntries[tid].lsqEntries = iew_ptr->ldstQueue.numFreeEntries(tid);
+        freeEntries[tid].robEntries = commit_ptr->numROBFreeEntries(tid);
+        emptyROB[tid] = true;
+    }
+
+    // Clear these pointers so they are not accidentally used in
+    // non-initialization code.
+    iew_ptr = NULL;
+    commit_ptr = NULL;
+}
+
+template<class Impl>
+void
+DefaultRename<Impl>::setActiveThreads(list<unsigned> *at_ptr)
 {
-    DPRINTF(Rename, "Rename: Setting rename map pointer.\n");
-    renameMap = rm_ptr;
+    DPRINTF(Rename, "Setting active threads list pointer.\n");
+    activeThreads = at_ptr;
 }
 
+
 template <class Impl>
 void
-SimpleRename<Impl>::setFreeList(FreeList *fl_ptr)
+DefaultRename<Impl>::setRenameMap(RenameMap rm_ptr[])
 {
-    DPRINTF(Rename, "Rename: Setting free list pointer.\n");
+    DPRINTF(Rename, "Setting rename map pointers.\n");
+
+    for (int i=0; i<numThreads; i++) {
+        renameMap[i] = &rm_ptr[i];
+    }
+}
+
+template <class Impl>
+void
+DefaultRename<Impl>::setFreeList(FreeList *fl_ptr)
+{
+    DPRINTF(Rename, "Setting free list pointer.\n");
     freeList = fl_ptr;
 }
 
+template<class Impl>
+void
+DefaultRename<Impl>::setScoreboard(Scoreboard *_scoreboard)
+{
+    DPRINTF(Rename, "Setting scoreboard pointer.\n");
+    scoreboard = _scoreboard;
+}
+
 template <class Impl>
 void
-SimpleRename<Impl>::dumpHistory()
+DefaultRename<Impl>::squash(unsigned tid)
 {
-    typename list<RenameHistory>::iterator buf_it = historyBuffer.begin();
+    DPRINTF(Rename, "[tid:%u]: Squashing instructions.\n",tid);
+
+    // Clear the stall signal if rename was blocked or unblocking before.
+    // If it still needs to block, the blocking should happen the next
+    // cycle and there should be space to hold everything due to the squash.
+    if (renameStatus[tid] == Blocked ||
+        renameStatus[tid] == Unblocking ||
+        renameStatus[tid] == BarrierStall) {
+#if !FULL_SYSTEM
+        // In syscall emulation, we can have both a block and a squash due
+        // to a syscall in the same cycle.  This would cause both signals to
+        // be high.  This shouldn't happen in full system.
+        if (toDecode->renameBlock[tid]) {
+            toDecode->renameBlock[tid] = 0;
+        } else {
+            toDecode->renameUnblock[tid] = 1;
+        }
+#else
+        toDecode->renameUnblock[tid] = 1;
+#endif
+        barrierInst[tid] = NULL;
+    }
 
-    while (buf_it != historyBuffer.end())
-    {
-        cprintf("Seq num: %i\nArch reg: %i New phys reg: %i Old phys "
-                "reg: %i\n", (*buf_it).instSeqNum, (int)(*buf_it).archReg,
-                (int)(*buf_it).newPhysReg, (int)(*buf_it).prevPhysReg);
+    // Set the status to Squashing.
+    renameStatus[tid] = Squashing;
+
+    // Mark this thread's instructions coming from decode as squashed.
+    unsigned squashCount = 0;
 
-        buf_it++;
+    for (int i=0; i<fromDecode->size; i++) {
+        if (fromDecode->insts[i]->threadNumber == tid) {
+            fromDecode->insts[i]->squashed = true;
+            wroteToTimeBuffer = true;
+            squashCount++;
+        }
     }
+
+    insts[tid].clear();
+
+    // Clear the skid buffer in case it has any data in it.
+    skidBuffer[tid].clear();
+
+    doSquash(tid);
 }
 
 template <class Impl>
 void
-SimpleRename<Impl>::block()
+DefaultRename<Impl>::tick()
 {
-    DPRINTF(Rename, "Rename: Blocking.\n");
-    // Set status to Blocked.
-    _status = Blocked;
+    // Rename will need to try to rename as many instructions as it
+    // has bandwidth, unless it is blocked.
 
-    // Add the current inputs onto the skid buffer, so they can be
-    // reprocessed when this stage unblocks.
-    skidBuffer.push(*fromDecode);
+    wroteToTimeBuffer = false;
+
+    blockThisCycle = false;
+
+    bool status_change = false;
+
+    toIEWIndex = 0;
+
+    sortInsts();
+
+    list<unsigned>::iterator threads = (*activeThreads).begin();
+
+    // Check stall and squash signals.
+    while (threads != (*activeThreads).end()) {
+        unsigned tid = *threads++;
+
+        DPRINTF(Rename, "Processing [tid:%i]\n", tid);
+
+        status_change = checkSignalsAndUpdate(tid) || status_change;
+
+        rename(status_change, tid);
+    }
+
+    if (status_change) {
+        updateStatus();
+    }
+
+    if (wroteToTimeBuffer) {
+        DPRINTF(Activity, "Activity this cycle.\n");
+        cpu->activityThisCycle();
+    }
+
+    threads = (*activeThreads).begin();
+
+    while (threads != (*activeThreads).end()) {
+        unsigned tid = *threads++;
+
+        // If we committed this cycle then doneSeqNum will be > 0
+        if (fromCommit->commitInfo[tid].doneSeqNum != 0 &&
+            !fromCommit->commitInfo[tid].squash &&
+            renameStatus[tid] != Squashing) {
+
+            removeFromHistory(fromCommit->commitInfo[tid].doneSeqNum,
+                                  tid);
+        }
+    }
+
+    // @todo: make into updateProgress function
+    for (int tid=0; tid < numThreads; tid++) {
+        instsInProgress[tid] -= fromIEW->iewInfo[tid].dispatched;
+
+        assert(instsInProgress[tid] >=0);
+    }
 
-    // Note that this stage only signals previous stages to stall when
-    // it is the cause of the stall originates at this stage.  Otherwise
-    // the previous stages are expected to check all possible stall signals.
 }
 
-template <class Impl>
-inline void
-SimpleRename<Impl>::unblock()
-{
-    DPRINTF(Rename, "Rename: Read instructions out of skid buffer this "
-            "cycle.\n");
-    // Remove the now processed instructions from the skid buffer.
-    skidBuffer.pop();
-
-    // If there's still information in the skid buffer, then
-    // continue to tell previous stages to stall.  They will be
-    // able to restart once the skid buffer is empty.
-    if (!skidBuffer.empty()) {
-        toDecode->renameInfo.stall = true;
-    } else {
-        DPRINTF(Rename, "Rename: Done unblocking.\n");
-        _status = Running;
+template<class Impl>
+void
+DefaultRename<Impl>::rename(bool &status_change, unsigned tid)
+{
+    // Blocked, Squashing and BarrierStall cycles are only counted here.
+    // If status is Running or Idle, call renameInsts().
+    // If status is Unblocking:
+    //     continue trying to empty the skid buffer via renameInsts(),
+    //     buffer any instructions coming from decode,
+    //     then call unblock() and report any resulting status change.
+
+    if (renameStatus[tid] == Blocked) {
+        ++renameBlockCycles;
+    } else if (renameStatus[tid] == Squashing) {
+        ++renameSquashCycles;
+    } else if (renameStatus[tid] == BarrierStall) {
+        ++renameBarrierCycles;
+    }
+
+    if (renameStatus[tid] == Running ||
+        renameStatus[tid] == Idle) {
+        DPRINTF(Rename, "[tid:%u]: Not blocked, so attempting to run "
+                "stage.\n", tid);
+
+        renameInsts(tid);
+    } else if (renameStatus[tid] == Unblocking) {
+        // renameInsts() already counts this cycle (as an unblock cycle, or
+        // as an idle cycle if the skid buffer is empty); don't count twice.
+        renameInsts(tid);
+
+        if (validInsts()) {
+            // Add the current inputs to the skid buffer so they can be
+            // reprocessed when this stage unblocks.
+            skidInsert(tid);
+        }
+
+        // If we switched over to blocking, then there's a potential for
+        // an overall status change.
+        status_change = unblock(tid) || status_change || blockThisCycle;
     }
 }
 
 template <class Impl>
 void
-SimpleRename<Impl>::doSquash()
+DefaultRename<Impl>::renameInsts(unsigned tid)
 {
-    typename list<RenameHistory>::iterator hb_it = historyBuffer.begin();
+    // Instructions can be either in the skid buffer or the queue of
+    // instructions coming from decode, depending on the status.
+    int insts_available = renameStatus[tid] == Unblocking ?
+        skidBuffer[tid].size() : insts[tid].size();
 
-    InstSeqNum squashed_seq_num = fromCommit->commitInfo.doneSeqNum;
+    // Check the decode queue to see if instructions are available.
+    // If there are no available instructions to rename, then do nothing.
+    if (insts_available == 0) {
+        DPRINTF(Rename, "[tid:%u]: Nothing to do, breaking out early.\n",
+                tid);
+        // Should I change status to idle?
+        ++renameIdleCycles;
+        return;
+    } else if (renameStatus[tid] == Unblocking) {
+        ++renameUnblockCycles;
+    } else if (renameStatus[tid] == Running) {
+        ++renameRunCycles;
+    }
+
+    DynInstPtr inst;
+
+    // Will have to do a different calculation for the number of free
+    // entries.
+    int free_rob_entries = calcFreeROBEntries(tid);
+    int free_iq_entries  = calcFreeIQEntries(tid);
+    int free_lsq_entries = calcFreeLSQEntries(tid);
+    int min_free_entries = free_rob_entries;
+
+    FullSource source = ROB;
+
+    if (free_iq_entries < min_free_entries) {
+        min_free_entries = free_iq_entries;
+        source = IQ;
+    }
+
+    if (free_lsq_entries < min_free_entries) {
+        min_free_entries = free_lsq_entries;
+        source = LSQ;
+    }
+
+    // Check if there's any space left.
+    if (min_free_entries <= 0) {
+        DPRINTF(Rename, "[tid:%u]: Blocking due to no free ROB/IQ/LSQ "
+                "entries.\n"
+                "ROB has %i free entries.\n"
+                "IQ has %i free entries.\n"
+                "LSQ has %i free entries.\n",
+                tid,
+                free_rob_entries,
+                free_iq_entries,
+                free_lsq_entries);
+
+        blockThisCycle = true;
+
+        block(tid);
+
+        incrFullStat(source);
 
-#if FULL_SYSTEM
-    assert(!historyBuffer.empty());
-#else
-    // After a syscall squashes everything, the history buffer may be empty
-    // but the ROB may still be squashing instructions.
-    if (historyBuffer.empty()) {
         return;
+    } else if (min_free_entries < insts_available) {
+        DPRINTF(Rename, "[tid:%u]: Will have to block this cycle."
+                "%i insts available, but only %i insts can be "
+                "renamed due to ROB/IQ/LSQ limits.\n",
+                tid, insts_available, min_free_entries);
+
+        insts_available = min_free_entries;
+
+        blockThisCycle = true;
+
+        incrFullStat(source);
     }
-#endif // FULL_SYSTEM
 
-    // Go through the most recent instructions, undoing the mappings
-    // they did and freeing up the registers.
-    while ((*hb_it).instSeqNum > squashed_seq_num)
-    {
-        assert(hb_it != historyBuffer.end());
+    InstQueue &insts_to_rename = renameStatus[tid] == Unblocking ?
+        skidBuffer[tid] : insts[tid];
+
+    DPRINTF(Rename, "[tid:%u]: %i available instructions to "
+            "send iew.\n", tid, insts_available);
 
-        DPRINTF(Rename, "Rename: Removing history entry with sequence "
-                "number %i.\n", (*hb_it).instSeqNum);
+    DPRINTF(Rename, "[tid:%u]: %i insts pipelining from Rename | %i insts "
+            "dispatched to IQ last cycle.\n",
+            tid, instsInProgress[tid], fromIEW->iewInfo[tid].dispatched);
+
+    // Handle serializing the next instruction if necessary.
+    if (serializeOnNextInst[tid]) {
+        if (emptyROB[tid] && instsInProgress[tid] == 0) {
+            // ROB already empty; no need to serialize.
+            serializeOnNextInst[tid] = false;
+        } else if (!insts_to_rename.empty()) {
+            insts_to_rename.front()->setSerializeBefore();
+        }
+    }
 
-        // If it's not simply a place holder, then add the registers.
-        if (!(*hb_it).placeHolder) {
-            // Tell the rename map to set the architected register to the
-            // previous physical register that it was renamed to.
-            renameMap->setEntry(hb_it->archReg, hb_it->prevPhysReg);
+    int renamed_insts = 0;
 
-            // Put the renamed physical register back on the free list.
-            freeList->addReg(hb_it->newPhysReg);
+    while (insts_available > 0 &&  toIEWIndex < renameWidth) {
+        DPRINTF(Rename, "[tid:%u]: Sending instructions to IEW.\n", tid);
 
-            ++renameValidUndoneMaps;
+        assert(!insts_to_rename.empty());
+
+        inst = insts_to_rename.front();
+
+        insts_to_rename.pop_front();
+
+        //Use skidBuffer with oldest instructions
+        if (renameStatus[tid] == Unblocking) {
+            DPRINTF(Rename,"[tid:%u]: Removing [sn:%lli] PC:%#x from rename "
+                    "skidBuffer\n",
+                    tid, inst->seqNum, inst->readPC());
         }
 
-        historyBuffer.erase(hb_it++);
+        if (inst->isSquashed()) {
+            DPRINTF(Rename, "[tid:%u]: instruction %i with PC %#x is "
+                    "squashed, skipping.\n",
+                    tid, inst->seqNum, inst->threadNumber,inst->readPC());
 
-        ++renameUndoneMaps;
+            ++renameSquashedInsts;
+
+            // Decrement how many instructions are available.
+            --insts_available;
+
+            continue;
+        }
+
+        DPRINTF(Rename, "[tid:%u]: Processing instruction [sn:%lli] with "
+                "PC %#x.\n",
+                tid, inst->seqNum, inst->readPC());
+
+        // Handle serializeAfter/serializeBefore instructions.
+        // serializeAfter marks the next instruction as serializeBefore.
+        // serializeBefore makes the instruction wait in rename until the ROB
+        // is empty.
+        if (inst->isSerializeBefore() && !inst->isSerializeHandled()) {
+            DPRINTF(Rename, "Serialize before instruction encountered.\n");
+
+            if (!inst->isTempSerializeBefore())
+                inst->setSerializeHandled();
+
+            // Change status over to BarrierStall so that other stages know
+            // what this is blocked on.
+            renameStatus[tid] = BarrierStall;
+
+            barrierInst[tid] = inst;
+
+            blockThisCycle = true;
+
+            break;
+        } else if (inst->isSerializeAfter() && !inst->isSerializeHandled()) {
+            DPRINTF(Rename, "Serialize after instruction encountered.\n");
+
+            inst->setSerializeHandled();
+
+            serializeAfter(insts_to_rename, tid);
+        }
+
+        // Check here to make sure there are enough destination registers
+        // to rename to.  Otherwise block.
+        if (renameMap[tid]->numFreeEntries() < inst->numDestRegs()) {
+            DPRINTF(Rename, "Blocking due to lack of free "
+                    "physical registers to rename to.\n");
+            blockThisCycle = true;
+
+            ++renameFullRegistersEvents;
+
+            break;
+        }
+
+        renameSrcRegs(inst, inst->threadNumber);
+
+        renameDestRegs(inst, inst->threadNumber);
+
+        ++renamed_insts;
+
+        // Put instruction in rename queue.
+        toIEW->insts[toIEWIndex] = inst;
+        ++(toIEW->size);
+
+        // Increment which instruction we're on.
+        ++toIEWIndex;
+
+        ++renameRenamedInsts;
+
+        // Decrement how many instructions are available.
+        --insts_available;
+    }
+
+    instsInProgress[tid] += renamed_insts;
+
+    // If we wrote to the time buffer, record this.
+    if (toIEWIndex) {
+        wroteToTimeBuffer = true;
+    }
+
+    // Check if there's any instructions left that haven't yet been renamed.
+    // If so then block.
+    if (insts_available) {
+        blockThisCycle = true;
+    }
+
+    if (blockThisCycle) {
+        block(tid);
+        toDecode->renameUnblock[tid] = false;
+    }
+}
+
+template<class Impl>
+void
+DefaultRename<Impl>::skidInsert(unsigned tid)
+{
+    // Drain this thread's incoming queue, oldest first, into its skid
+    // buffer so the instructions can be reprocessed later.
+    while (!insts[tid].empty()) {
+        DynInstPtr inst = insts[tid].front();
+
+        insts[tid].pop_front();
+
+        assert(tid == inst->threadNumber);
+
+        DPRINTF(Rename, "[tid:%u]: Inserting [sn:%lli] PC:%#x into Rename "
+                "skidBuffer\n", tid, inst->seqNum, inst->readPC());
+
+        skidBuffer[tid].push_back(inst);
     }
+
+    if (skidBuffer[tid].size() > skidBufferMax)
+        panic("Skidbuffer Exceeded Max Size");
 }
 
 template <class Impl>
 void
-SimpleRename<Impl>::squash()
+DefaultRename<Impl>::sortInsts()
 {
-    DPRINTF(Rename, "Rename: Squashing instructions.\n");
-    // Set the status to Squashing.
-    _status = Squashing;
+    int insts_from_decode = fromDecode->size;
 
-    numInst = 0;
+    for (int i=0; i < numThreads; i++)
+        assert(insts[i].empty());
 
-    // Clear the skid buffer in case it has any data in it.
-    while (!skidBuffer.empty())
-    {
-        skidBuffer.pop();
+    for (int i = 0; i < insts_from_decode; ++i) {
+        DynInstPtr inst = fromDecode->insts[i];
+        insts[inst->threadNumber].push_back(inst);
+    }
+}
+
+template<class Impl>
+bool
+DefaultRename<Impl>::skidsEmpty()
+{
+    list<unsigned>::iterator threads = (*activeThreads).begin();
+
+    while (threads != (*activeThreads).end()) {
+        if (!skidBuffer[*threads++].empty())
+            return false;
     }
 
-    doSquash();
+    return true;
 }
 
 template<class Impl>
 void
-SimpleRename<Impl>::removeFromHistory(InstSeqNum inst_seq_num)
+DefaultRename<Impl>::updateStatus()
 {
-    DPRINTF(Rename, "Rename: Removing a committed instruction from the "
-            "history buffer, until sequence number %lli.\n", inst_seq_num);
-    typename list<RenameHistory>::iterator hb_it = historyBuffer.end();
+    bool any_unblocking = false;
 
-    --hb_it;
+    list<unsigned>::iterator threads = (*activeThreads).begin();
 
-    if (hb_it->instSeqNum > inst_seq_num) {
-        DPRINTF(Rename, "Rename: Old sequence number encountered.  Ensure "
-                "that a syscall happened recently.\n");
-        return;
+    threads = (*activeThreads).begin();
+
+    while (threads != (*activeThreads).end()) {
+        unsigned tid = *threads++;
+
+        if (renameStatus[tid] == Unblocking) {
+            any_unblocking = true;
+            break;
+        }
     }
 
-    while ((*hb_it).instSeqNum != inst_seq_num)
-    {
-        // Make sure we haven't gone off the end of the list.
-        assert(hb_it != historyBuffer.end());
+    // Rename will have activity if it's unblocking.
+    if (any_unblocking) {
+        if (_status == Inactive) {
+            _status = Active;
 
-        // In theory instructions at the end of the history buffer
-        // should be older than the instruction being removed, which
-        // means they will have a lower sequence number.  Also the
-        // instruction being removed from the history really should
-        // be the last instruction in the list, as it is the instruction
-        // that was just committed that is being removed.
-        assert(hb_it->instSeqNum < inst_seq_num);
-        DPRINTF(Rename, "Rename: Freeing up older rename of reg %i, sequence"
-                " number %i.\n",
-                (*hb_it).prevPhysReg, (*hb_it).instSeqNum);
+            DPRINTF(Activity, "Activating stage.\n");
 
-        if (!(*hb_it).placeHolder) {
-            freeList->addReg((*hb_it).prevPhysReg);
-            ++renameCommittedMaps;
+            cpu->activateStage(FullCPU::RenameIdx);
         }
+    } else {
+        // If it's not unblocking, then rename will not have any internal
+        // activity.  Switch it to inactive.
+        if (_status == Active) {
+            _status = Inactive;
+            DPRINTF(Activity, "Deactivating stage.\n");
 
-        historyBuffer.erase(hb_it--);
+            cpu->deactivateStage(FullCPU::RenameIdx);
+        }
     }
+}
 
-    // Finally free up the previous register of the finished instruction
-    // itself.
-    if (!(*hb_it).placeHolder) {
-        freeList->addReg(hb_it->prevPhysReg);
-        ++renameCommittedMaps;
+template <class Impl>
+bool
+DefaultRename<Impl>::block(unsigned tid)
+{
+    DPRINTF(Rename, "[tid:%u]: Blocking.\n", tid);
+
+    // Stash this cycle's inputs in the skid buffer for later replay.
+    skidInsert(tid);
+
+    // A thread that is already Blocked needs no further action.
+    if (renameStatus[tid] == Blocked)
+        return false;
+
+    // Only signal backwards to block if the previous stage does not
+    // already think rename is blocked (i.e. rename is not Unblocking).
+    if (renameStatus[tid] != Unblocking) {
+        toDecode->renameBlock[tid] = true;
+        toDecode->renameUnblock[tid] = false;
+        wroteToTimeBuffer = true;
+    }
+
+    // Never overwrite BarrierStall with Blocked; rename would otherwise
+    // not know to complete the barrier stall.
+    if (renameStatus[tid] == BarrierStall)
+        return false;
+
+    renameStatus[tid] = Blocked;
+
+    return true;
+}
+
+template <class Impl>
+bool
+DefaultRename<Impl>::unblock(unsigned tid)
+{
+    DPRINTF(Rename, "[tid:%u]: Trying to unblock.\n", tid);
+
+    // Not done while the skid buffer still holds instructions or the
+    // thread is in a barrier stall.
+    if (!skidBuffer[tid].empty() || renameStatus[tid] == BarrierStall)
+        return false;
+
+    DPRINTF(Rename, "[tid:%u]: Done unblocking.\n", tid);
+
+    toDecode->renameUnblock[tid] = true;
+    wroteToTimeBuffer = true;
+
+    renameStatus[tid] = Running;
+
+    return true;
+}
+
+template <class Impl>
+void
+DefaultRename<Impl>::doSquash(unsigned tid)
+{
+    list<RenameHistory> &history = historyBuffer[tid];
+
+    const InstSeqNum squashed_seq_num =
+        fromCommit->commitInfo[tid].doneSeqNum;
+
+    // After a syscall squashes everything, the history buffer may be empty
+    // but the ROB may still be squashing instructions; there is nothing to
+    // undo in that case.
+    if (history.empty()) {
+        return;
+    }
+
+    // Renames are pushed onto the front of the history, so the youngest
+    // mapping is always at the head.  Unwind from the head until the entry
+    // there is no longer younger than the squash point.
+    while (!history.empty() &&
+           history.front().instSeqNum > squashed_seq_num) {
+        const RenameHistory &undone = history.front();
+
+        DPRINTF(Rename, "[tid:%u]: Removing history entry with sequence "
+                "number %i.\n", tid, undone.instSeqNum);
+
+        // Point the architected register back at the physical register
+        // it was mapped to before this rename.
+        renameMap[tid]->setEntry(undone.archReg, undone.prevPhysReg);
+
+        // Put the physical register this rename allocated back on the free
+        // list.
+        freeList->addReg(undone.newPhysReg);
+
+        history.pop_front();
+
+        ++renameUndoneMaps;
     }
+}
 
-    historyBuffer.erase(hb_it);
+template<class Impl>
+void
+DefaultRename<Impl>::removeFromHistory(InstSeqNum inst_seq_num, unsigned tid)
+{
+    list<RenameHistory> &history = historyBuffer[tid];
+
+    DPRINTF(Rename, "[tid:%u]: Removing a committed instruction from the "
+            "history buffer %u (size=%i), until [sn:%lli].\n",
+            tid, tid, history.size(), inst_seq_num);
+
+    // Check for an empty buffer before touching any element: end() of an
+    // empty list must not be decremented or dereferenced.
+    if (history.empty()) {
+        DPRINTF(Rename, "[tid:%u]: History buffer is empty.\n", tid);
+        return;
+    } else if (history.back().instSeqNum > inst_seq_num) {
+        DPRINTF(Rename, "[tid:%u]: Old sequence number encountered.  Ensure "
+                "that a syscall happened recently.\n", tid);
+        return;
+    }
+
+    // Commit all the renames up until (and including) the committed sequence
+    // number.  The oldest entries sit at the back of the list, so retire
+    // from the back; this never steps an iterator before begin().  Some or
+    // all committed instructions may have no history entry if they did not
+    // rename a destination register.
+    while (!history.empty() &&
+           history.back().instSeqNum <= inst_seq_num) {
+        const RenameHistory &entry = history.back();
+        DPRINTF(Rename, "[tid:%u]: Freeing up older rename of reg %i, sequence"
+                " number %i.\n",
+                tid, entry.prevPhysReg, entry.instSeqNum);
+
+        // The mapping this rename replaced can no longer be needed once
+        // the instruction has committed.
+        freeList->addReg(entry.prevPhysReg);
+        ++renameCommittedMaps;
+        history.pop_back();
+    }
 }
 
 template <class Impl>
 inline void
-SimpleRename<Impl>::renameSrcRegs(DynInstPtr &inst)
+DefaultRename<Impl>::renameSrcRegs(DynInstPtr &inst,unsigned tid)
 {
+    assert(renameMap[tid] != 0);
+
     unsigned num_src_regs = inst->numSrcRegs();
 
     // Get the architectual register numbers from the source and
     // destination operands, and redirect them to the right register.
     // Will need to mark dependencies though.
-    for (int src_idx = 0; src_idx < num_src_regs; src_idx++)
-    {
+    for (int src_idx = 0; src_idx < num_src_regs; src_idx++) {
         RegIndex src_reg = inst->srcRegIdx(src_idx);
 
         // Look up the source registers to get the phys. register they've
         // been renamed to, and set the sources to those registers.
-        PhysRegIndex renamed_reg = renameMap->lookup(src_reg);
+        PhysRegIndex renamed_reg = renameMap[tid]->lookup(src_reg);
 
-        DPRINTF(Rename, "Rename: Looking up arch reg %i, got "
-                "physical reg %i.\n", (int)src_reg, (int)renamed_reg);
+        DPRINTF(Rename, "[tid:%u]: Looking up arch reg %i, got "
+                "physical reg %i.\n", tid, (int)src_reg,
+                (int)renamed_reg);
 
         inst->renameSrcReg(src_idx, renamed_reg);
 
-        // Either incorporate it into the info passed back,
-        // or make another function call to see if that register is
-        // ready or not.
-        if (renameMap->isReady(renamed_reg)) {
-            DPRINTF(Rename, "Rename: Register is ready.\n");
+        // See if the register is ready or not.
+        if (scoreboard->getReg(renamed_reg) == true) {
+            DPRINTF(Rename, "[tid:%u]: Register is ready.\n", tid);
 
             inst->markSrcRegReady(src_idx);
         }
@@ -376,379 +866,341 @@ SimpleRename<Impl>::renameSrcRegs(DynInstPtr &inst)
 
 template <class Impl>
 inline void
-SimpleRename<Impl>::renameDestRegs(DynInstPtr &inst)
+DefaultRename<Impl>::renameDestRegs(DynInstPtr &inst,unsigned tid)
 {
-    typename SimpleRenameMap::RenameInfo rename_result;
+    typename RenameMap::RenameInfo rename_result;
 
     unsigned num_dest_regs = inst->numDestRegs();
 
-    // If it's an instruction with no destination registers, then put
-    // a placeholder within the history buffer.  It might be better
-    // to not put it in the history buffer at all (other than branches,
-    // which always need at least a place holder), and differentiate
-    // between instructions with and without destination registers
-    // when getting from commit the instructions that committed.
-    if (num_dest_regs == 0) {
-        RenameHistory hb_entry(inst->seqNum);
+    // Rename the destination registers.
+    for (int dest_idx = 0; dest_idx < num_dest_regs; dest_idx++) {
+        RegIndex dest_reg = inst->destRegIdx(dest_idx);
 
-        historyBuffer.push_front(hb_entry);
+        // Get the physical register that the destination will be
+        // renamed to.
+        rename_result = renameMap[tid]->rename(dest_reg);
 
-        DPRINTF(Rename, "Rename: Adding placeholder instruction to "
-                "history buffer, sequence number %lli.\n",
-                inst->seqNum);
+        //Mark Scoreboard entry as not ready
+        scoreboard->unsetReg(rename_result.first);
 
-        ++renameHBPlaceHolders;
-    } else {
-
-        // Rename the destination registers.
-        for (int dest_idx = 0; dest_idx < num_dest_regs; dest_idx++)
-        {
-            RegIndex dest_reg = inst->destRegIdx(dest_idx);
-
-            // Get the physical register that the destination will be
-            // renamed to.
-            rename_result = renameMap->rename(dest_reg);
+        DPRINTF(Rename, "[tid:%u]: Renaming arch reg %i to physical "
+                "reg %i.\n", tid, (int)dest_reg,
+                (int)rename_result.first);
 
-            DPRINTF(Rename, "Rename: Renaming arch reg %i to physical "
-                    "reg %i.\n", (int)dest_reg,
-                    (int)rename_result.first);
+        // Record the rename information so that a history can be kept.
+        RenameHistory hb_entry(inst->seqNum, dest_reg,
+                               rename_result.first,
+                               rename_result.second);
 
-            // Record the rename information so that a history can be kept.
-            RenameHistory hb_entry(inst->seqNum, dest_reg,
-                                   rename_result.first,
-                                   rename_result.second);
+        historyBuffer[tid].push_front(hb_entry);
 
-            historyBuffer.push_front(hb_entry);
+        DPRINTF(Rename, "[tid:%u]: Adding instruction to history buffer, "
+                "[sn:%lli].\n",tid,
+                (*historyBuffer[tid].begin()).instSeqNum);
 
-            DPRINTF(Rename, "Rename: Adding instruction to history buffer, "
-                    "sequence number %lli.\n",
-                    (*historyBuffer.begin()).instSeqNum);
+        // Tell the instruction to rename the appropriate destination
+        // register (dest_idx) to the new physical register
+        // (rename_result.first), and record the previous physical
+        // register that the same logical register was renamed to
+        // (rename_result.second).
+        inst->renameDestReg(dest_idx,
+                            rename_result.first,
+                            rename_result.second);
 
-            // Tell the instruction to rename the appropriate destination
-            // register (dest_idx) to the new physical register
-            // (rename_result.first), and record the previous physical
-            // register that the same logical register was renamed to
-            // (rename_result.second).
-            inst->renameDestReg(dest_idx,
-                                rename_result.first,
-                                rename_result.second);
-
-            ++renameRenamedOperands;
-        }
+        ++renameRenamedOperands;
     }
 }
 
 template <class Impl>
 inline int
-SimpleRename<Impl>::calcFreeROBEntries()
+DefaultRename<Impl>::calcFreeROBEntries(unsigned tid)
 {
-    return fromCommit->commitInfo.freeROBEntries -
-        renameWidth * iewToRenameDelay;
+    int num_free = freeEntries[tid].robEntries -
+                  (instsInProgress[tid] - fromIEW->iewInfo[tid].dispatched);
+
+    //DPRINTF(Rename,"[tid:%i]: %i rob free\n",tid,num_free);
+
+    return num_free;
 }
 
 template <class Impl>
 inline int
-SimpleRename<Impl>::calcFreeIQEntries()
-{
-    return fromIEW->iewInfo.freeIQEntries - renameWidth * iewToRenameDelay;
-}
-
-template<class Impl>
-void
-SimpleRename<Impl>::tick()
+DefaultRename<Impl>::calcFreeIQEntries(unsigned tid)
 {
-    // Rename will need to try to rename as many instructions as it
-    // has bandwidth, unless it is blocked.
-
-    // Check if _status is BarrierStall.  If so, then check if the number
-    // of free ROB entries is equal to the number of total ROB entries.
-    // Once equal then wake this stage up.  Set status to unblocking maybe.
+    int num_free = freeEntries[tid].iqEntries -
+                  (instsInProgress[tid] - fromIEW->iewInfo[tid].dispatched);
 
-    if (_status != Blocked && _status != Squashing) {
-        DPRINTF(Rename, "Rename: Status is not blocked, will attempt to "
-                        "run stage.\n");
-        // Make sure that the skid buffer has something in it if the
-        // status is unblocking.
-        assert(_status == Unblocking ? !skidBuffer.empty() : 1);
+    //DPRINTF(Rename,"[tid:%i]: %i iq free\n",tid,num_free);
 
-        rename();
+    return num_free;
+}
 
-        // If the status was unblocking, then instructions from the skid
-        // buffer were used.  Remove those instructions and handle
-        // the rest of unblocking.
-        if (_status == Unblocking) {
-            ++renameUnblockCycles;
+template <class Impl>
+inline int
+DefaultRename<Impl>::calcFreeLSQEntries(unsigned tid)
+{
+    int num_free = freeEntries[tid].lsqEntries -
+                  (instsInProgress[tid] - fromIEW->iewInfo[tid].dispatchedToLSQ);
 
-            if (fromDecode->size > 0) {
-                // Add the current inputs onto the skid buffer, so they can be
-                // reprocessed when this stage unblocks.
-                skidBuffer.push(*fromDecode);
-            }
+    //DPRINTF(Rename,"[tid:%i]: %i lsq free\n",tid,num_free);
 
-            unblock();
-        }
-    } else if (_status == Blocked) {
-        ++renameBlockCycles;
+    return num_free;
+}
 
-        // If stage is blocked and still receiving valid instructions,
-        // make sure to store them in the skid buffer.
-        if (fromDecode->size > 0) {
+template <class Impl>
+unsigned
+DefaultRename<Impl>::validInsts()
+{
+    unsigned inst_count = 0;
 
-            block();
+    for (int i=0; i<fromDecode->size; i++) {
+        if (!fromDecode->insts[i]->squashed)
+            inst_count++;
+    }
 
-            // Continue to tell previous stage to stall.
-            toDecode->renameInfo.stall = true;
-        }
+    return inst_count;
+}
 
-        if (!fromIEW->iewInfo.stall &&
-            !fromCommit->commitInfo.stall &&
-            calcFreeROBEntries() > 0 &&
-            calcFreeIQEntries() > 0 &&
-            renameMap->numFreeEntries() > 0) {
-
-            // Need to be sure to check all blocking conditions above.
-            // If they have cleared, then start unblocking.
-            DPRINTF(Rename, "Rename: Stall signals cleared, going to "
-                    "unblock.\n");
-            _status = Unblocking;
-
-            // Continue to tell previous stage to block until this stage
-            // is done unblocking.
-            toDecode->renameInfo.stall = true;
-        } else {
-            // Otherwise no conditions have changed.  Tell previous
-            // stage to continue blocking.
-            toDecode->renameInfo.stall = true;
-        }
+template <class Impl>
+void
+DefaultRename<Impl>::readStallSignals(unsigned tid)
+{
+    if (fromIEW->iewBlock[tid]) {
+        stalls[tid].iew = true;
+    }
 
-        if (fromCommit->commitInfo.squash ||
-            fromCommit->commitInfo.robSquashing) {
-            squash();
-            return;
-        }
-    } else if (_status == Squashing) {
-        ++renameSquashCycles;
+    if (fromIEW->iewUnblock[tid]) {
+        assert(stalls[tid].iew);
+        stalls[tid].iew = false;
+    }
 
-        if (fromCommit->commitInfo.squash) {
-            squash();
-        } else if (!fromCommit->commitInfo.squash &&
-                   !fromCommit->commitInfo.robSquashing) {
+    if (fromCommit->commitBlock[tid]) {
+        stalls[tid].commit = true;
+    }
 
-            DPRINTF(Rename, "Rename: Done squashing, going to running.\n");
-            _status = Running;
-            rename();
-        } else {
-            doSquash();
-        }
+    if (fromCommit->commitUnblock[tid]) {
+        assert(stalls[tid].commit);
+        stalls[tid].commit = false;
     }
+}
 
-    // Ugly code, revamp all of the tick() functions eventually.
-    if (fromCommit->commitInfo.doneSeqNum != 0 && _status != Squashing) {
-#if !FULL_SYSTEM
-        if (!fromCommit->commitInfo.squash) {
-            removeFromHistory(fromCommit->commitInfo.doneSeqNum);
-        }
-#else
-        removeFromHistory(fromCommit->commitInfo.doneSeqNum);
-#endif
+template <class Impl>
+bool
+DefaultRename<Impl>::checkStall(unsigned tid)
+{
+    bool ret_val = false;
+    // Causes are tested in order; only the first one that matches is logged.
+    if (stalls[tid].iew) {
+        DPRINTF(Rename,"[tid:%i]: Stall from IEW stage detected.\n", tid);
+        ret_val = true;
+    } else if (stalls[tid].commit) {
+        DPRINTF(Rename,"[tid:%i]: Stall from Commit stage detected.\n", tid);
+        ret_val = true;
+    } else if (calcFreeROBEntries(tid) <= 0) {
+        DPRINTF(Rename,"[tid:%i]: Stall: ROB has 0 free entries.\n", tid);
+        ret_val = true;
+    } else if (calcFreeIQEntries(tid) <= 0) {
+        DPRINTF(Rename,"[tid:%i]: Stall: IQ has 0 free entries.\n", tid);
+        ret_val = true;
+    } else if (calcFreeLSQEntries(tid) <= 0) {
+        DPRINTF(Rename,"[tid:%i]: Stall: LSQ has 0 free entries.\n", tid);
+        ret_val = true;
+    } else if (renameMap[tid]->numFreeEntries() <= 0) {
+        DPRINTF(Rename,"[tid:%i]: Stall: RenameMap has 0 free entries.\n", tid);
+        ret_val = true;
+    } else if (renameStatus[tid] == BarrierStall &&
+               (!emptyROB[tid] || instsInProgress[tid])) {
+        DPRINTF(Rename,"[tid:%i]: Stall: Barrier stall and ROB is not "
+                "empty.\n",
+                tid);
+        ret_val = true;
     }
 
+    return ret_val;
 }
 
-template<class Impl>
+template <class Impl>
 void
-SimpleRename<Impl>::rename()
-{
-    // Check if any of the stages ahead of rename are telling rename
-    // to squash.  The squash() function will also take care of fixing up
-    // the rename map and the free list.
-    if (fromCommit->commitInfo.squash ||
-        fromCommit->commitInfo.robSquashing) {
-        DPRINTF(Rename, "Rename: Receiving signal from Commit to squash.\n");
-        squash();
-        return;
-    }
-
-    // Check if time buffer is telling this stage to stall.
-    if (fromIEW->iewInfo.stall ||
-        fromCommit->commitInfo.stall) {
-        DPRINTF(Rename, "Rename: Receiving signal from IEW/Commit to "
-                        "stall.\n");
-        block();
-        return;
+DefaultRename<Impl>::readFreeEntries(unsigned tid)
+{
+    bool updated = false;
+    if (fromIEW->iewInfo[tid].usedIQ) {
+        freeEntries[tid].iqEntries =
+            fromIEW->iewInfo[tid].freeIQEntries;
+        updated = true;
     }
 
-    // Check if the current status is squashing.  If so, set its status
-    // to running and resume execution the next cycle.
-    if (_status == Squashing) {
-        DPRINTF(Rename, "Rename: Done squashing.\n");
-        _status = Running;
-        return;
+    if (fromIEW->iewInfo[tid].usedLSQ) {
+        freeEntries[tid].lsqEntries =
+            fromIEW->iewInfo[tid].freeLSQEntries;
+        updated = true;
     }
 
-    // Check the decode queue to see if instructions are available.
-    // If there are no available instructions to rename, then do nothing.
-    // Or, if the stage is currently unblocking, then go ahead and run it.
-    if (fromDecode->size == 0 && _status != Unblocking) {
-        DPRINTF(Rename, "Rename: Nothing to do, breaking out early.\n");
-        // Should I change status to idle?
-        return;
+    if (fromCommit->commitInfo[tid].usedROB) {
+        freeEntries[tid].robEntries =
+            fromCommit->commitInfo[tid].freeROBEntries;
+        emptyROB[tid] = fromCommit->commitInfo[tid].emptyROB;
+        updated = true;
     }
 
-    ////////////////////////////////////
-    // Actual rename part.
-    ////////////////////////////////////
+    DPRINTF(Rename, "[tid:%i]: Free IQ: %i, Free ROB: %i, Free LSQ: %i\n",
+            tid,
+            freeEntries[tid].iqEntries,
+            freeEntries[tid].robEntries,
+            freeEntries[tid].lsqEntries);
 
-    DynInstPtr inst;
-
-    // If we're unblocking, then we may be in the middle of an instruction
-    // group.  Subtract off numInst to get the proper number of instructions
-    // left.
-    int insts_available = _status == Unblocking ?
-        skidBuffer.front().size - numInst :
-        fromDecode->size;
+    DPRINTF(Rename, "[tid:%i]: %i instructions not yet in ROB\n",
+            tid, instsInProgress[tid]);
+}
 
-    bool block_this_cycle = false;
+template <class Impl>
+bool
+DefaultRename<Impl>::checkSignalsAndUpdate(unsigned tid)
+{
+    // Check if there's a squash signal, squash if there is
+    // Check stall signals, block if necessary.
+    // If status was blocked
+    //     check if stall conditions have passed
+    //         if so then go to unblocking
+    // If status was Squashing
+    //     check if squashing is not high.  Switch to running this cycle.
+    // If status was barrier stall
+    //     check if ROB is empty and no insts are in flight to the ROB
+
+    readFreeEntries(tid);
+    readStallSignals(tid);
+
+    if (fromCommit->commitInfo[tid].squash) {
+        DPRINTF(Rename, "[tid:%u]: Squashing instructions due to squash from "
+                "commit.\n", tid);
+
+        squash(tid);
+
+        return true;
+    }
 
-    // Will have to do a different calculation for the number of free
-    // entries.  Number of free entries recorded on this cycle -
-    // renameWidth * renameToDecodeDelay
-    int free_rob_entries = calcFreeROBEntries();
-    int free_iq_entries = calcFreeIQEntries();
-    int min_iq_rob = min(free_rob_entries, free_iq_entries);
+    if (fromCommit->commitInfo[tid].robSquashing) {
+        DPRINTF(Rename, "[tid:%u]: ROB is still squashing.\n", tid);
 
-    unsigned to_iew_index = 0;
+        renameStatus[tid] = Squashing;
 
-    // Check if there's any space left.
-    if (min_iq_rob <= 0) {
-        DPRINTF(Rename, "Rename: Blocking due to no free ROB or IQ "
-                "entries.\n"
-                "Rename: ROB has %d free entries.\n"
-                "Rename: IQ has %d free entries.\n",
-                free_rob_entries,
-                free_iq_entries);
-        block();
-        // Tell previous stage to stall.
-        toDecode->renameInfo.stall = true;
+        return true;
+    }
 
-        if (free_rob_entries <= 0) {
-            ++renameROBFullEvents;
-        } else {
-            ++renameIQFullEvents;
-        }
+    if (checkStall(tid)) {
+        return block(tid);
+    }
 
-        return;
-    } else if (min_iq_rob < insts_available) {
-        DPRINTF(Rename, "Rename: Will have to block this cycle.  Only "
-                "%i insts can be renamed due to IQ/ROB limits.\n",
-                min_iq_rob);
+    if (renameStatus[tid] == Blocked) {
+        DPRINTF(Rename, "[tid:%u]: Done blocking, switching to unblocking.\n",
+                tid);
 
-        insts_available = min_iq_rob;
+        renameStatus[tid] = Unblocking;
 
-        block_this_cycle = true;
+        unblock(tid);
 
-        if (free_rob_entries < free_iq_entries) {
-            ++renameROBFullEvents;
-        } else {
-            ++renameIQFullEvents;
-        }
+        return true;
     }
 
-    while (insts_available > 0) {
-        DPRINTF(Rename, "Rename: Sending instructions to iew.\n");
-
-        // Get the next instruction either from the skid buffer or the
-        // decode queue.
-        inst = _status == Unblocking ? skidBuffer.front().insts[numInst] :
-               fromDecode->insts[numInst];
+    if (renameStatus[tid] == Squashing) {
+        // Switch status to running if rename isn't being told to block or
+        // squash this cycle.
+        DPRINTF(Rename, "[tid:%u]: Done squashing, switching to running.\n",
+                tid);
 
-        if (inst->isSquashed()) {
-            DPRINTF(Rename, "Rename: instruction %i with PC %#x is "
-                    "squashed, skipping.\n",
-                    inst->seqNum, inst->readPC());
+        renameStatus[tid] = Running;
 
-            // Go to the next instruction.
-            ++numInst;
+        return false;
+    }
 
-            ++renameSquashedInsts;
+    if (renameStatus[tid] == BarrierStall) {
+        // Stall ends once the ROB is free.
+        DPRINTF(Rename, "[tid:%u]: Done with barrier stall, switching to "
+                "unblocking.\n", tid);
 
-            // Decrement how many instructions are available.
-            --insts_available;
+        DynInstPtr barr_inst = barrierInst[tid];
 
-            continue;
-        }
+        renameStatus[tid] = Unblocking;
 
-        DPRINTF(Rename, "Rename: Processing instruction %i with PC %#x.\n",
-                inst->seqNum, inst->readPC());
-
-        // If it's a trap instruction, then it needs to wait here within
-        // rename until the ROB is empty.  Needs a way to detect that the
-        // ROB is empty.  Maybe an event?
-        // Would be nice if it could be avoided putting this into a
-        // specific stage and instead just put it into the AlphaFullCPU.
-        // Might not really be feasible though...
-        // (EXCB, TRAPB)
-        if (inst->isSerializing()) {
-            panic("Rename: Serializing instruction encountered.\n");
-            DPRINTF(Rename, "Rename: Serializing instruction "
-                            "encountered.\n");
+        unblock(tid);
 
-            // Change status over to BarrierStall so that other stages know
-            // what this is blocked on.
-            _status = BarrierStall;
+        DPRINTF(Rename, "[tid:%u]: Processing instruction [%lli] with "
+                "PC %#x.\n",
+                tid, barr_inst->seqNum, barr_inst->readPC());
 
-            block_this_cycle = true;
+        // Put instruction into queue here.
+        barr_inst->clearSerializeBefore();
 
-            break;
+        if (!skidBuffer[tid].empty()) {
+            skidBuffer[tid].push_front(barr_inst);
+        } else {
+            insts[tid].push_front(barr_inst);
         }
 
-        // Check here to make sure there are enough destination registers
-        // to rename to.  Otherwise block.
-        if (renameMap->numFreeEntries() < inst->numDestRegs())
-        {
-            DPRINTF(Rename, "Rename: Blocking due to lack of free "
-                            "physical registers to rename to.\n");
-            // Need some sort of event based on a register being freed.
-
-            block_this_cycle = true;
+        DPRINTF(Rename, "[tid:%u]: Instruction must be processed by rename."
+                " Adding to front of list.", tid);
 
-            ++renameFullRegistersEvents;
+        barrierInst[tid] = NULL;
 
-            break;
-        }
+        return true;
+    }
 
-        renameSrcRegs(inst);
+    // If we've reached this point, we have not gotten any signals that
+    // cause rename to change its status.  Rename remains the same as before.
+    return false;
+}
 
-        renameDestRegs(inst);
+template<class Impl>
+void
+DefaultRename<Impl>::serializeAfter(InstQueue &inst_list,
+                                   unsigned tid)
+{
+    if (!inst_list.empty()) {
+        // An instruction is already waiting: make it serialize-before.
+        inst_list.front()->setSerializeBefore();
+        return;
+    }
 
-        // Put instruction in rename queue.
-        toIEW->insts[to_iew_index] = inst;
-        ++(toIEW->size);
+    // Nothing queued yet: flag the next instruction to arrive instead.
+    serializeOnNextInst[tid] = true;
+}
 
-        // Decrease the number of free ROB and IQ entries.
-        --free_rob_entries;
-        --free_iq_entries;
+/** Bumps the "full" stall counter that matches the given source. */
+template <class Impl>
+inline void
+DefaultRename<Impl>::incrFullStat(const FullSource &source)
+{
+    if (source == ROB) {
+        // Stall attributed to the ROB.
+        ++renameROBFullEvents;
+    } else if (source == IQ) {
+        // Stall attributed to the IQ.
+        ++renameIQFullEvents;
+    } else if (source == LSQ) {
+        // Stall attributed to the LSQ.
+        ++renameLSQFullEvents;
+    } else {
+        // Any other source value is treated as a caller error.
+        panic("Rename full stall stat should be incremented for a reason!");
+    }
+}
 
-        // Increment which instruction we're on.
-        ++to_iew_index;
-        ++numInst;
+template <class Impl>
+void
+DefaultRename<Impl>::dumpHistory()
+{
+    typename list<RenameHistory>::iterator hist_it;
 
-        ++renameRenamedInsts;
+    for (int tid = 0; tid < numThreads; ++tid) {
 
-        // Decrement how many instructions are available.
-        --insts_available;
-    }
+        hist_it = historyBuffer[tid].begin();
 
-    // Check if there's any instructions left that haven't yet been renamed.
-    // If so then block.
-    if (block_this_cycle) {
-        block();
+        while (hist_it != historyBuffer[tid].end()) {
+            cprintf("Seq num: %i\nArch reg: %i New phys reg: %i Old phys "
+                    "reg: %i\n", hist_it->instSeqNum, (int)hist_it->archReg,
+                    (int)hist_it->newPhysReg, (int)hist_it->prevPhysReg);
 
-        toDecode->renameInfo.stall = true;
-    } else {
-        // If we had a successful rename and didn't have to exit early, then
-        // reset numInst so it will refer to the correct instruction on next
-        // run.
-        numInst = 0;
+            ++hist_it;
+        }
     }
 }
diff --git a/cpu/o3/rename_map.cc b/cpu/o3/rename_map.cc
index 10963f7de..8ba632e65 100644
--- a/cpu/o3/rename_map.cc
+++ b/cpu/o3/rename_map.cc
@@ -39,98 +39,94 @@ using namespace std;
 // determine if the register is a logical int, logical fp, physical int,
 // physical fp, etc.
 
-SimpleRenameMap::SimpleRenameMap(unsigned _numLogicalIntRegs,
-                                 unsigned _numPhysicalIntRegs,
-                                 unsigned _numLogicalFloatRegs,
-                                 unsigned _numPhysicalFloatRegs,
-                                 unsigned _numMiscRegs,
-                                 RegIndex _intZeroReg,
-                                 RegIndex _floatZeroReg)
-    : numLogicalIntRegs(_numLogicalIntRegs),
-      numPhysicalIntRegs(_numPhysicalIntRegs),
-      numLogicalFloatRegs(_numLogicalFloatRegs),
-      numPhysicalFloatRegs(_numPhysicalFloatRegs),
-      numMiscRegs(_numMiscRegs),
-      intZeroReg(_intZeroReg),
-      floatZeroReg(_floatZeroReg)
+SimpleRenameMap::~SimpleRenameMap()
+{
+    // The rename maps are std::vectors now and release their own storage.
+    //delete [] intRenameMap;
+    //delete [] floatRenameMap;
+}
+
+void
+SimpleRenameMap::init(unsigned _numLogicalIntRegs,
+                      unsigned _numPhysicalIntRegs,
+                      PhysRegIndex &ireg_idx,
+
+                      unsigned _numLogicalFloatRegs,
+                      unsigned _numPhysicalFloatRegs,
+                      PhysRegIndex &freg_idx,
+
+                      unsigned _numMiscRegs,
+
+                      RegIndex _intZeroReg,
+                      RegIndex _floatZeroReg,
+
+                      int map_id,
+                      bool bindRegs)
 {
-    DPRINTF(Rename, "Rename: Creating rename map.  Phys: %i / %i, Float: "
-            "%i / %i.\n", numLogicalIntRegs, numPhysicalIntRegs,
+    // Two-phase setup: the map is default-constructed and sized here.
+    // ireg_idx / freg_idx name the first physical register this map may
+    // use.  With bindRegs set they are advanced past the registers that
+    // are claimed; otherwise they are left untouched for the caller.
+    id = map_id;
+
+    numLogicalIntRegs = _numLogicalIntRegs;
+    numLogicalFloatRegs = _numLogicalFloatRegs;
+    numPhysicalIntRegs = _numPhysicalIntRegs;
+    numPhysicalFloatRegs = _numPhysicalFloatRegs;
+    numMiscRegs = _numMiscRegs;
+
+    intZeroReg = _intZeroReg;
+    floatZeroReg = _floatZeroReg;
+
+    DPRINTF(Rename, "Creating rename map %i.  Phys: %i / %i, Float: "
+            "%i / %i.\n", id, numLogicalIntRegs, numPhysicalIntRegs,
             numLogicalFloatRegs, numPhysicalFloatRegs);
 
     numLogicalRegs = numLogicalIntRegs + numLogicalFloatRegs;
 
     numPhysicalRegs = numPhysicalIntRegs + numPhysicalFloatRegs;
 
-    //Create the rename maps, and their scoreboards.
-    intRenameMap = new RenameEntry[numLogicalIntRegs];
-    floatRenameMap = new RenameEntry[numLogicalRegs];
-
-    // Should combine this into one scoreboard.
-    intScoreboard.resize(numPhysicalIntRegs);
-    floatScoreboard.resize(numPhysicalRegs);
-    miscScoreboard.resize(numPhysicalRegs + numMiscRegs);
-
-    // Initialize the entries in the integer rename map to point to the
-    // physical registers of the same index, and consider each register
-    // ready until the first rename occurs.
-    for (RegIndex index = 0; index < numLogicalIntRegs; ++index)
-    {
-        intRenameMap[index].physical_reg = index;
-        intScoreboard[index] = 1;
-    }
+    // Create the rename maps; the float map is indexed by unified arch reg.
+    intRenameMap.resize(numLogicalIntRegs);
+    floatRenameMap.resize(numLogicalRegs);
 
-    // Initialize the rest of the physical registers (the ones that don't
-    // directly map to a logical register) as unready.
-    for (PhysRegIndex index = numLogicalIntRegs;
-         index < numPhysicalIntRegs;
-         ++index)
-    {
-        intScoreboard[index] = 0;
-    }
+    if (bindRegs) {
+        DPRINTF(Rename, "Binding registers into rename map %i\n", id);
 
-    int float_reg_idx = numPhysicalIntRegs;
-
-    // Initialize the entries in the floating point rename map to point to
-    // the physical registers of the same index, and consider each register
-    // ready until the first rename occurs.
-    // Although the index refers purely to architected registers, because
-    // the floating reg indices come after the integer reg indices, they
-    // may exceed the size of a normal RegIndex (short).
-    for (PhysRegIndex index = numLogicalIntRegs;
-         index < numLogicalRegs; ++index)
-    {
-        floatRenameMap[index].physical_reg = float_reg_idx++;
-    }
+        // Point each integer rename entry at consecutive physical registers
+        // starting at ireg_idx, advancing the caller's index as we go.
+        for (RegIndex index = 0; index < numLogicalIntRegs; ++index)
+        {
+            intRenameMap[index].physical_reg = ireg_idx++;
+        }
 
-    for (PhysRegIndex index = numPhysicalIntRegs;
-         index < numPhysicalIntRegs + numLogicalFloatRegs; ++index)
-    {
-        floatScoreboard[index] = 1;
-    }
+        // Point each floating point rename entry at consecutive physical
+        // registers starting at freg_idx, advancing the caller's index.
+        // The float map is indexed by the unified architectural index,
+        // which may exceed the size of a normal RegIndex (short).
+        for (PhysRegIndex index = numLogicalIntRegs;
+             index < numLogicalRegs; ++index)
+        {
+            floatRenameMap[index].physical_reg = freg_idx++;
+        }
+    } else {
+        DPRINTF(Rename, "Not binding registers into rename map %i\n", id);
 
-    // Initialize the rest of the physical registers (the ones that don't
-    // directly map to a logical register) as unready.
-    for (PhysRegIndex index = numPhysicalIntRegs + numLogicalFloatRegs;
-         index < numPhysicalRegs;
-         ++index)
-    {
-        floatScoreboard[index] = 0;
-    }
+        PhysRegIndex temp_ireg = ireg_idx;
 
-    // Initialize the entries in the misc register scoreboard to be ready.
-    for (PhysRegIndex index = numPhysicalRegs;
-         index < numPhysicalRegs + numMiscRegs; ++index)
-    {
-        miscScoreboard[index] = 1;
-    }
-}
+        for (RegIndex index = 0; index < numLogicalIntRegs; ++index)
+        {
+            intRenameMap[index].physical_reg = temp_ireg++;
+        }
 
-SimpleRenameMap::~SimpleRenameMap()
-{
-    // Delete the rename maps as they were allocated with new.
-    delete [] intRenameMap;
-    delete [] floatRenameMap;
+        PhysRegIndex temp_freg = freg_idx;
+
+        for (PhysRegIndex index = numLogicalIntRegs;
+             index < numLogicalRegs; ++index)
+        {
+            floatRenameMap[index].physical_reg = temp_freg++;
+        }
+    }
 }
 
 void
@@ -167,8 +163,6 @@ SimpleRenameMap::rename(RegIndex arch_reg)
 
             assert(renamed_reg >= 0 && renamed_reg < numPhysicalIntRegs);
 
-            // Mark register as not ready.
-            intScoreboard[renamed_reg] = false;
         } else {
             // Otherwise return the zero register so nothing bad happens.
             renamed_reg = intZeroReg;
@@ -192,9 +186,6 @@ SimpleRenameMap::rename(RegIndex arch_reg)
 
             assert(renamed_reg < numPhysicalRegs &&
                    renamed_reg >= numPhysicalIntRegs);
-
-            // Mark register as not ready.
-            floatScoreboard[renamed_reg] = false;
         } else {
             // Otherwise return the zero register so nothing bad happens.
             renamed_reg = floatZeroReg;
@@ -215,8 +206,6 @@ SimpleRenameMap::rename(RegIndex arch_reg)
         prev_reg = renamed_reg;
 
         assert(renamed_reg < numPhysicalRegs + numMiscRegs);
-
-        miscScoreboard[renamed_reg] = false;
     }
 
     return RenameInfo(renamed_reg, prev_reg);
@@ -244,25 +233,6 @@ SimpleRenameMap::lookup(RegIndex arch_reg)
     }
 }
 
-bool
-SimpleRenameMap::isReady(PhysRegIndex phys_reg)
-{
-    if (phys_reg < numPhysicalIntRegs) {
-        return intScoreboard[phys_reg];
-    } else if (phys_reg < numPhysicalRegs) {
-
-        // Subtract off the base FP offset.
-//        phys_reg = phys_reg - numPhysicalIntRegs;
-
-        return floatScoreboard[phys_reg];
-    } else {
-        // Subtract off the misc registers offset.
-//        phys_reg = phys_reg - numPhysicalRegs;
-
-        return miscScoreboard[phys_reg];
-    }
-}
-
 // In this implementation the miscellaneous registers do not actually rename,
 // so this function does not allow you to try to change their mappings.
 void
@@ -273,14 +243,16 @@ SimpleRenameMap::setEntry(RegIndex arch_reg, PhysRegIndex renamed_reg)
                 (int)arch_reg, renamed_reg);
 
         intRenameMap[arch_reg].physical_reg = renamed_reg;
-    } else {
-        assert(arch_reg < (numLogicalIntRegs + numLogicalFloatRegs));
+    } else if (arch_reg < numLogicalIntRegs + numLogicalFloatRegs) {
+
 
         DPRINTF(Rename, "Rename Map: Float register %i being set to %i.\n",
                 (int)arch_reg - numLogicalIntRegs, renamed_reg);
 
         floatRenameMap[arch_reg].physical_reg = renamed_reg;
     }
+
+    //assert(arch_reg < (numLogicalIntRegs + numLogicalFloatRegs));
 }
 
 void
@@ -308,30 +280,6 @@ SimpleRenameMap::squash(vector<RegIndex> freed_regs,
     // Take unmap info and roll back the rename map.
 }
 
-void
-SimpleRenameMap::markAsReady(PhysRegIndex ready_reg)
-{
-    DPRINTF(Rename, "Rename map: Marking register %i as ready.\n",
-            (int)ready_reg);
-
-    if (ready_reg < numPhysicalIntRegs) {
-        assert(ready_reg >= 0);
-
-        intScoreboard[ready_reg] = 1;
-    } else if (ready_reg < numPhysicalRegs) {
-
-        // Subtract off the base FP offset.
-//        ready_reg = ready_reg - numPhysicalIntRegs;
-
-        floatScoreboard[ready_reg] = 1;
-    } else {
-        //Subtract off the misc registers offset.
-//        ready_reg = ready_reg - numPhysicalRegs;
-
-        miscScoreboard[ready_reg] = 1;
-    }
-}
-
 int
 SimpleRenameMap::numFreeEntries()
 {
diff --git a/cpu/o3/rename_map.hh b/cpu/o3/rename_map.hh
index 57be4a64a..3ecbe45c3 100644
--- a/cpu/o3/rename_map.hh
+++ b/cpu/o3/rename_map.hh
@@ -30,8 +30,8 @@
 // Have it so that there's a more meaningful name given to the variable
 // that marks the beginning of the FP registers.
 
-#ifndef __CPU_O3_CPU_RENAME_MAP_HH__
-#define __CPU_O3_CPU_RENAME_MAP_HH__
+#ifndef __CPU_O3_RENAME_MAP_HH__
+#define __CPU_O3_RENAME_MAP_HH__
 
 #include <iostream>
 #include <utility>
@@ -63,17 +63,27 @@ class SimpleRenameMap
 
   public:
     //Constructor
-    SimpleRenameMap(unsigned _numLogicalIntRegs,
-                    unsigned _numPhysicalIntRegs,
-                    unsigned _numLogicalFloatRegs,
-                    unsigned _numPhysicalFloatRegs,
-                    unsigned _numMiscRegs,
-                    RegIndex _intZeroReg,
-                    RegIndex _floatZeroReg);
+     SimpleRenameMap() {};
 
     /** Destructor. */
     ~SimpleRenameMap();
 
+    void init(unsigned _numLogicalIntRegs,
+              unsigned _numPhysicalIntRegs,
+              PhysRegIndex &_int_reg_start,
+
+              unsigned _numLogicalFloatRegs,
+              unsigned _numPhysicalFloatRegs,
+              PhysRegIndex &_float_reg_start,
+
+              unsigned _numMiscRegs,
+
+              RegIndex _intZeroReg,
+              RegIndex _floatZeroReg,
+
+              int id,
+              bool bindRegs);
+
     void setFreeList(SimpleFreeList *fl_ptr);
 
     //Tell rename map to get a free physical register for a given
@@ -84,15 +94,11 @@ class SimpleRenameMap
 
     PhysRegIndex lookup(RegIndex phys_reg);
 
-    bool isReady(PhysRegIndex arch_reg);
-
     /**
      * Marks the given register as ready, meaning that its value has been
      * calculated and written to the register file.
      * @param ready_reg The index of the physical register that is now ready.
      */
-    void markAsReady(PhysRegIndex ready_reg);
-
     void setEntry(RegIndex arch_reg, PhysRegIndex renamed_reg);
 
     void squash(std::vector<RegIndex> freed_regs,
@@ -101,6 +107,9 @@ class SimpleRenameMap
     int numFreeEntries();
 
   private:
+    /** Rename Map ID  */
+    int id;
+
     /** Number of logical integer registers. */
     int numLogicalIntRegs;
 
@@ -143,31 +152,17 @@ class SimpleRenameMap
         { }
     };
 
+    //Change this to private
+  public:
     /** Integer rename map. */
-    RenameEntry *intRenameMap;
+    std::vector<RenameEntry> intRenameMap;
 
     /** Floating point rename map. */
-    RenameEntry *floatRenameMap;
+    std::vector<RenameEntry> floatRenameMap;
 
+  private:
     /** Free list interface. */
     SimpleFreeList *freeList;
-
-    // Might want to make all these scoreboards into one large scoreboard.
-
-    /** Scoreboard of physical integer registers, saying whether or not they
-     *  are ready.
-     */
-    std::vector<bool> intScoreboard;
-
-    /** Scoreboard of physical floating registers, saying whether or not they
-     *  are ready.
-     */
-    std::vector<bool> floatScoreboard;
-
-    /** Scoreboard of miscellaneous registers, saying whether or not they
-     *  are ready.
-     */
-    std::vector<bool> miscScoreboard;
 };
 
-#endif //__CPU_O3_CPU_RENAME_MAP_HH__
+#endif //__CPU_O3_RENAME_MAP_HH__
diff --git a/cpu/o3/rob.hh b/cpu/o3/rob.hh
index 1185564ad..48199915f 100644
--- a/cpu/o3/rob.hh
+++ b/cpu/o3/rob.hh
@@ -26,23 +26,15 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-// Todo: Probably add in support for scheduling events (more than one as
-// well) on the case of the ROB being empty or full.  Considering tracking
-// free entries instead of insts in ROB.  Differentiate between squashing
-// all instructions after the instruction, and all instructions after *and*
-// including that instruction.
-
-#ifndef __CPU_O3_CPU_ROB_HH__
-#define __CPU_O3_CPU_ROB_HH__
+#ifndef __CPU_O3_ROB_HH__
+#define __CPU_O3_ROB_HH__
 
+#include <string>
 #include <utility>
 #include <vector>
 
 /**
- * ROB class.  Uses the instruction list that exists within the CPU to
- * represent the ROB.  This class doesn't contain that list, but instead
- * a pointer to the CPU to get access to the list.  The ROB, in this first
- * implementation, is largely what drives squashing.
+ * ROB class.  The ROB is largely what drives squashing.
  */
 template <class Impl>
 class ROB
@@ -54,16 +46,45 @@ class ROB
     typedef typename Impl::FullCPU FullCPU;
     typedef typename Impl::DynInstPtr DynInstPtr;
 
-    typedef std::pair<RegIndex, PhysRegIndex> UnmapInfo_t;
-    typedef typename list<DynInstPtr>::iterator InstIt_t;
+    typedef std::pair<RegIndex, PhysRegIndex> UnmapInfo;
+    typedef typename std::list<DynInstPtr>::iterator InstIt;
+
+    /** Possible ROB statuses. */
+    enum Status {
+        Running,
+        Idle,
+        ROBSquashing,
+        DcacheMissStall,
+        DcacheMissComplete
+    };
+
+    /** SMT ROB Sharing Policy */
+    enum ROBPolicy{
+        Dynamic,
+        Partitioned,
+        Threshold
+    };
+
+  private:
+    /** Per-thread ROB status. */
+    Status robStatus[Impl::MaxThreads];
+
+    /** ROB resource sharing policy for SMT mode. */
+    ROBPolicy robPolicy;
 
   public:
     /** ROB constructor.
-     *  @param _numEntries Number of entries in ROB.
-     *  @param _squashWidth Number of instructions that can be squashed in a
-     *                       single cycle.
+     *  @param _numEntries      Number of entries in ROB.
+     *  @param _squashWidth     Number of instructions that can be squashed in a
+     *                          single cycle.
+     *  @param smtROBPolicy     ROB Partitioning Scheme for SMT.
+     *  @param _smtROBThreshold Max Resources(by %) a thread can have in the ROB.
+     *  @param _numThreads      The number of active threads.
      */
-    ROB(unsigned _numEntries, unsigned _squashWidth);
+    ROB(unsigned _numEntries, unsigned _squashWidth, std::string smtROBPolicy,
+        unsigned _smtROBThreshold, unsigned _numThreads);
+
+    std::string name() const;
 
     /** Function to set the CPU pointer, necessary due to which object the ROB
      *  is created within.
@@ -71,12 +92,15 @@ class ROB
      */
     void setCPU(FullCPU *cpu_ptr);
 
-    /** Function to insert an instruction into the ROB.  The parameter inst is
-     *  not truly required, but is useful for checking correctness.  Note
-     *  that whatever calls this function must ensure that there is enough
-     *  space within the ROB for the new instruction.
+    /** Sets pointer to the list of active threads.
+     *  @param at_ptr Pointer to the list of active threads.
+     */
+    void setActiveThreads(std::list<unsigned>* at_ptr);
+
+    /** Function to insert an instruction into the ROB. Note that whatever
+     *  calls this function must ensure that there is enough space within the
+     *  ROB for the new instruction.
      *  @param inst The instruction being inserted into the ROB.
-     *  @todo Remove the parameter once correctness is ensured.
      */
     void insertInst(DynInstPtr &inst);
 
@@ -84,40 +108,134 @@ class ROB
      *  no guarantee as to the return value if the ROB is empty.
      *  @retval Pointer to the DynInst that is at the head of the ROB.
      */
-    DynInstPtr readHeadInst() { return cpu->instList.front(); }
+    DynInstPtr readHeadInst();
 
-    DynInstPtr readTailInst() { return (*tail); }
+    /** Returns a pointer to the head instruction of a specific thread within
+     *  the ROB.
+     *  @return Pointer to the DynInst that is at the head of the ROB.
+     */
+    DynInstPtr readHeadInst(unsigned tid);
+
+    /** Returns pointer to the tail instruction within the ROB.  There is
+     *  no guarantee as to the return value if the ROB is empty.
+     *  @retval Pointer to the DynInst that is at the tail of the ROB.
+     */
+    DynInstPtr readTailInst();
+
+    /** Returns a pointer to the tail instruction of a specific thread within
+     *  the ROB.
+     *  @return Pointer to the DynInst that is at the tail of the ROB.
+     */
+    DynInstPtr readTailInst(unsigned tid);
 
+    /** Retires the head instruction, removing it from the ROB. */
     void retireHead();
 
+    /** Retires the head instruction of a specific thread, removing it from the
+     *  ROB.
+     */
+    void retireHead(unsigned tid);
+
+    /** Is the oldest instruction across all threads ready. */
     bool isHeadReady();
 
+    /** Is the oldest instruction across a particular thread ready. */
+    bool isHeadReady(unsigned tid);
+
+    /** Is there any committable head instruction across all threads ready. */
+    bool canCommit();
+
+    /** Re-adjust ROB partitioning. */
+    void resetEntries();
+
+    /** Number of entries needed For 'num_threads' amount of threads. */
+    int entryAmount(int num_threads);
+
+    /** Returns the number of total free entries in the ROB. */
     unsigned numFreeEntries();
 
+    /** Returns the number of free entries in a specific ROB partition. */
+    unsigned numFreeEntries(unsigned tid);
+
+    /** Returns the maximum number of entries for a specific thread. */
+    unsigned getMaxEntries(unsigned tid)
+    { return maxEntries[tid]; }
+
+    /** Returns the number of entries being used by a specific thread. */
+    unsigned getThreadEntries(unsigned tid)
+    { return threadEntries[tid]; }
+
+    /** Returns if the ROB is full. */
     bool isFull()
     { return numInstsInROB == numEntries; }
 
+    /** Returns if a specific thread's partition (its maxEntries) is full. */
+    bool isFull(unsigned tid)
+    { return threadEntries[tid] >= maxEntries[tid]; }
+
+    /** Returns if the ROB is empty. */
     bool isEmpty()
     { return numInstsInROB == 0; }
 
-    void doSquash();
+    /** Returns if a specific thread's partition is empty. */
+    bool isEmpty(unsigned tid)
+    { return threadEntries[tid] == 0; }
+
+    /** Executes the squash, marking squashed instructions. */
+    void doSquash(unsigned tid);
+
+    /** Squashes all instructions younger than the given sequence number for
+     *  the specific thread.
+     */
+    void squash(InstSeqNum squash_num, unsigned tid);
 
-    void squash(InstSeqNum squash_num);
+    /** Updates the head instruction with the new oldest instruction. */
+    void updateHead();
 
+    /** Updates the tail instruction with the new youngest instruction. */
+    void updateTail();
+
+    /** Reads the PC of the oldest head instruction. */
     uint64_t readHeadPC();
 
+    /** Reads the PC of the head instruction of a specific thread. */
+    uint64_t readHeadPC(unsigned tid);
+
+    /** Reads the next PC of the oldest head instruction. */
     uint64_t readHeadNextPC();
 
+    /** Reads the next PC of the head instruction of a specific thread. */
+    uint64_t readHeadNextPC(unsigned tid);
+
+    /** Reads the sequence number of the oldest head instruction. */
     InstSeqNum readHeadSeqNum();
 
+    /** Reads the sequence number of the head instruction of a specific thread.
+     */
+    InstSeqNum readHeadSeqNum(unsigned tid);
+
+    /** Reads the PC of the youngest tail instruction. */
     uint64_t readTailPC();
 
+    /** Reads the PC of the tail instruction of a specific thread. */
+    uint64_t readTailPC(unsigned tid);
+
+    /** Reads the sequence number of the youngest tail instruction. */
     InstSeqNum readTailSeqNum();
 
+    /** Reads the sequence number of tail instruction of a specific thread. */
+    InstSeqNum readTailSeqNum(unsigned tid);
+
     /** Checks if the ROB is still in the process of squashing instructions.
      *  @retval Whether or not the ROB is done squashing.
      */
-    bool isDoneSquashing() const { return doneSquashing; }
+    bool isDoneSquashing(unsigned tid) const
+    { return doneSquashing[tid]; }
+
+    /** Checks if the ROB is still in the process of squashing instructions for
+     *  any thread.
+     */
+    bool isDoneSquashing();
 
     /** This is more of a debugging function than anything.  Use
      *  numInstsInROB to get the instructions in the ROB unless you are
@@ -125,23 +243,46 @@ class ROB
      */
     int countInsts();
 
-  private:
+    /** This is more of a debugging function than anything.  Use
+     *  threadEntries to get the instructions in the ROB unless you are
+     *  double checking that variable.
+     */
+    int countInsts(unsigned tid);
 
+  private:
     /** Pointer to the CPU. */
     FullCPU *cpu;
 
+    /** Active Threads in CPU */
+    std::list<unsigned>* activeThreads;
+
     /** Number of instructions in the ROB. */
     unsigned numEntries;
 
+    /** Entries Per Thread */
+    unsigned threadEntries[Impl::MaxThreads];
+
+    /** Max Insts a Thread Can Have in the ROB */
+    unsigned maxEntries[Impl::MaxThreads];
+
+    /** ROB List of Instructions */
+    std::list<DynInstPtr> instList[Impl::MaxThreads];
+
     /** Number of instructions that can be squashed in a single cycle. */
     unsigned squashWidth;
 
+  public:
     /** Iterator pointing to the instruction which is the last instruction
      *  in the ROB.  This may at times be invalid (ie when the ROB is empty),
      *  however it should never be incorrect.
      */
-    InstIt_t tail;
+    InstIt tail;
 
+    /** Iterator pointing to the instruction which is the first instruction
+     *  in the ROB. */
+    InstIt head;
+
+  private:
     /** Iterator used for walking through the list of instructions when
      *  squashing.  Used so that there is persistent state between cycles;
      *  when squashing, the instructions are marked as squashed but not
@@ -149,16 +290,23 @@ class ROB
      *  and after a squash.
      *  This will always be set to cpu->instList.end() if it is invalid.
      */
-    InstIt_t squashIt;
+    InstIt squashIt[Impl::MaxThreads];
 
+  public:
     /** Number of instructions in the ROB. */
     int numInstsInROB;
 
+    DynInstPtr dummyInst;
+
+  private:
     /** The sequence number of the squashed instruction. */
     InstSeqNum squashedSeqNum;
 
     /** Is the ROB done squashing. */
-    bool doneSquashing;
+    bool doneSquashing[Impl::MaxThreads];
+
+    /** Number of active threads. */
+    unsigned numThreads;
 };
 
-#endif //__CPU_O3_CPU_ROB_HH__
+#endif //__CPU_O3_ROB_HH__
diff --git a/cpu/o3/rob_impl.hh b/cpu/o3/rob_impl.hh
index e7a5671d9..96d907cda 100644
--- a/cpu/o3/rob_impl.hh
+++ b/cpu/o3/rob_impl.hh
@@ -26,20 +26,74 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __CPU_O3_CPU_ROB_IMPL_HH__
-#define __CPU_O3_CPU_ROB_IMPL_HH__
-
 #include "config/full_system.hh"
 #include "cpu/o3/rob.hh"
 
+using namespace std;
+
 template <class Impl>
-ROB<Impl>::ROB(unsigned _numEntries, unsigned _squashWidth)
+ROB<Impl>::ROB(unsigned _numEntries, unsigned _squashWidth,
+               string _smtROBPolicy, unsigned _smtROBThreshold,
+               unsigned _numThreads)
     : numEntries(_numEntries),
       squashWidth(_squashWidth),
       numInstsInROB(0),
-      squashedSeqNum(0)
+      squashedSeqNum(0),
+      numThreads(_numThreads)
 {
-    doneSquashing = true;
+    for (unsigned tid = 0; tid < numThreads; tid++) {
+        doneSquashing[tid] = true;
+        threadEntries[tid] = 0;
+    }
+
+    string policy = _smtROBPolicy;
+
+    // The policy name is matched case-insensitively.
+    std::transform(policy.begin(), policy.end(), policy.begin(),
+                   (int(*)(int)) tolower);
+
+    // Pick the sharing policy and the per-thread entry limit it implies.
+    if (policy == "dynamic") {
+        robPolicy = Dynamic;
+
+        // Every thread may use the whole ROB.
+        for (unsigned i = 0; i < numThreads; i++) {
+            maxEntries[i] = numEntries;
+        }
+
+    } else if (policy == "partitioned") {
+        robPolicy = Partitioned;
+        DPRINTF(ROB, "ROB sharing policy set to Partitioned\n");
+
+        //@todo: make work if numEntries doesn't divide evenly.
+        unsigned part_amt = numEntries / numThreads;
+
+        // Each thread gets an equal share of the ROB.
+        for (unsigned i = 0; i < numThreads; i++) {
+            maxEntries[i] = part_amt;
+        }
+
+    } else if (policy == "threshold") {
+        robPolicy = Threshold;
+        DPRINTF(ROB, "ROB sharing policy set to Threshold\n");
+
+        unsigned threshold = _smtROBThreshold;
+        // NOTE(review): rob.hh documents the threshold as "by %", but it
+        // is applied here as a raw per-thread entry count -- confirm.
+        for (unsigned i = 0; i < numThreads; i++) {
+            maxEntries[i] = threshold;
+        }
+    } else {
+        assert(0 && "Invalid ROB Sharing Policy.Options Are:{Dynamic,"
+                    "Partitioned, Threshold}");
+    }
+}
+
+template <class Impl>
+std::string
+ROB<Impl>::name() const
+{
+    return cpu->name() + ".rob";  // cpu is assigned in setCPU(), not the ctor
 }
 
 template <class Impl>
@@ -48,49 +102,74 @@ ROB<Impl>::setCPU(FullCPU *cpu_ptr)
 {
     cpu = cpu_ptr;
 
-    // Set the tail to the beginning of the CPU instruction list so that
-    // upon the first instruction being inserted into the ROB, the tail
-    // iterator can simply be incremented.
-    tail = cpu->instList.begin();
+    // Set the per-thread iterators to the end of the instruction list.
+    for (int i=0; i < numThreads;i++) {
+        squashIt[i] = instList[i].end();
+    }
 
-    // Set the squash iterator to the end of the instruction list.
-    squashIt = cpu->instList.end();
+    // Initialize the "universal" ROB head & tail point to invalid
+    // pointers
+    head = instList[0].end();
+    tail = instList[0].end();
 }
 
 template <class Impl>
-int
-ROB<Impl>::countInsts()
+void
+ROB<Impl>::setActiveThreads(list<unsigned> *at_ptr)
 {
-    // Start at 1; if the tail matches cpu->instList.begin(), then there is
-    // one inst in the ROB.
-    int return_val = 1;
+    DPRINTF(ROB, "Setting active threads list pointer.\n");
+    activeThreads = at_ptr;  // stores the pointer; the list is not copied
+}
 
-    // There are quite a few special cases.  Do not use this function other
-    // than for debugging purposes.
-    if (cpu->instList.begin() == cpu->instList.end()) {
-        // In this case there are no instructions in the list.  The ROB
-        // must be empty.
-        return 0;
-    } else if (tail == cpu->instList.end()) {
-        // In this case, the tail is not yet pointing to anything valid.
-        // The ROB must be empty.
-        return 0;
+
+template <class Impl>
+void
+ROB<Impl>::resetEntries()
+{
+    // Recompute per-thread entry limits from the active thread list.
+    if (robPolicy != Dynamic || numThreads > 1) {
+        int active_threads = (*activeThreads).size();
+        // Advance in the loop header so a thread that matches neither branch
+        // below still moves the iterator on (otherwise the loop never ends).
+        for (list<unsigned>::iterator threads = (*activeThreads).begin();
+             threads != (*activeThreads).end(); ++threads) {
+            if (robPolicy == Partitioned) {
+                maxEntries[*threads] = numEntries / active_threads;
+            } else if (robPolicy == Threshold && active_threads == 1) {
+                maxEntries[*threads] = numEntries;
+            }
+        }
     }
+}
 
-    // Iterate through the ROB from the head to the tail, counting the
-    // entries.
-    for (InstIt_t i = cpu->instList.begin(); i != tail; ++i)
-    {
-        assert(i != cpu->instList.end());
-        ++return_val;
+template <class Impl>
+int
+ROB<Impl>::entryAmount(int num_threads)
+{
+    if (robPolicy == Partitioned) {
+        return numEntries / num_threads;  // num_threads must be nonzero
+    } else {
+        return 0;  // every other policy reports 0
     }
+}
+
+template <class Impl>
+int
+ROB<Impl>::countInsts()
+{
+    int total=0;  // summed over all numThreads lists, not just active ones
 
-    return return_val;
+    for (int i=0;i < numThreads;i++)
+        total += countInsts(i);
 
-    // Because the head won't be tracked properly until the ROB gets the
-    // first instruction, and any time that the ROB is empty and has not
-    // yet gotten the instruction, this function doesn't work.
-//    return numInstsInROB;
+    return total;
+}
+
+template <class Impl>
+int
+ROB<Impl>::countInsts(unsigned tid)
+{
+    return instList[tid].size();  // entries currently in thread tid's list
 }
 
 template <class Impl>
@@ -98,33 +177,42 @@ void
 ROB<Impl>::insertInst(DynInstPtr &inst)
 {
     // Make sure we have the right number of instructions.
-    assert(numInstsInROB == countInsts());
+    //assert(numInstsInROB == countInsts());
+
     // Make sure the instruction is valid.
     assert(inst);
 
-    DPRINTF(ROB, "ROB: Adding inst PC %#x to the ROB.\n", inst->readPC());
+    DPRINTF(ROB, "Adding inst PC %#x to the ROB.\n", inst->readPC());
 
     // If the ROB is full then exit.
     assert(numInstsInROB != numEntries);
 
-    ++numInstsInROB;
+    int tid = inst->threadNumber;
 
-    // Increment the tail iterator, moving it one instruction back.
-    // There is a special case if the ROB was empty prior to this insertion,
-    // in which case the tail will be pointing at instList.end().  If that
-    // happens, then reset the tail to the beginning of the list.
-    if (tail != cpu->instList.end()) {
-        ++tail;
-    } else {
-        tail = cpu->instList.begin();
+    // Append to the owning thread's list, which stays in insertion order.
+    instList[tid].push_back(inst);
+
+    // The first instruction into an empty ROB also becomes the global head.
+    if (numInstsInROB == 0) {
+        head = instList[tid].begin();
+        assert((*head) == inst);
     }
 
-    // Make sure the tail iterator is actually pointing at the instruction
-    // added.
-    assert((*tail) == inst);
+    // end() is one past the last element, so step back once to make tail
+    // refer to the instruction that was just inserted.
+    tail = instList[tid].end();
+    tail--;
+
+    // Mark as set in ROB
+    inst->setInROB();
 
-    DPRINTF(ROB, "ROB: Now has %d instructions.\n", numInstsInROB);
+    // Bump both the global and the per-thread occupancy counts.
+    ++numInstsInROB;
+    ++threadEntries[tid];
 
+    assert((*tail) == inst);
+
+    DPRINTF(ROB, "[tid:%i] Now has %d instructions.\n", tid, threadEntries[tid]);
 }
 
 // Whatever calls this function needs to ensure that it properly frees up
@@ -133,31 +221,55 @@ template <class Impl>
 void
 ROB<Impl>::retireHead()
 {
-    assert(numInstsInROB == countInsts());
+    // Retire whichever instruction the global head iterator points at.
+    assert(numInstsInROB > 0);
+
+    // Get the head ROB instruction's TID.
+    int tid = (*head)->threadNumber;
+
+    retireHead(tid);
+
+    // Once the ROB is empty, park tail on an end() sentinel.
+    if (numInstsInROB == 0) {
+        tail = instList[tid].end();
+    }
+}
+
+template <class Impl>
+void
+ROB<Impl>::retireHead(unsigned tid)
+{
+    // Retire the front entry of thread tid's list, which must be non-empty.
     assert(numInstsInROB > 0);
 
     // Get the head ROB instruction.
-    DynInstPtr head_inst = cpu->instList.front();
+    InstIt head_it = instList[tid].begin();
+
+    DynInstPtr head_inst = (*head_it);
 
     // Make certain this can retire.
     assert(head_inst->readyToCommit());
 
-    DPRINTF(ROB, "ROB: Retiring head instruction of the ROB, "
-            "instruction PC %#x, seq num %i\n", head_inst->readPC(),
+    DPRINTF(ROB, "[tid:%u]: Retiring head instruction, "
+            "instruction PC %#x,[sn:%lli]\n", tid, head_inst->readPC(),
             head_inst->seqNum);
 
     // Keep track of how many instructions are in the ROB.
     --numInstsInROB;
+    --threadEntries[tid];
+
+    // The instruction leaves the ROB and is flagged as committed.
+    head_inst->removeInROB();
+    head_inst->setCommitted();
+    // NOTE(review): erase() invalidates tail if it referred to this entry.
+    instList[tid].erase(head_it);
+
+    // Re-select the global head now that this thread's front is gone.
+    updateHead();
 
-    // Tell CPU to remove the instruction from the list of instructions.
     // A special case is needed if the instruction being retired is the
     // only instruction in the ROB; otherwise the tail iterator will become
     // invalidated.
     cpu->removeFrontInst(head_inst);
-
-    if (numInstsInROB == 0) {
-        tail = cpu->instList.end();
-    }
 }
 
 template <class Impl>
@@ -165,7 +277,36 @@ bool
 ROB<Impl>::isHeadReady()
 {
     if (numInstsInROB != 0) {
-        return cpu->instList.front()->readyToCommit();
+        return (*head)->readyToCommit();
+    }
+
+    return false;  // an empty ROB has nothing ready to commit
+}
+
+template <class Impl>
+bool
+ROB<Impl>::isHeadReady(unsigned tid)
+{
+    if (threadEntries[tid] != 0) {
+        return instList[tid].front()->readyToCommit();
+    }
+
+    return false;  // thread tid holds no entries
+}
+
+template <class Impl>
+bool
+ROB<Impl>::canCommit()
+{
+    //@todo: set ActiveThreads through ROB or CPU
+    list<unsigned>::iterator threads = (*activeThreads).begin();
+
+    while (threads != (*activeThreads).end()) {
+        unsigned tid = *threads++;
+
+        if (isHeadReady(tid)) {
+            return true;
+        }
     }
 
     return false;
@@ -175,130 +316,339 @@ template <class Impl>
 unsigned
 ROB<Impl>::numFreeEntries()
 {
-    assert(numInstsInROB == countInsts());
+    // Free slots across the whole ROB; per-thread limits are not considered.
 
     return numEntries - numInstsInROB;
 }
 
+template <class Impl>
+unsigned
+ROB<Impl>::numFreeEntries(unsigned tid)
+{
+    return maxEntries[tid] - threadEntries[tid];  // wraps if over limit
+}
+
 template <class Impl>
 void
-ROB<Impl>::doSquash()
+ROB<Impl>::doSquash(unsigned tid)
 {
-    DPRINTF(ROB, "ROB: Squashing instructions.\n");
+    DPRINTF(ROB, "[tid:%u]: Squashing instructions until [sn:%i].\n",
+            tid, squashedSeqNum);
+    // Walks backwards from squashIt[tid], at most squashWidth insts per call.
+    assert(squashIt[tid] != instList[tid].end());
+    // Already older than the squash point: nothing to do, finish right away.
+    if ((*squashIt[tid])->seqNum < squashedSeqNum) {
+        DPRINTF(ROB, "[tid:%u]: Done squashing instructions.\n",
+                tid);
 
-    assert(squashIt != cpu->instList.end());
+        squashIt[tid] = instList[tid].end();
+
+        doneSquashing[tid] = true;
+        return;
+    }
+    // Set when this pass squashes the thread's last entry (see updateTail).
+    bool robTailUpdate = false;
 
     for (int numSquashed = 0;
-         numSquashed < squashWidth && (*squashIt)->seqNum != squashedSeqNum;
+         numSquashed < squashWidth &&
+         squashIt[tid] != instList[tid].end() &&
+         (*squashIt[tid])->seqNum > squashedSeqNum;
          ++numSquashed)
     {
-        // Ensure that the instruction is younger.
-        assert((*squashIt)->seqNum > squashedSeqNum);
-
-        DPRINTF(ROB, "ROB: Squashing instruction PC %#x, seq num %i.\n",
-                (*squashIt)->readPC(), (*squashIt)->seqNum);
+        DPRINTF(ROB, "[tid:%u]: Squashing instruction PC %#x, seq num %i.\n",
+                (*squashIt[tid])->threadNumber,
+                (*squashIt[tid])->readPC(),
+                (*squashIt[tid])->seqNum);
 
         // Mark the instruction as squashed, and ready to commit so that
         // it can drain out of the pipeline.
-        (*squashIt)->setSquashed();
-
-        (*squashIt)->setCanCommit();
-
-        // Special case for when squashing due to a syscall.  It's possible
-        // that the squash happened after the head instruction was already
-        // committed, meaning that (*squashIt)->seqNum != squashedSeqNum
-        // will never be false.  Normally the squash would never be able
-        // to go past the head of the ROB; in this case it might, so it
-        // must be handled otherwise it will segfault.
-#if !FULL_SYSTEM
-        if (squashIt == cpu->instList.begin()) {
-            DPRINTF(ROB, "ROB: Reached head of instruction list while "
+        (*squashIt[tid])->setSquashed();
+
+        (*squashIt[tid])->setCanCommit();
+        // At the list head, stop before the iterator would pass begin().
+        // NOTE(review): this return skips the updateTail() call below.
+        if (squashIt[tid] == instList[tid].begin()) {
+            DPRINTF(ROB, "Reached head of instruction list while "
                     "squashing.\n");
 
-            squashIt = cpu->instList.end();
+            squashIt[tid] = instList[tid].end();
 
-            doneSquashing = true;
+            doneSquashing[tid] = true;
 
             return;
         }
-#endif
 
-        // Move the tail iterator to the next instruction.
-        squashIt--;
+        InstIt tail_thread = instList[tid].end();
+        tail_thread--;
+
+        if ((*squashIt[tid]) == (*tail_thread))
+            robTailUpdate = true;
+
+        squashIt[tid]--;
     }
 
 
     // Check if ROB is done squashing.
-    if ((*squashIt)->seqNum == squashedSeqNum) {
-        DPRINTF(ROB, "ROB: Done squashing instructions.\n");
+    if ((*squashIt[tid])->seqNum <= squashedSeqNum) {
+        DPRINTF(ROB, "[tid:%u]: Done squashing instructions.\n",
+                tid);
 
-        squashIt = cpu->instList.end();
+        squashIt[tid] = instList[tid].end();
+
+        doneSquashing[tid] = true;
+    }
 
-        doneSquashing = true;
+    if (robTailUpdate) {
+        updateTail();
     }
 }
 
+
 template <class Impl>
 void
-ROB<Impl>::squash(InstSeqNum squash_num)
+ROB<Impl>::updateHead()
 {
-    DPRINTF(ROB, "ROB: Starting to squash within the ROB.\n");
-    doneSquashing = false;
+    // Point head at the front entry with the lowest seqNum of any thread.
+    InstSeqNum lowest_num = 0;
+    bool first_valid = true;
+
+    // @todo: set ActiveThreads through ROB or CPU
+    list<unsigned>::iterator threads = (*activeThreads).begin();
+
+    while (threads != (*activeThreads).end()) {
+        unsigned thread_num = *threads++;
+        // Threads with no entries cannot hold the head.
+        if (instList[thread_num].empty())
+            continue;
+        // The first non-empty thread seeds the search without comparing.
+        if (first_valid) {
+            head = instList[thread_num].begin();
+            lowest_num = (*head)->seqNum;
+            first_valid = false;
+            continue;
+        }
+
+        InstIt head_thread = instList[thread_num].begin();
+
+        DynInstPtr head_inst = (*head_thread);
+
+        assert(head_inst != 0);
+
+        if (head_inst->seqNum < lowest_num) {
+            head = head_thread;
+            lowest_num = head_inst->seqNum;
+        }
+    }
+    // No active thread had entries: park head on the end() sentinel.
+    if (first_valid) {
+        head = instList[0].end();
+    }
+
+}
+
+template <class Impl>
+void
+ROB<Impl>::updateTail()
+{
+    // Recompute the global tail: among active threads that hold entries,
+    // pick the last entry with the highest sequence number.  With no
+    // entries anywhere, tail is left at instList[0].end().
+    tail = instList[0].end();
+    bool have_tail = false;
+
+    for (list<unsigned>::iterator it = (*activeThreads).begin();
+         it != (*activeThreads).end(); ++it) {
+        unsigned tid = *it;
+
+        // An empty thread contributes no candidate.
+        if (instList[tid].empty()) {
+            continue;
+        }
+
+        // end() is one past the last element, so step back once.
+        InstIt candidate = instList[tid].end();
+        candidate--;
+
+        // The first candidate is accepted outright; each later one must
+        // be younger (higher seqNum) than the tail chosen so far.
+        if (!have_tail) {
+            tail = candidate;
+            have_tail = true;
+            continue;
+        }
+        if ((*candidate)->seqNum > (*tail)->seqNum) {
+            tail = candidate;
+        }
+    }
+}
+
+
+template <class Impl>
+void
+ROB<Impl>::squash(InstSeqNum squash_num,unsigned tid)
+{
+    // Bail out when this thread's own list is empty.  Testing the whole
+    // ROB would leave doneSquashing[tid] false with nothing to squash.
+    if (instList[tid].empty()) {
+        DPRINTF(ROB, "Does not need to squash due to being empty "
+                "[sn:%i]\n", squash_num);
+        return;
+    }
+
+    DPRINTF(ROB, "Starting to squash within the ROB.\n");
+
+    robStatus[tid] = ROBSquashing;
+
+    doneSquashing[tid] = false;
 
     squashedSeqNum = squash_num;
 
-    assert(tail != cpu->instList.end());
+    // Start at the thread's last entry; end() is one past it.
+    InstIt tail_thread = instList[tid].end();
+    tail_thread--;
 
-    squashIt = tail;
+    squashIt[tid] = tail_thread;
 
-    doSquash();
+    // Squash the first batch now; doSquash() handles at most squashWidth.
+    doSquash(tid);
+}
+
+template <class Impl>
+typename Impl::DynInstPtr
+ROB<Impl>::readHeadInst()
+{
+    if (numInstsInROB != 0) {
+        assert((*head)->isInROB()==true);
+        return *head;
+    } else {
+        return dummyInst;  // empty ROB: return the dummyInst member instead
+    }
+}
+
+template <class Impl>
+typename Impl::DynInstPtr
+ROB<Impl>::readHeadInst(unsigned tid)
+{
+    if (threadEntries[tid] != 0) {
+        InstIt head_thread = instList[tid].begin();
+
+        assert((*head_thread)->isInROB()==true);
+
+        return *head_thread;
+    } else {
+        return dummyInst;  // thread holds no entries: return dummyInst
+    }
 }
 
 template <class Impl>
 uint64_t
 ROB<Impl>::readHeadPC()
 {
-    assert(numInstsInROB == countInsts());
+    // No emptiness check: head must point at a live entry when called.
 
-    DynInstPtr head_inst = cpu->instList.front();
+    DynInstPtr head_inst = *head;
 
     return head_inst->readPC();
 }
 
+template <class Impl>
+uint64_t
+ROB<Impl>::readHeadPC(unsigned tid)
+{
+    // No emptiness check: thread tid's list must be non-empty when called.
+    InstIt head_thread = instList[tid].begin();
+
+    return (*head_thread)->readPC();
+}
+
+
 template <class Impl>
 uint64_t
 ROB<Impl>::readHeadNextPC()
 {
-    assert(numInstsInROB == countInsts());
+    // No emptiness check: head must point at a live entry when called.
 
-    DynInstPtr head_inst = cpu->instList.front();
+    DynInstPtr head_inst = *head;
 
     return head_inst->readNextPC();
 }
 
+template <class Impl>
+uint64_t
+ROB<Impl>::readHeadNextPC(unsigned tid)
+{
+    // No emptiness check: thread tid's list must be non-empty when called.
+    InstIt head_thread = instList[tid].begin();
+
+    return (*head_thread)->readNextPC();
+}
+
+
 template <class Impl>
 InstSeqNum
 ROB<Impl>::readHeadSeqNum()
 {
-    // Return the last sequence number that has not been squashed.  Other
-    // stages can use it to squash any instructions younger than the current
-    // tail.
-    DynInstPtr head_inst = cpu->instList.front();
+    // No emptiness check: head must point at a live entry when called.
+    DynInstPtr head_inst = *head;
 
     return head_inst->seqNum;
 }
 
+template <class Impl>
+InstSeqNum
+ROB<Impl>::readHeadSeqNum(unsigned tid)
+{
+    InstIt head_thread = instList[tid].begin();  // list must be non-empty
+
+    return ((*head_thread)->seqNum);
+}
+
+template <class Impl>
+typename Impl::DynInstPtr
+ROB<Impl>::readTailInst()
+{
+    // tail is dereferenced unchecked.  It is an end() sentinel while the
+    // ROB is empty, so callers must only use this on a non-empty ROB.
+
+    return (*tail);
+}
+
+template <class Impl>
+typename Impl::DynInstPtr
+ROB<Impl>::readTailInst(unsigned tid)
+{
+    // Thread tid's list must be non-empty: end() is decremented unchecked.
+
+    InstIt tail_thread = instList[tid].end();
+    tail_thread--;
+
+    return *tail_thread;
+}
+
+
 template <class Impl>
 uint64_t
 ROB<Impl>::readTailPC()
 {
-    assert(numInstsInROB == countInsts());
+    // tail is dereferenced unchecked; the ROB must be non-empty when called.
 
-    assert(tail != cpu->instList.end());
+    //assert(tail != instList[0].end());
 
     return (*tail)->readPC();
 }
 
+template <class Impl>
+uint64_t
+ROB<Impl>::readTailPC(unsigned tid)
+{
+    // Thread tid's list must be non-empty: end() is decremented unchecked.
+
+    InstIt tail_thread = instList[tid].end();
+    tail_thread--;
+
+    return (*tail_thread)->readPC();
+}
+
 template <class Impl>
 InstSeqNum
 ROB<Impl>::readTailSeqNum()
@@ -309,4 +659,18 @@ ROB<Impl>::readTailSeqNum()
     return (*tail)->seqNum;
 }
 
-#endif // __CPU_O3_CPU_ROB_IMPL_HH__
+template <class Impl>
+InstSeqNum
+ROB<Impl>::readTailSeqNum(unsigned tid)
+{
+    // Return the sequence number of the last entry in thread tid's list,
+    // squashed or not.  Other stages can use it to squash any instructions
+    // younger than that entry.  The list must be non-empty: end() is
+    // decremented unchecked.
+
+    InstIt tail_thread = instList[tid].end();
+    tail_thread--;
+
+    return (*tail_thread)->seqNum;
+}
+
diff --git a/cpu/o3/sat_counter.cc b/cpu/o3/sat_counter.cc
index d20fff650..a6e131483 100644
--- a/cpu/o3/sat_counter.cc
+++ b/cpu/o3/sat_counter.cc
@@ -44,7 +44,7 @@ SatCounter::SatCounter(unsigned bits, unsigned initial_val)
 {
     // Check to make sure initial value doesn't exceed the max counter value.
     if (initial_val > maxVal) {
-        panic("BP: Initial counter value exceeds max size.");
+        fatal("BP: Initial counter value exceeds max size.");
     }
 }
 
@@ -57,7 +57,7 @@ SatCounter::setBits(unsigned bits)
 void
 SatCounter::increment()
 {
-    if(counter < maxVal) {
+    if (counter < maxVal) {  // saturate: never count past maxVal
         ++counter;
     }
 }
@@ -65,7 +65,7 @@ SatCounter::increment()
 void
 SatCounter::decrement()
 {
-    if(counter > 0) {
+    if (counter > 0) {  // saturate: never count below zero
         --counter;
     }
 }
diff --git a/cpu/o3/sat_counter.hh b/cpu/o3/sat_counter.hh
index b7cfe6423..952f1f86d 100644
--- a/cpu/o3/sat_counter.hh
+++ b/cpu/o3/sat_counter.hh
@@ -26,8 +26,8 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __CPU_O3_CPU_SAT_COUNTER_HH__
-#define __CPU_O3_CPU_SAT_COUNTER_HH__
+#ifndef __CPU_O3_SAT_COUNTER_HH__
+#define __CPU_O3_SAT_COUNTER_HH__
 
 #include "sim/host.hh"
 
@@ -78,13 +78,11 @@ class SatCounter
      * Read the counter's value.
      */
     const uint8_t read() const
-    {
-        return counter;
-    }
+    { return counter; }
 
   private:
     uint8_t maxVal;
     uint8_t counter;
 };
 
-#endif // __CPU_O3_CPU_SAT_COUNTER_HH__
+#endif // __CPU_O3_SAT_COUNTER_HH__
diff --git a/cpu/o3/scoreboard.cc b/cpu/o3/scoreboard.cc
new file mode 100644
index 000000000..87b0aee94
--- /dev/null
+++ b/cpu/o3/scoreboard.cc
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "cpu/o3/scoreboard.hh"
+
+Scoreboard::Scoreboard(unsigned activeThreads,
+                       unsigned _numLogicalIntRegs,
+                       unsigned _numPhysicalIntRegs,
+                       unsigned _numLogicalFloatRegs,
+                       unsigned _numPhysicalFloatRegs,
+                       unsigned _numMiscRegs,
+                       unsigned _zeroRegIdx)
+    : numLogicalIntRegs(_numLogicalIntRegs),
+      numPhysicalIntRegs(_numPhysicalIntRegs),
+      numLogicalFloatRegs(_numLogicalFloatRegs),
+      numPhysicalFloatRegs(_numPhysicalFloatRegs),
+      numMiscRegs(_numMiscRegs),
+      zeroRegIdx(_zeroRegIdx)
+{
+    // Combined int + fp register counts.
+    numLogicalRegs = numLogicalIntRegs  + numLogicalFloatRegs;
+    numPhysicalRegs = numPhysicalIntRegs  + numPhysicalFloatRegs;
+
+    // Layout: int regs, then fp regs, then one misc block per thread.
+    regScoreBoard.resize(numPhysicalRegs + (numMiscRegs * activeThreads));
+
+    // Int: the first (logical int regs x threads) entries start ready.
+    for (int i=0; i < numLogicalIntRegs * activeThreads; i++) {
+        regScoreBoard[i] = 1;
+    }
+    // Fp: likewise, counted from the start of the fp range.
+    for (int i= numPhysicalIntRegs;
+         i < numPhysicalIntRegs + (numLogicalFloatRegs * activeThreads);
+         i++) {
+        regScoreBoard[i] = 1;
+    }
+    // Misc: every misc entry starts ready.
+    for (int i = numPhysicalRegs;
+         i < numPhysicalRegs + (numMiscRegs * activeThreads);
+         i++) {
+        regScoreBoard[i] = 1;
+    }
+}
+
+std::string
+Scoreboard::name() const
+{
+    return "cpu.scoreboard";  // fixed label, identical for every instance
+}
+
+bool
+Scoreboard::getReg(PhysRegIndex phys_reg)
+{
+    // The integer and fp zero registers always read as ready; every other
+    // register reports its tracked scoreboard bit.
+    const bool is_int_zero = (phys_reg == zeroRegIdx);
+    const bool is_fp_zero = (phys_reg == (zeroRegIdx + numPhysicalIntRegs));
+    if (is_int_zero || is_fp_zero)
+        return true;
+
+    return regScoreBoard[phys_reg];
+}
+
+void
+Scoreboard::setReg(PhysRegIndex phys_reg)
+{
+    DPRINTF(Scoreboard, "Setting reg %i as ready\n", phys_reg);
+    // No bounds check: phys_reg must be a valid index into regScoreBoard.
+    regScoreBoard[phys_reg] = 1;
+}
+
+void
+Scoreboard::unsetReg(PhysRegIndex ready_reg)
+{
+    // Don't do anything if int or fp zero reg: those always read as ready.
+    if (ready_reg == zeroRegIdx ||
+        ready_reg == (zeroRegIdx + numPhysicalIntRegs)) {
+        return;
+    }
+    regScoreBoard[ready_reg] = 0;
+}
diff --git a/cpu/o3/scoreboard.hh b/cpu/o3/scoreboard.hh
new file mode 100644
index 000000000..77f2cf157
--- /dev/null
+++ b/cpu/o3/scoreboard.hh
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __CPU_O3_SCOREBOARD_HH__
+#define __CPU_O3_SCOREBOARD_HH__
+
+#include <iostream>
+#include <utility>
+#include <vector>
+#include "arch/alpha/isa_traits.hh"
+#include "base/trace.hh"
+#include "base/traceflags.hh"
+#include "cpu/o3/comm.hh"
+
+/**
+ * Implements a simple scoreboard to track which registers are ready.
+ * This class assumes that the fp registers start, index wise, right after
+ * the integer registers. The misc. registers start, index wise, right after
+ * the fp registers.
+ * @todo: Fix up handling of the zero register in case the decoder does not
+ * automatically make insts that write the zero register into nops.
+ */
+class Scoreboard
+{
+  public:
+    /** Constructs a scoreboard.
+     *  @param activeThreads The number of active threads.
+     *  @param _numLogicalIntRegs Number of logical integer registers.
+     *  @param _numPhysicalIntRegs Number of physical integer registers.
+     *  @param _numLogicalFloatRegs Number of logical fp registers.
+     *  @param _numPhysicalFloatRegs Number of physical fp registers.
+     *  @param _numMiscRegs Number of miscellaneous registers.
+     *  @param _zeroRegIdx Index of the zero register.
+     */
+    Scoreboard(unsigned activeThreads,
+               unsigned _numLogicalIntRegs,
+               unsigned _numPhysicalIntRegs,
+               unsigned _numLogicalFloatRegs,
+               unsigned _numPhysicalFloatRegs,
+               unsigned _numMiscRegs,
+               unsigned _zeroRegIdx);
+
+    /** Destructor. */
+    ~Scoreboard() {}
+
+    /** Returns the name of the scoreboard. */
+    std::string name() const;
+
+    /** Checks if the register is ready; int and fp zero regs always are. */
+    bool getReg(PhysRegIndex ready_reg);
+
+    /** Sets the register as ready. */
+    void setReg(PhysRegIndex phys_reg);
+
+    /** Sets the register as not ready. */
+    void unsetReg(PhysRegIndex ready_reg);
+
+  private:
+    /** Ready bit for every physical register: integer, then floating point,
+     *  then the per-thread miscellaneous registers.
+     */
+    std::vector<bool> regScoreBoard;
+
+    /** Number of logical integer registers. */
+    int numLogicalIntRegs;
+
+    /** Number of physical integer registers. */
+    int numPhysicalIntRegs;
+
+    /** Number of logical floating point registers. */
+    int numLogicalFloatRegs;
+
+    /** Number of physical floating point registers. */
+    int numPhysicalFloatRegs;
+
+    /** Number of miscellaneous registers. */
+    int numMiscRegs;
+
+    /** Number of logical integer + float registers. */
+    int numLogicalRegs;
+
+    /** Number of physical integer + float registers. */
+    int numPhysicalRegs;
+
+    /** The logical index of the zero register. */
+    int zeroRegIdx;
+};
+
+#endif // __CPU_O3_SCOREBOARD_HH__
diff --git a/cpu/o3/store_set.cc b/cpu/o3/store_set.cc
index 11023f4a8..a685646f3 100644
--- a/cpu/o3/store_set.cc
+++ b/cpu/o3/store_set.cc
@@ -30,43 +30,76 @@
 #include "cpu/o3/store_set.hh"
 
 StoreSet::StoreSet(int _SSIT_size, int _LFST_size)
-    : SSIT_size(_SSIT_size), LFST_size(_LFST_size)
+    : SSITSize(_SSIT_size), LFSTSize(_LFST_size)
 {
     DPRINTF(StoreSet, "StoreSet: Creating store set object.\n");
     DPRINTF(StoreSet, "StoreSet: SSIT size: %i, LFST size: %i.\n",
-            SSIT_size, LFST_size);
+            SSITSize, LFSTSize);
 
-    SSIT = new SSID[SSIT_size];
+    SSIT.resize(SSITSize);
 
-    validSSIT.resize(SSIT_size);
+    validSSIT.resize(SSITSize);
 
-    for (int i = 0; i < SSIT_size; ++i)
+    for (int i = 0; i < SSITSize; ++i)
         validSSIT[i] = false;
 
-    LFST = new InstSeqNum[LFST_size];
+    LFST.resize(LFSTSize);
 
-    validLFST.resize(LFST_size);
+    validLFST.resize(LFSTSize);
 
-    SSCounters = new int[LFST_size];
+    for (int i = 0; i < LFSTSize; ++i) {
+        validLFST[i] = false;
+        LFST[i] = 0;
+    }
+    // NOTE(review): init() repeats this whole body; keep the two in sync.
+    indexMask = SSITSize - 1;  // presumably needs power-of-two SSITSize
+
+    offsetBits = 2;
+}
+
+StoreSet::~StoreSet()
+{   // empty body: no explicit cleanup is performed here
+}
+
+void
+StoreSet::init(int _SSIT_size, int _LFST_size)
+{
+    SSITSize = _SSIT_size;
+    LFSTSize = _LFST_size;
+    // NOTE(review): the rest repeats the constructor body; keep them in sync.
+    DPRINTF(StoreSet, "StoreSet: Creating store set object.\n");
+    DPRINTF(StoreSet, "StoreSet: SSIT size: %i, LFST size: %i.\n",
+            SSITSize, LFSTSize);
+
+    SSIT.resize(SSITSize);
+
+    validSSIT.resize(SSITSize);
+
+    for (int i = 0; i < SSITSize; ++i)
+        validSSIT[i] = false;
+
+    LFST.resize(LFSTSize);
+
+    validLFST.resize(LFSTSize);
 
-    for (int i = 0; i < LFST_size; ++i)
-    {
+    for (int i = 0; i < LFSTSize; ++i) {
         validLFST[i] = false;
-        SSCounters[i] = 0;
+        LFST[i] = 0;
     }
 
-    index_mask = SSIT_size - 1;
+    indexMask = SSITSize - 1;  // presumably needs power-of-two SSITSize
 
-    offset_bits = 2;
+    offsetBits = 2;
 }
 
+
 void
 StoreSet::violation(Addr store_PC, Addr load_PC)
 {
     int load_index = calcIndex(load_PC);
     int store_index = calcIndex(store_PC);
 
-    assert(load_index < SSIT_size && store_index < SSIT_size);
+    assert(load_index < SSITSize && store_index < SSITSize);
 
     bool valid_load_SSID = validSSIT[load_index];
     bool valid_store_SSID = validSSIT[store_index];
@@ -83,10 +116,7 @@ StoreSet::violation(Addr store_PC, Addr load_PC)
 
         SSIT[store_index] = new_set;
 
-        assert(new_set < LFST_size);
-
-        SSCounters[new_set]++;
-
+        assert(new_set < LFSTSize);
 
         DPRINTF(StoreSet, "StoreSet: Neither load nor store had a valid "
                 "storeset, creating a new one: %i for load %#x, store %#x\n",
@@ -98,9 +128,7 @@ StoreSet::violation(Addr store_PC, Addr load_PC)
 
         SSIT[store_index] = load_SSID;
 
-        assert(load_SSID < LFST_size);
-
-        SSCounters[load_SSID]++;
+        assert(load_SSID < LFSTSize);
 
         DPRINTF(StoreSet, "StoreSet: Load had a valid store set.  Adding "
                 "store to that set: %i for load %#x, store %#x\n",
@@ -112,9 +140,6 @@ StoreSet::violation(Addr store_PC, Addr load_PC)
 
         SSIT[load_index] = store_SSID;
 
-        // Because we are having a load point to an already existing set,
-        // the size of the store set is not incremented.
-
         DPRINTF(StoreSet, "StoreSet: Store had a valid store set: %i for "
                 "load %#x, store %#x\n",
                 store_SSID, load_PC, store_PC);
@@ -122,29 +147,19 @@ StoreSet::violation(Addr store_PC, Addr load_PC)
         SSID load_SSID = SSIT[load_index];
         SSID store_SSID = SSIT[store_index];
 
-        assert(load_SSID < LFST_size && store_SSID < LFST_size);
+        assert(load_SSID < LFSTSize && store_SSID < LFSTSize);
 
-        int load_SS_size = SSCounters[load_SSID];
-        int store_SS_size = SSCounters[store_SSID];
-
-        // If the load has the bigger store set, then assign the store
-        // to the same store set as the load.  Otherwise vice-versa.
-        if (load_SS_size > store_SS_size) {
+        // The store set with the lower number wins
+        if (store_SSID > load_SSID) {
             SSIT[store_index] = load_SSID;
 
-            SSCounters[load_SSID]++;
-            SSCounters[store_SSID]--;
-
-            DPRINTF(StoreSet, "StoreSet: Load had bigger store set: %i; "
+            DPRINTF(StoreSet, "StoreSet: Load had smaller store set: %i; "
                     "for load %#x, store %#x\n",
                     load_SSID, load_PC, store_PC);
         } else {
             SSIT[load_index] = store_SSID;
 
-            SSCounters[store_SSID]++;
-            SSCounters[load_SSID]--;
-
-            DPRINTF(StoreSet, "StoreSet: Store had bigger store set: %i; "
+            DPRINTF(StoreSet, "StoreSet: Store had smaller store set: %i; "
                     "for load %#x, store %#x\n",
                     store_SSID, load_PC, store_PC);
         }
@@ -159,13 +174,14 @@ StoreSet::insertLoad(Addr load_PC, InstSeqNum load_seq_num)
 }
 
 void
-StoreSet::insertStore(Addr store_PC, InstSeqNum store_seq_num)
+StoreSet::insertStore(Addr store_PC, InstSeqNum store_seq_num,
+                      unsigned tid)
 {
     int index = calcIndex(store_PC);
 
     int store_SSID;
 
-    assert(index < SSIT_size);
+    assert(index < SSITSize);
 
     if (!validSSIT[index]) {
         // Do nothing if there's no valid entry.
@@ -173,13 +189,15 @@ StoreSet::insertStore(Addr store_PC, InstSeqNum store_seq_num)
     } else {
         store_SSID = SSIT[index];
 
-        assert(store_SSID < LFST_size);
+        assert(store_SSID < LFSTSize);
 
         // Update the last store that was fetched with the current one.
         LFST[store_SSID] = store_seq_num;
 
         validLFST[store_SSID] = 1;
 
+        storeList[store_seq_num] = store_SSID;
+
         DPRINTF(StoreSet, "Store %#x updated the LFST, SSID: %i\n",
                 store_PC, store_SSID);
     }
@@ -192,7 +210,7 @@ StoreSet::checkInst(Addr PC)
 
     int inst_SSID;
 
-    assert(index < SSIT_size);
+    assert(index < SSITSize);
 
     if (!validSSIT[index]) {
         DPRINTF(StoreSet, "Inst %#x with index %i had no SSID\n",
@@ -203,7 +221,7 @@ StoreSet::checkInst(Addr PC)
     } else {
         inst_SSID = SSIT[index];
 
-        assert(inst_SSID < LFST_size);
+        assert(inst_SSID < LFSTSize);
 
         if (!validLFST[inst_SSID]) {
 
@@ -232,7 +250,13 @@ StoreSet::issued(Addr issued_PC, InstSeqNum issued_seq_num, bool is_store)
 
     int store_SSID;
 
-    assert(index < SSIT_size);
+    assert(index < SSITSize);
+
+    SeqNumMapIt store_list_it = storeList.find(issued_seq_num);
+
+    if (store_list_it != storeList.end()) {
+        storeList.erase(store_list_it);
+    }
 
     // Make sure the SSIT still has a valid entry for the issued store.
     if (!validSSIT[index]) {
@@ -241,7 +265,7 @@ StoreSet::issued(Addr issued_PC, InstSeqNum issued_seq_num, bool is_store)
 
     store_SSID = SSIT[index];
 
-    assert(store_SSID < LFST_size);
+    assert(store_SSID < LFSTSize);
 
     // If the last fetched store in the store set refers to the store that
     // was just issued, then invalidate the entry.
@@ -252,18 +276,36 @@ StoreSet::issued(Addr issued_PC, InstSeqNum issued_seq_num, bool is_store)
 }
 
 void
-StoreSet::squash(InstSeqNum squashed_num)
+StoreSet::squash(InstSeqNum squashed_num, unsigned tid)
 {
     // Not really sure how to do this well.
     // Generally this is small enough that it should be okay; short circuit
     // evaluation should take care of invalid entries.
+    // Maybe keep a list of valid LFST's?  Really ugly either way...
 
     DPRINTF(StoreSet, "StoreSet: Squashing until inum %i\n",
             squashed_num);
 
-    for (int i = 0; i < LFST_size; ++i) {
-        if (validLFST[i] && LFST[i] < squashed_num) {
-            validLFST[i] = false;
+    int idx;
+    SeqNumMapIt store_list_it = storeList.begin();
+
+    //@todo:Fix to only delete from correct thread
+    // storeList is keyed youngest-first (see ltseqnum), so every squashed
+    // store sits at the front; stop at the first one that survives.
+    while (store_list_it != storeList.end()) {
+        idx = (*store_list_it).second;
+
+        if ((*store_list_it).first <= squashed_num) {
+            break;
+        }
+
+        // Always erase, so the iterator advances on every pass.
+        storeList.erase(store_list_it++);
+
+        // Invalidate the LFST entry only if it names a squashed store.
+        if (validLFST[idx] && LFST[idx] > squashed_num) {
+            DPRINTF(StoreSet, "Squashed [sn:%lli]\n", LFST[idx]);
+            validLFST[idx] = false;
         }
     }
 }
@@ -271,12 +313,13 @@ StoreSet::squash(InstSeqNum squashed_num)
 void
 StoreSet::clear()
 {
-    for (int i = 0; i < SSIT_size; ++i) {
+    for (int i = 0; i < SSITSize; ++i) {
         validSSIT[i] = false;
     }
 
-    for (int i = 0; i < LFST_size; ++i) {
+    for (int i = 0; i < LFSTSize; ++i) {
         validLFST[i] = false;
     }
-}
 
+    storeList.clear();
+}
diff --git a/cpu/o3/store_set.hh b/cpu/o3/store_set.hh
index 5a885d838..7189db3ab 100644
--- a/cpu/o3/store_set.hh
+++ b/cpu/o3/store_set.hh
@@ -26,61 +26,80 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __CPU_O3_CPU_STORE_SET_HH__
-#define __CPU_O3_CPU_STORE_SET_HH__
+#ifndef __CPU_O3_STORE_SET_HH__
+#define __CPU_O3_STORE_SET_HH__
 
+#include <list>
+#include <map>
+#include <utility>
 #include <vector>
 
 #include "arch/isa_traits.hh"
 #include "cpu/inst_seq.hh"
 
+// Orders sequence numbers in DESCENDING order, despite the "lt" name, so
+// a map keyed with it is iterated from the largest sequence number down.
+struct ltseqnum {
+    bool operator()(const InstSeqNum &lhs, const InstSeqNum &rhs) const
+    { return lhs > rhs; }
+};
+
 class StoreSet
 {
   public:
     typedef unsigned SSID;
 
   public:
+    /** Builds an unsized store set; every scalar field starts at zero. */
+    StoreSet() : SSITSize(0), LFSTSize(0), indexMask(0), offsetBits(0) { }
     StoreSet(int SSIT_size, int LFST_size);
 
+    ~StoreSet();
+
+    void init(int SSIT_size, int LFST_size);
+
     void violation(Addr store_PC, Addr load_PC);
 
     void insertLoad(Addr load_PC, InstSeqNum load_seq_num);
 
-    void insertStore(Addr store_PC, InstSeqNum store_seq_num);
+    void insertStore(Addr store_PC, InstSeqNum store_seq_num,
+                     unsigned tid);
 
     InstSeqNum checkInst(Addr PC);
 
     void issued(Addr issued_PC, InstSeqNum issued_seq_num, bool is_store);
 
-    void squash(InstSeqNum squashed_num);
+    void squash(InstSeqNum squashed_num, unsigned tid);
 
     void clear();
 
   private:
     inline int calcIndex(Addr PC)
-    { return (PC >> offset_bits) & index_mask; }
+    { return (PC >> offsetBits) & indexMask; }
 
     inline SSID calcSSID(Addr PC)
-    { return ((PC ^ (PC >> 10)) % LFST_size); }
+    { return ((PC ^ (PC >> 10)) % LFSTSize); }
 
-    SSID *SSIT;
+    std::vector<SSID> SSIT;
 
     std::vector<bool> validSSIT;
 
-    InstSeqNum *LFST;
+    std::vector<InstSeqNum> LFST;
 
     std::vector<bool> validLFST;
 
-    int *SSCounters;
+    std::map<InstSeqNum, int, ltseqnum> storeList;
+
+    typedef std::map<InstSeqNum, int, ltseqnum>::iterator SeqNumMapIt;
 
-    int SSIT_size;
+    int SSITSize;
 
-    int LFST_size;
+    int LFSTSize;
 
-    int index_mask;
+    int indexMask;
 
     // HACK: Hardcoded for now.
-    int offset_bits;
+    int offsetBits;
 };
 
-#endif // __CPU_O3_CPU_STORE_SET_HH__
+#endif // __CPU_O3_STORE_SET_HH__
diff --git a/cpu/o3/thread_state.hh b/cpu/o3/thread_state.hh
new file mode 100644
index 000000000..846f44176
--- /dev/null
+++ b/cpu/o3/thread_state.hh
@@ -0,0 +1,143 @@
+
+#ifndef __CPU_O3_THREAD_STATE_HH__
+#define __CPU_O3_THREAD_STATE_HH__
+
+#include "arch/faults.hh"
+#include "arch/isa_traits.hh"
+#include "cpu/exec_context.hh"
+#include "cpu/thread_state.hh"
+
+class Event;
+class Process;
+
+#if FULL_SYSTEM
+class EndQuiesceEvent;
+class FunctionProfile;
+class ProfileNode;
+#else
+class Process;
+class FunctionalMemory;
+#endif
+
+// In the new CPU case this may be quite small...It depends on what I define
+// ThreadState to be.  Currently it's only the state that exists within
+// ExecContext basically.  Leaves the interface and manipulation up to the
+// CPU.  Not sure this is useful/flexible...probably can be if I can avoid
+// including state here that parts of the pipeline can't modify directly,
+// or at least don't let them.  The only problem is for state that's needed
+// per thread, per structure.  I.e. rename table, memreqs.
+// On the other hand, it might be nice to not have to pay the extra pointer
+// lookup to get frequently used state such as a memreq (that isn't used much
+// elsewhere)...
+
+// Maybe this ozone thread state should only really have committed state?
+// I need to think about why I'm using this and what it's useful for.  Clearly
+// has benefits for SMT; basically serves same use as CPUExecContext.
+// Makes the ExecContext proxy easier.  Gives organization/central access point
+// to state of a thread that can be accessed normally (i.e. not in-flight
+// stuff within a OoO processor).  Does this need an XC proxy within it?
+template <class Impl>
+struct O3ThreadState : public ThreadState {
+    typedef ExecContext::Status Status;
+    typedef typename Impl::FullCPU FullCPU;
+
+    Status _status;
+
+    // Current instruction?
+    TheISA::MachInst inst;
+  private:
+    FullCPU *cpu;
+  public:
+
+    bool inSyscall;
+
+    bool trapPending;
+
+#if FULL_SYSTEM
+    O3ThreadState(FullCPU *_cpu, int _thread_num, FunctionalMemory *_mem)
+        : ThreadState(-1, _thread_num, _mem),
+          cpu(_cpu), inSyscall(0), trapPending(0) // cpu: used by translate*Req
+    { }
+#else
+    O3ThreadState(FullCPU *_cpu, int _thread_num, Process *_process, int _asid)
+        : ThreadState(-1, _thread_num, NULL, _process, _asid),
+          cpu(_cpu), inSyscall(0), trapPending(0)
+    { }
+
+    O3ThreadState(FullCPU *_cpu, int _thread_num, FunctionalMemory *_mem,
+                  int _asid)
+        : ThreadState(-1, _thread_num, _mem, NULL, _asid),
+          cpu(_cpu), inSyscall(0), trapPending(0)
+    { }
+#endif
+
+    ExecContext *xcProxy;
+
+    ExecContext *getXCProxy() { return xcProxy; }
+
+    Status status() const { return _status; }
+
+    void setStatus(Status new_status) { _status = new_status; }
+
+#if !FULL_SYSTEM
+
+    Fault dummyTranslation(MemReqPtr &req)
+    {
+#if 0
+        assert((req->vaddr >> 48 & 0xffff) == 0);
+#endif
+        // Identity map, except the top 16 address bits carry the asid.
+        const int asid_shift = sizeof(Addr) * 8 - 16;
+        req->paddr = req->vaddr & ~((Addr)0xffff << asid_shift);
+        req->paddr |= (Addr)req->asid << asid_shift;
+        return NoFault;
+    }
+    Fault translateInstReq(MemReqPtr &req)
+    {
+        return dummyTranslation(req);
+    }
+    Fault translateDataReadReq(MemReqPtr &req)
+    {
+        return dummyTranslation(req);
+    }
+    Fault translateDataWriteReq(MemReqPtr &req)
+    {
+        return dummyTranslation(req);
+    }
+
+    bool validInstAddr(Addr addr)
+    { return process->validInstAddr(addr); }
+
+    bool validDataAddr(Addr addr)
+    { return process->validDataAddr(addr); }
+#else
+    Fault translateInstReq(MemReqPtr &req)
+    {
+        return cpu->itb->translate(req);
+    }
+
+    Fault translateDataReadReq(MemReqPtr &req)
+    {
+        return cpu->dtb->translate(req, false);
+    }
+
+    Fault translateDataWriteReq(MemReqPtr &req)
+    {
+        return cpu->dtb->translate(req, true);
+    }
+#endif
+
+    bool misspeculating() { return false; }
+
+    void setInst(TheISA::MachInst _inst) { inst = _inst; }
+
+    Counter readFuncExeInst() { return funcExeInst; }
+
+    void setFuncExeInst(Counter new_val) { funcExeInst = new_val; }
+
+#if !FULL_SYSTEM
+    void syscall() { process->syscall(xcProxy); }
+#endif
+};
+
+#endif // __CPU_O3_THREAD_STATE_HH__
diff --git a/cpu/o3/tournament_pred.cc b/cpu/o3/tournament_pred.cc
index 3fb580510..89da7b9f5 100644
--- a/cpu/o3/tournament_pred.cc
+++ b/cpu/o3/tournament_pred.cc
@@ -28,37 +28,37 @@
 
 #include "cpu/o3/tournament_pred.hh"
 
-TournamentBP::TournamentBP(unsigned _local_predictor_size,
-                           unsigned _local_ctr_bits,
-                           unsigned _local_history_table_size,
-                           unsigned _local_history_bits,
-                           unsigned _global_predictor_size,
-                           unsigned _global_ctr_bits,
-                           unsigned _global_history_bits,
-                           unsigned _choice_predictor_size,
-                           unsigned _choice_ctr_bits,
+TournamentBP::TournamentBP(unsigned _localPredictorSize,
+                           unsigned _localCtrBits,
+                           unsigned _localHistoryTableSize,
+                           unsigned _localHistoryBits,
+                           unsigned _globalPredictorSize,
+                           unsigned _globalCtrBits,
+                           unsigned _globalHistoryBits,
+                           unsigned _choicePredictorSize,
+                           unsigned _choiceCtrBits,
                            unsigned _instShiftAmt)
-    : localPredictorSize(_local_predictor_size),
-      localCtrBits(_local_ctr_bits),
-      localHistoryTableSize(_local_history_table_size),
-      localHistoryBits(_local_history_bits),
-      globalPredictorSize(_global_predictor_size),
-      globalCtrBits(_global_ctr_bits),
-      globalHistoryBits(_global_history_bits),
-      choicePredictorSize(_global_predictor_size),
-      choiceCtrBits(_choice_ctr_bits),
+    : localPredictorSize(_localPredictorSize),
+      localCtrBits(_localCtrBits),
+      localHistoryTableSize(_localHistoryTableSize),
+      localHistoryBits(_localHistoryBits),
+      globalPredictorSize(_globalPredictorSize),
+      globalCtrBits(_globalCtrBits),
+      globalHistoryBits(_globalHistoryBits),
+      choicePredictorSize(_globalPredictorSize),
+      choiceCtrBits(_choiceCtrBits),
       instShiftAmt(_instShiftAmt)
 {
     //Should do checks here to make sure sizes are correct (powers of 2)
 
     //Setup the array of counters for the local predictor
-    localCtrs = new SatCounter[localPredictorSize];
+    localCtrs.resize(localPredictorSize);
 
     for (int i = 0; i < localPredictorSize; ++i)
         localCtrs[i].setBits(localCtrBits);
 
     //Setup the history table for the local table
-    localHistoryTable = new unsigned[localHistoryTableSize];
+    localHistoryTable.resize(localHistoryTableSize);
 
     for (int i = 0; i < localHistoryTableSize; ++i)
         localHistoryTable[i] = 0;
@@ -67,7 +67,7 @@ TournamentBP::TournamentBP(unsigned _local_predictor_size,
     localHistoryMask = (1 << localHistoryBits) - 1;
 
     //Setup the array of counters for the global predictor
-    globalCtrs = new SatCounter[globalPredictorSize];
+    globalCtrs.resize(globalPredictorSize);
 
     for (int i = 0; i < globalPredictorSize; ++i)
         globalCtrs[i].setBits(globalCtrBits);
@@ -78,7 +78,7 @@ TournamentBP::TournamentBP(unsigned _local_predictor_size,
     globalHistoryMask = (1 << globalHistoryBits) - 1;
 
     //Setup the array of counters for the choice predictor
-    choiceCtrs = new SatCounter[choicePredictorSize];
+    choiceCtrs.resize(choicePredictorSize);
 
     for (int i = 0; i < choicePredictorSize; ++i)
         choiceCtrs[i].setBits(choiceCtrBits);
@@ -240,8 +240,7 @@ TournamentBP::update(Addr &branch_addr, unsigned correct_gh, bool taken)
         globalHistory = globalHistory & globalHistoryMask;
 
         localHistoryTable[local_history_idx] |= 1;
-    }
-    else {
+    } else {
         assert(globalHistory < globalPredictorSize &&
                local_predictor_idx < localPredictorSize);
 
diff --git a/cpu/o3/tournament_pred.hh b/cpu/o3/tournament_pred.hh
index cb93c2f67..7b600aa53 100644
--- a/cpu/o3/tournament_pred.hh
+++ b/cpu/o3/tournament_pred.hh
@@ -26,12 +26,13 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __CPU_O3_CPU_TOURNAMENT_PRED_HH__
-#define __CPU_O3_CPU_TOURNAMENT_PRED_HH__
+#ifndef __CPU_O3_TOURNAMENT_PRED_HH__
+#define __CPU_O3_TOURNAMENT_PRED_HH__
 
 // For Addr type.
 #include "arch/isa_traits.hh"
 #include "cpu/o3/sat_counter.hh"
+#include <vector>
 
 class TournamentBP
 {
@@ -39,15 +40,15 @@ class TournamentBP
     /**
      * Default branch predictor constructor.
      */
-    TournamentBP(unsigned local_predictor_size,
-                 unsigned local_ctr_bits,
-                 unsigned local_history_table_size,
-                 unsigned local_history_bits,
-                 unsigned global_predictor_size,
-                 unsigned global_history_bits,
-                 unsigned global_ctr_bits,
-                 unsigned choice_predictor_size,
-                 unsigned choice_ctr_bits,
+    TournamentBP(unsigned localPredictorSize,
+                 unsigned localCtrBits,
+                 unsigned localHistoryTableSize,
+                 unsigned localHistoryBits,
+                 unsigned globalPredictorSize,
+                 unsigned globalHistoryBits,
+                 unsigned globalCtrBits,
+                 unsigned choicePredictorSize,
+                 unsigned choiceCtrBits,
                  unsigned instShiftAmt);
 
     /**
@@ -78,7 +79,7 @@ class TournamentBP
     inline void updateHistoriesNotTaken(unsigned local_history_idx);
 
     /** Local counters. */
-    SatCounter *localCtrs;
+    std::vector<SatCounter> localCtrs;
 
     /** Size of the local predictor. */
     unsigned localPredictorSize;
@@ -87,7 +88,7 @@ class TournamentBP
     unsigned localCtrBits;
 
     /** Array of local history table entries. */
-    unsigned *localHistoryTable;
+    std::vector<unsigned> localHistoryTable;
 
     /** Size of the local history table. */
     unsigned localHistoryTableSize;
@@ -102,7 +103,7 @@ class TournamentBP
 
 
     /** Array of counters that make up the global predictor. */
-    SatCounter *globalCtrs;
+    std::vector<SatCounter> globalCtrs;
 
     /** Size of the global predictor. */
     unsigned globalPredictorSize;
@@ -121,7 +122,7 @@ class TournamentBP
 
 
     /** Array of counters that make up the choice predictor. */
-    SatCounter *choiceCtrs;
+    std::vector<SatCounter> choiceCtrs;
 
     /** Size of the choice predictor (identical to the global predictor). */
     unsigned choicePredictorSize;
@@ -140,4 +141,4 @@ class TournamentBP
     unsigned threshold;
 };
 
-#endif // __CPU_O3_CPU_TOURNAMENT_PRED_HH__
+#endif // __CPU_O3_TOURNAMENT_PRED_HH__
-- 
cgit v1.2.3


From b14bf0321947419603610f07ed4f14b51a2192a3 Mon Sep 17 00:00:00 2001
From: Kevin Lim <ktlim@umich.edu>
Date: Mon, 24 Apr 2006 16:59:50 -0400
Subject: Fixes for ll/sc for the O3 model.

cpu/o3/alpha_cpu.hh:
    Store conditionals should not write their data to memory if they failed.
cpu/o3/lsq_unit.hh:
    Setup request parameters when they're needed.

--HG--
extra : convert_revision : d75cd7deda03584b7e25cb567e4d79032cac7118
---
 cpu/o3/alpha_cpu.hh | 3 ++-
 cpu/o3/lsq_unit.hh  | 6 +++---
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'cpu/o3')

diff --git a/cpu/o3/alpha_cpu.hh b/cpu/o3/alpha_cpu.hh
index 68e149e77..dfdf092ed 100644
--- a/cpu/o3/alpha_cpu.hh
+++ b/cpu/o3/alpha_cpu.hh
@@ -425,9 +425,10 @@ class AlphaFullCPU : public FullO3CPU<Impl>
                 req->result = 2;
             } else {
                 if (this->lockFlag/* && this->lockAddr == req->paddr*/) {
-                    req->result=1;
+                    req->result = 1;
                 } else {
                     req->result = 0;
+                    return NoFault;
                 }
             }
         }
diff --git a/cpu/o3/lsq_unit.hh b/cpu/o3/lsq_unit.hh
index 73c485ce9..ba8b1d2e2 100644
--- a/cpu/o3/lsq_unit.hh
+++ b/cpu/o3/lsq_unit.hh
@@ -566,6 +566,9 @@ LSQUnit<Impl>::read(MemReqPtr &req, T &data, int load_idx)
     DPRINTF(LSQUnit, "Doing functional access for inst PC %#x\n",
             loadQueue[load_idx]->readPC());
     assert(!req->data);
+    req->cmd = Read;
+    req->completionEvent = NULL;
+    req->time = curTick;
     req->data = new uint8_t[64];
     Fault fault = cpu->read(req, data);
     memcpy(req->data, &data, sizeof(T));
@@ -587,9 +590,6 @@ LSQUnit<Impl>::read(MemReqPtr &req, T &data, int load_idx)
         }
         DPRINTF(LSQUnit, "Doing timing access for inst PC %#x\n",
                 loadQueue[load_idx]->readPC());
-        req->cmd = Read;
-        req->completionEvent = NULL;
-        req->time = curTick;
 
         assert(!req->completionEvent);
         req->completionEvent =
-- 
cgit v1.2.3


From 676afbe2c729575f3468d4ae0aad31c5ac382ab8 Mon Sep 17 00:00:00 2001
From: Kevin Lim <ktlim@umich.edu>
Date: Mon, 24 Apr 2006 17:06:00 -0400
Subject: New stats added to O3 model.

--HG--
extra : convert_revision : 7abb491e89e3e1a331cd19aa05ddce5184abf9e0
---
 cpu/o3/commit.hh          |  19 ++++-
 cpu/o3/commit_impl.hh     | 115 +++++++++++++++++++++++++++++-
 cpu/o3/fetch.hh           |   3 +
 cpu/o3/fetch_impl.hh      |  29 ++++++--
 cpu/o3/iew.hh             |  33 ++++++++-
 cpu/o3/iew_impl.hh        | 175 +++++++++++++++++++++++++++++++++++++++++++---
 cpu/o3/inst_queue.hh      |  17 ++++-
 cpu/o3/inst_queue_impl.hh | 128 +++++++++++++++++++++++++++++++--
 cpu/o3/rename.hh          |  12 ++--
 cpu/o3/rename_impl.hh     | 100 ++++++++++++++------------
 10 files changed, 555 insertions(+), 76 deletions(-)

(limited to 'cpu/o3')

diff --git a/cpu/o3/commit.hh b/cpu/o3/commit.hh
index 93b74ebb0..f374b8fb7 100644
--- a/cpu/o3/commit.hh
+++ b/cpu/o3/commit.hh
@@ -369,6 +369,8 @@ class DefaultCommit
     /** Rename map interface. */
     RenameMap *renameMap[Impl::MaxThreads];
 
+    void updateComInstStats(DynInstPtr &inst);
+
     /** Stat for the total number of committed instructions. */
     Stats::Scalar<> commitCommittedInsts;
     /** Stat for the total number of squashed instructions discarded by commit.
@@ -383,15 +385,26 @@ class DefaultCommit
      */
     Stats::Scalar<> commitNonSpecStalls;
     /** Stat for the total number of committed branches. */
-    Stats::Scalar<> commitCommittedBranches;
+//    Stats::Scalar<> commitCommittedBranches;
     /** Stat for the total number of committed loads. */
-    Stats::Scalar<> commitCommittedLoads;
+//    Stats::Scalar<> commitCommittedLoads;
     /** Stat for the total number of committed memory references. */
-    Stats::Scalar<> commitCommittedMemRefs;
+//    Stats::Scalar<> commitCommittedMemRefs;
     /** Stat for the total number of branch mispredicts that caused a squash. */
     Stats::Scalar<> branchMispredicts;
     /** Distribution of the number of committed instructions each cycle. */
     Stats::Distribution<> numCommittedDist;
+
+    // total number of instructions committed
+    Stats::Vector<> stat_com_inst;
+    Stats::Vector<> stat_com_swp;
+    Stats::Vector<> stat_com_refs;
+    Stats::Vector<> stat_com_loads;
+    Stats::Vector<> stat_com_membars;
+    Stats::Vector<> stat_com_branches;
+
+    Stats::Scalar<> commit_eligible_samples;
+    Stats::Vector<> commit_eligible;
 };
 
 #endif // __CPU_O3_COMMIT_HH__
diff --git a/cpu/o3/commit_impl.hh b/cpu/o3/commit_impl.hh
index ef1ba9282..157e688c7 100644
--- a/cpu/o3/commit_impl.hh
+++ b/cpu/o3/commit_impl.hh
@@ -133,6 +133,7 @@ template <class Impl>
 void
 DefaultCommit<Impl>::regStats()
 {
+    using namespace Stats;
     commitCommittedInsts
         .name(name() + ".commitCommittedInsts")
         .desc("The number of committed instructions")
@@ -150,6 +151,7 @@ DefaultCommit<Impl>::regStats()
         .desc("The number of times commit has been forced to stall to "
               "communicate backwards")
         .prereq(commitNonSpecStalls);
+/*
     commitCommittedBranches
         .name(name() + ".commitCommittedBranches")
         .desc("The number of committed branches")
@@ -162,6 +164,7 @@ DefaultCommit<Impl>::regStats()
         .name(name() + ".commitCommittedMemRefs")
         .desc("The number of committed memory references")
         .prereq(commitCommittedMemRefs);
+*/
     branchMispredicts
         .name(name() + ".branchMispredicts")
         .desc("The number of times a branch was mispredicted")
@@ -172,6 +175,73 @@ DefaultCommit<Impl>::regStats()
         .desc("Number of insts commited each cycle")
         .flags(Stats::pdf)
         ;
+
+    stat_com_inst
+        .init(cpu->number_of_threads)
+        .name(name() + ".COM:count")
+        .desc("Number of instructions committed")
+        .flags(total)
+        ;
+
+    stat_com_swp
+        .init(cpu->number_of_threads)
+        .name(name() + ".COM:swp_count")
+        .desc("Number of s/w prefetches committed")
+        .flags(total)
+        ;
+
+    stat_com_refs
+        .init(cpu->number_of_threads)
+        .name(name() +  ".COM:refs")
+        .desc("Number of memory references committed")
+        .flags(total)
+        ;
+
+    stat_com_loads
+        .init(cpu->number_of_threads)
+        .name(name() +  ".COM:loads")
+        .desc("Number of loads committed")
+        .flags(total)
+        ;
+
+    stat_com_membars
+        .init(cpu->number_of_threads)
+        .name(name() +  ".COM:membars")
+        .desc("Number of memory barriers committed")
+        .flags(total)
+        ;
+
+    stat_com_branches
+        .init(cpu->number_of_threads)
+        .name(name() + ".COM:branches")
+        .desc("Number of branches committed")
+        .flags(total)
+        ;
+
+    //
+    //  Commit-Eligible instructions...
+    //
+    //  -> The number of instructions eligible to commit in those
+    //  cycles where we reached our commit BW limit (less the number
+    //  actually committed)
+    //
+    //  -> The average value is computed over ALL CYCLES... not just
+    //  the BW limited cycles
+    //
+    //  -> The standard deviation is computed only over cycles where
+    //  we reached the BW limit
+    //
+    commit_eligible
+        .init(cpu->number_of_threads)
+        .name(name() + ".COM:bw_limited")
+        .desc("number of insts not committed due to BW limits")
+        .flags(total)
+        ;
+
+    commit_eligible_samples
+        .name(name() + ".COM:bw_lim_events")
+        .desc("number cycles where commit BW limit reached")
+        ;
 }
 
 template <class Impl>
@@ -1060,9 +1130,7 @@ head_inst->isWriteBarrier())*/
         return false;
     }
 
-    if (head_inst->isControl()) {
-        ++commitCommittedBranches;
-    }
+    updateComInstStats(head_inst);
 
     // Now that the instruction is going to be committed, finalize its
     // trace data.
@@ -1186,6 +1254,47 @@ DefaultCommit<Impl>::robDoneSquashing()
     return true;
 }
 
+/** Updates the per-thread commit stats for one committed instruction. */
+template <class Impl>
+void
+DefaultCommit<Impl>::updateComInstStats(DynInstPtr &inst)
+{
+    // Each stat vector below is indexed by the instruction's thread number.
+    const unsigned tid = inst->threadNumber;
+
+#ifdef TARGET_ALPHA
+    // With TARGET_ALPHA defined, data prefetches are tallied as software
+    // prefetches and left out of the committed-instruction count.
+    if (!inst->isDataPrefetch()) {
+        stat_com_inst[tid]++;
+    } else {
+        stat_com_swp[tid]++;
+    }
+#else
+    // Otherwise every committed instruction is counted.
+    stat_com_inst[tid]++;
+#endif
+
+    // Control instructions.
+    if (inst->isControl()) {
+        stat_com_branches[tid]++;
+    }
+
+    // Memory references; loads get an additional counter of their own.
+    if (inst->isMemRef()) {
+        stat_com_refs[tid]++;
+
+        if (inst->isLoad()) {
+            stat_com_loads[tid]++;
+        }
+    }
+
+    // Memory barriers.
+    if (inst->isMemBarrier()) {
+        stat_com_membars[tid]++;
+    }
+}
+
 ////////////////////////////////////////
 //                                    //
 //   SMT COMMIT POLICY MAITAINED HERE //
diff --git a/cpu/o3/fetch.hh b/cpu/o3/fetch.hh
index f0f3f2745..f0b15cb86 100644
--- a/cpu/o3/fetch.hh
+++ b/cpu/o3/fetch.hh
@@ -370,6 +370,7 @@ class DefaultFetch
     Stats::Scalar<> icacheStallCycles;
     /** Stat for total number of fetched instructions. */
     Stats::Scalar<> fetchedInsts;
+    Stats::Scalar<> fetchedBranches;
     /** Stat for total number of predicted branches. */
     Stats::Scalar<> predictedBranches;
     /** Stat for total number of cycles spent fetching. */
@@ -383,6 +384,8 @@ class DefaultFetch
     Stats::Scalar<> fetchBlockedCycles;
     /** Stat for total number of fetched cache lines. */
     Stats::Scalar<> fetchedCacheLines;
+
+    Stats::Scalar<> fetchIcacheSquashes;
     /** Distribution of number of instructions fetched each cycle. */
     Stats::Distribution<> fetchNisnDist;
     Stats::Formula idleRate;
diff --git a/cpu/o3/fetch_impl.hh b/cpu/o3/fetch_impl.hh
index 7abc5733f..563a767df 100644
--- a/cpu/o3/fetch_impl.hh
+++ b/cpu/o3/fetch_impl.hh
@@ -178,6 +178,11 @@ DefaultFetch<Impl>::regStats()
         .desc("Number of instructions fetch has processed")
         .prereq(fetchedInsts);
 
+    fetchedBranches
+        .name(name() + ".fetchedBranches")
+        .desc("Number of branches that fetch encountered")
+        .prereq(fetchedBranches);
+
     predictedBranches
         .name(name() + ".predictedBranches")
         .desc("Number of branches that fetch has predicted taken")
@@ -209,6 +214,11 @@ DefaultFetch<Impl>::regStats()
         .desc("Number of cache lines fetched")
         .prereq(fetchedCacheLines);
 
+    fetchIcacheSquashes
+        .name(name() + ".fetchIcacheSquashes")
+        .desc("Number of outstanding Icache misses that were squashed")
+        .prereq(fetchIcacheSquashes);
+
     fetchNisnDist
         .init(/* base value */ 0,
               /* last value */ fetchWidth,
@@ -322,8 +332,10 @@ DefaultFetch<Impl>::processCacheCompletion(MemReqPtr &req)
     // Can keep track of how many cache accesses go unused due to
     // misspeculation here.
     if (fetchStatus[tid] != IcacheMissStall ||
-        req != memReq[tid])
+        req != memReq[tid]) {
+        ++fetchIcacheSquashes;
         return;
+    }
 
     // Wake up the CPU (if it went to sleep and was waiting on this completion
     // event).
@@ -400,6 +412,8 @@ DefaultFetch<Impl>::lookupAndUpdateNextPC(DynInstPtr &inst, Addr &next_PC)
 
     predict_taken = branchPred.predict(inst, next_PC, inst->threadNumber);
 
+    ++fetchedBranches;
+
     if (predict_taken) {
         ++predictedBranches;
     }
@@ -457,6 +471,7 @@ DefaultFetch<Impl>::fetchCacheLine(Addr fetch_PC, Fault &ret_fault, unsigned tid
     // If translation was successful, attempt to read the first
     // instruction.
     if (fault == NoFault) {
+#if FULL_SYSTEM
         if (cpu->system->memctrl->badaddr(memReq[tid]->paddr)) {
             DPRINTF(Fetch, "Fetch: Bad address %#x (hopefully on a "
                     "misspeculating path!",
@@ -464,6 +479,7 @@ DefaultFetch<Impl>::fetchCacheLine(Addr fetch_PC, Fault &ret_fault, unsigned tid
             ret_fault = TheISA::genMachineCheckFault();
             return false;
         }
+#endif
 
         DPRINTF(Fetch, "Fetch: Doing instruction read.\n");
         fault = cpu->mem->read(memReq[tid], cacheData[tid]);
@@ -480,6 +496,8 @@ DefaultFetch<Impl>::fetchCacheLine(Addr fetch_PC, Fault &ret_fault, unsigned tid
 
             MemAccessResult result = icacheInterface->access(memReq[tid]);
 
+            fetchedCacheLines++;
+
             // If the cache missed, then schedule an event to wake
             // up this stage once the cache miss completes.
             // @todo: Possibly allow for longer than 1 cycle cache hits.
@@ -499,8 +517,6 @@ DefaultFetch<Impl>::fetchCacheLine(Addr fetch_PC, Fault &ret_fault, unsigned tid
                         "read.\n", tid);
 
 //                memcpy(cacheData[tid], memReq[tid]->data, memReq[tid]->size);
-
-                fetchedCacheLines++;
             }
         } else {
             DPRINTF(Fetch, "[tid:%i] Out of MSHRs!\n", tid);
@@ -889,10 +905,14 @@ DefaultFetch<Impl>::fetch(bool &status_change)
         if (!fetch_success)
             return;
     } else {
-        if (fetchStatus[tid] == Blocked) {
+        if (fetchStatus[tid] == Idle) {
+            ++fetchIdleCycles;
+        } else if (fetchStatus[tid] == Blocked) {
             ++fetchBlockedCycles;
         } else if (fetchStatus[tid] == Squashing) {
             ++fetchSquashCycles;
+        } else if (fetchStatus[tid] == IcacheMissStall) {
+            ++icacheStallCycles;
         }
 
         // Status is Idle, Squashing, Blocked, or IcacheMissStall, so
@@ -904,6 +924,7 @@ DefaultFetch<Impl>::fetch(bool &status_change)
 
     // If we had a stall due to an icache miss, then return.
     if (fetchStatus[tid] == IcacheMissStall) {
+        ++icacheStallCycles;
         status_change = true;
         return;
     }
diff --git a/cpu/o3/iew.hh b/cpu/o3/iew.hh
index e55837812..58cd68b21 100644
--- a/cpu/o3/iew.hh
+++ b/cpu/o3/iew.hh
@@ -278,6 +278,8 @@ class DefaultIEW
     void tick();
 
   private:
+    void updateExeInstStats(DynInstPtr &inst);
+
     /** Pointer to main time buffer used for backwards communication. */
     TimeBuffer<TimeStruct> *timeBuffer;
 
@@ -443,9 +445,9 @@ class DefaultIEW
     /** Stat for total number of executed instructions. */
     Stats::Scalar<> iewExecutedInsts;
     /** Stat for total number of executed load instructions. */
-    Stats::Scalar<> iewExecLoadInsts;
+    Stats::Vector<> iewExecLoadInsts;
     /** Stat for total number of executed store instructions. */
-    Stats::Scalar<> iewExecStoreInsts;
+//    Stats::Scalar<> iewExecStoreInsts;
     /** Stat for total number of squashed instructions skipped at execute. */
     Stats::Scalar<> iewExecSquashedInsts;
     /** Stat for total number of memory ordering violation events. */
@@ -456,6 +458,33 @@ class DefaultIEW
     Stats::Scalar<> predictedNotTakenIncorrect;
     /** Stat for total number of mispredicted branches detected at execute. */
     Stats::Formula branchMispredicts;
+
+    Stats::Vector<> exe_swp;
+    Stats::Vector<> exe_nop;
+    Stats::Vector<> exe_refs;
+    Stats::Vector<> exe_branches;
+
+//    Stats::Vector<> issued_ops;
+/*
+    Stats::Vector<> stat_fu_busy;
+    Stats::Vector2d<> stat_fuBusy;
+    Stats::Vector<> dist_unissued;
+    Stats::Vector2d<> stat_issued_inst_type;
+*/
+    Stats::Formula issue_rate;
+    Stats::Formula iewExecStoreInsts;
+//    Stats::Formula issue_op_rate;
+//    Stats::Formula fu_busy_rate;
+
+    Stats::Vector<> iewInstsToCommit;
+    Stats::Vector<> writeback_count;
+    Stats::Vector<> producer_inst;
+    Stats::Vector<> consumer_inst;
+    Stats::Vector<> wb_penalized;
+
+    Stats::Formula wb_rate;
+    Stats::Formula wb_fanout;
+    Stats::Formula wb_penalized_rate;
 };
 
 #endif // __CPU_O3_IEW_HH__
diff --git a/cpu/o3/iew_impl.hh b/cpu/o3/iew_impl.hh
index 21eb7dcf8..2ae2e1361 100644
--- a/cpu/o3/iew_impl.hh
+++ b/cpu/o3/iew_impl.hh
@@ -140,6 +140,8 @@ template <class Impl>
 void
 DefaultIEW<Impl>::regStats()
 {
+    using namespace Stats;
+
     instQueue.regStats();
 
     //ldstQueue.regStats();
@@ -195,13 +197,15 @@ DefaultIEW<Impl>::regStats()
         .desc("Number of executed instructions");
 
     iewExecLoadInsts
+        .init(cpu->number_of_threads)
         .name(name() + ".iewExecLoadInsts")
-        .desc("Number of load instructions executed");
-
+        .desc("Number of load instructions executed")
+        .flags(total);
+/*
     iewExecStoreInsts
         .name(name() + ".iewExecStoreInsts")
         .desc("Number of store instructions executed");
-
+*/
     iewExecSquashedInsts
         .name(name() + ".iewExecSquashedInsts")
         .desc("Number of squashed instructions skipped in execute");
@@ -223,6 +227,116 @@ DefaultIEW<Impl>::regStats()
         .desc("Number of branch mispredicts detected at execute");
 
     branchMispredicts = predictedTakenIncorrect + predictedNotTakenIncorrect;
+
+    exe_swp
+        .init(cpu->number_of_threads)
+        .name(name() + ".EXEC:swp")
+        .desc("number of swp insts executed")
+        .flags(total)
+        ;
+
+    exe_nop
+        .init(cpu->number_of_threads)
+        .name(name() + ".EXEC:nop")
+        .desc("number of nop insts executed")
+        .flags(total)
+        ;
+
+    exe_refs
+        .init(cpu->number_of_threads)
+        .name(name() + ".EXEC:refs")
+        .desc("number of memory reference insts executed")
+        .flags(total)
+        ;
+
+    exe_branches
+        .init(cpu->number_of_threads)
+        .name(name() + ".EXEC:branches")
+        .desc("Number of branches executed")
+        .flags(total)
+        ;
+
+    issue_rate
+        .name(name() + ".EXEC:rate")
+        .desc("Inst execution rate")
+        .flags(total)
+        ;
+    issue_rate = iewExecutedInsts / cpu->numCycles;
+
+    iewExecStoreInsts
+        .name(name() + ".EXEC:stores")
+        .desc("Number of stores executed")
+        .flags(total)
+        ;
+    iewExecStoreInsts = exe_refs - iewExecLoadInsts;
+/*
+    for (int i=0; i<Num_OpClasses; ++i) {
+        stringstream subname;
+        subname << opClassStrings[i] << "_delay";
+        issue_delay_dist.subname(i, subname.str());
+    }
+*/
+    //
+    //  Other stats
+    //
+
+    iewInstsToCommit
+        .init(cpu->number_of_threads)
+        .name(name() + ".WB:sent")
+        .desc("cumulative count of insts sent to commit")
+        .flags(total)
+        ;
+
+    writeback_count
+        .init(cpu->number_of_threads)
+        .name(name() + ".WB:count")
+        .desc("cumulative count of insts written-back")
+        .flags(total)
+        ;
+
+    producer_inst
+        .init(cpu->number_of_threads)
+        .name(name() + ".WB:producers")
+        .desc("num instructions producing a value")
+        .flags(total)
+        ;
+
+    consumer_inst
+        .init(cpu->number_of_threads)
+        .name(name() + ".WB:consumers")
+        .desc("num instructions consuming a value")
+        .flags(total)
+        ;
+
+    wb_penalized
+        .init(cpu->number_of_threads)
+        .name(name() + ".WB:penalized")
+        .desc("number of instrctions required to write to 'other' IQ")
+        .flags(total)
+        ;
+
+    wb_penalized_rate
+        .name(name() + ".WB:penalized_rate")
+        .desc ("fraction of instructions written-back that wrote to 'other' IQ")
+        .flags(total)
+        ;
+
+    wb_penalized_rate = wb_penalized / writeback_count;
+
+    wb_fanout
+        .name(name() + ".WB:fanout")
+        .desc("average fanout of values written-back")
+        .flags(total)
+        ;
+
+    wb_fanout = producer_inst / consumer_inst;
+
+    wb_rate
+        .name(name() + ".WB:rate")
+        .desc("insts written-back per cycle")
+        .flags(total)
+        ;
+    wb_rate = writeback_count / cpu->numCycles;
 }
 
 template<class Impl>
@@ -990,6 +1104,8 @@ DefaultIEW<Impl>::dispatchInsts(unsigned tid)
 
             instQueue.advanceTail(inst);
 
+            exe_nop[tid]++;
+
             add_to_iq = false;
         } else if (inst->isExecuted()) {
             assert(0 && "Instruction shouldn't be executed.\n");
@@ -1124,11 +1240,11 @@ DefaultIEW<Impl>::executeInsts()
                 // event adds the instruction to the queue to commit
                 fault = ldstQueue.executeLoad(inst);
 
-                ++iewExecLoadInsts;
+//                ++iewExecLoadInsts;
             } else if (inst->isStore()) {
                 ldstQueue.executeStore(inst);
 
-                ++iewExecStoreInsts;
+//                ++iewExecStoreInsts;
 
                 // If the store had a fault then it may not have a mem req
                 if (inst->req && !(inst->req->flags & LOCKED)) {
@@ -1146,13 +1262,13 @@ DefaultIEW<Impl>::executeInsts()
         } else {
             inst->execute();
 
-            ++iewExecutedInsts;
-
             inst->setExecuted();
 
             instToCommit(inst);
         }
 
+        updateExeInstStats(inst);
+
         // Check if branch was correct.  This check happens after the
         // instruction is added to the queue because even if the branch
         // is mispredicted, the branch instruction itself is still valid.
@@ -1243,17 +1359,20 @@ DefaultIEW<Impl>::writebackInsts()
     for (int inst_num = 0; inst_num < issueWidth &&
              toCommit->insts[inst_num]; inst_num++) {
         DynInstPtr inst = toCommit->insts[inst_num];
+        int tid = inst->threadNumber;
 
         DPRINTF(IEW, "Sending instructions to commit, PC %#x.\n",
                 inst->readPC());
 
+        iewInstsToCommit[tid]++;
+
         // Some instructions will be sent to commit without having
         // executed because they need commit to handle them.
         // E.g. Uncached loads have not actually executed when they
         // are first sent to commit.  Instead commit must tell the LSQ
         // when it's ready to execute the uncached load.
         if (!inst->isSquashed() && inst->isExecuted()) {
-            instQueue.wakeDependents(inst);
+            int dependents = instQueue.wakeDependents(inst);
 
             for (int i = 0; i < inst->numDestRegs(); i++) {
                 //mark as Ready
@@ -1261,6 +1380,10 @@ DefaultIEW<Impl>::writebackInsts()
                         inst->renamedDestRegIdx(i));
                 scoreboard->setReg(inst->renamedDestRegIdx(i));
             }
+
+            producer_inst[tid]++;
+            consumer_inst[tid]+= dependents;
+            writeback_count[tid]++;
         }
     }
 }
@@ -1390,3 +1513,39 @@ DefaultIEW<Impl>::tick()
         cpu->activityThisCycle();
     }
 }
+
+template <class Impl>
+void
+DefaultIEW<Impl>::updateExeInstStats(DynInstPtr &inst)
+{
+    // Per-instruction execute-stage bookkeeping, indexed by thread.
+    int thread_number = inst->threadNumber;
+
+    // Software prefetches (Alpha only) go to exe_swp instead of the
+    // executed-instruction total.  iewExecutedInsts is a Scalar stat,
+    // so it is bumped directly rather than indexed by thread.
+#ifdef TARGET_ALPHA
+    if (inst->isDataPrefetch())
+        exe_swp[thread_number]++;
+    else
+        iewExecutedInsts++;
+#else
+    iewExecutedInsts++;
+#endif
+
+    // Control operations: anything flagged isControl() counts as a
+    // branch.
+    if (inst->isControl())
+        exe_branches[thread_number]++;
+
+    // Memory operations: every reference counts in exe_refs, and loads
+    // additionally in iewExecLoadInsts (the stores stat is a formula,
+    // exe_refs - iewExecLoadInsts).
+    if (inst->isMemRef()) {
+        exe_refs[thread_number]++;
+
+        if (inst->isLoad()) {
+            iewExecLoadInsts[thread_number]++;
+        }
+    }
+}
diff --git a/cpu/o3/inst_queue.hh b/cpu/o3/inst_queue.hh
index 283bbdc22..06d9937f2 100644
--- a/cpu/o3/inst_queue.hh
+++ b/cpu/o3/inst_queue.hh
@@ -185,7 +185,7 @@ class InstructionQueue
     void commit(const InstSeqNum &inst, unsigned tid = 0);
 
     /** Wakes all dependents of a completed instruction. */
-    void wakeDependents(DynInstPtr &completed_inst);
+    int wakeDependents(DynInstPtr &completed_inst);
 
     /** Adds a ready memory instruction to the ready list. */
     void addReadyMemInst(DynInstPtr &ready_inst);
@@ -479,6 +479,7 @@ class InstructionQueue
     /** Stat for number of non-speculative instructions added. */
     Stats::Scalar<> iqNonSpecInstsAdded;
 //    Stats::Scalar<> iqIntInstsAdded;
+    Stats::Scalar<> iqInstsIssued;
     /** Stat for number of integer instructions issued. */
     Stats::Scalar<> iqIntInstsIssued;
 //    Stats::Scalar<> iqFloatInstsAdded;
@@ -505,6 +506,20 @@ class InstructionQueue
      */
     Stats::Scalar<> iqSquashedNonSpecRemoved;
 
+    Stats::VectorDistribution<> queue_res_dist;
+    Stats::Vector<> n_issued_dist;
+    Stats::VectorDistribution<> issue_delay_dist;
+
+    Stats::Vector<> stat_fu_busy;
+//    Stats::Vector<> dist_unissued;
+    Stats::Vector2d<> stat_issued_inst_type;
+
+    Stats::Formula issue_rate;
+//    Stats::Formula issue_stores;
+//    Stats::Formula issue_op_rate;
+    Stats::Vector<> fu_busy;  //cumulative fu busy
+
+    Stats::Formula fu_busy_rate;
 };
 
 #endif //__CPU_O3_INST_QUEUE_HH__
diff --git a/cpu/o3/inst_queue_impl.hh b/cpu/o3/inst_queue_impl.hh
index cfdd25cd5..804bc2472 100644
--- a/cpu/o3/inst_queue_impl.hh
+++ b/cpu/o3/inst_queue_impl.hh
@@ -224,6 +224,7 @@ template <class Impl>
 void
 InstructionQueue<Impl>::regStats()
 {
+    using namespace Stats;
     iqInstsAdded
         .name(name() + ".iqInstsAdded")
         .desc("Number of instructions added to the IQ (excludes non-spec)")
@@ -236,6 +237,11 @@ InstructionQueue<Impl>::regStats()
 
 //    iqIntInstsAdded;
 
+    iqInstsIssued
+        .name(name() + ".iqInstsIssued")
+        .desc("Number of instructions issued")
+        .prereq(iqInstsIssued);
+
     iqIntInstsIssued
         .name(name() + ".iqIntInstsIssued")
         .desc("Number of integer instructions issued")
@@ -291,6 +297,103 @@ InstructionQueue<Impl>::regStats()
         .desc("Number of squashed non-spec instructions that were removed")
         .prereq(iqSquashedNonSpecRemoved);
 
+    queue_res_dist
+        .init(Num_OpClasses, 0, 99, 2)
+        .name(name() + ".IQ:residence:")
+        .desc("cycles from dispatch to issue")
+        .flags(total | pdf | cdf )
+        ;
+    for (int i = 0; i < Num_OpClasses; ++i) {
+        queue_res_dist.subname(i, opClassStrings[i]);
+    }
+    n_issued_dist
+        .init(totalWidth + 1)
+        .name(name() + ".ISSUE:issued_per_cycle")
+        .desc("Number of insts issued each cycle")
+        .flags(total | pdf | dist)
+        ;
+/*
+    dist_unissued
+        .init(Num_OpClasses+2)
+        .name(name() + ".ISSUE:unissued_cause")
+        .desc("Reason ready instruction not issued")
+        .flags(pdf | dist)
+        ;
+    for (int i=0; i < (Num_OpClasses + 2); ++i) {
+        dist_unissued.subname(i, unissued_names[i]);
+    }
+*/
+    stat_issued_inst_type
+        .init(numThreads,Num_OpClasses)
+        .name(name() + ".ISSUE:FU_type")
+        .desc("Type of FU issued")
+        .flags(total | pdf | dist)
+        ;
+    stat_issued_inst_type.ysubnames(opClassStrings);
+
+    //
+    //  How long did instructions for a particular FU type wait prior to issue
+    //
+
+    issue_delay_dist
+        .init(Num_OpClasses,0,99,2)
+        .name(name() + ".ISSUE:")
+        .desc("cycles from operands ready to issue")
+        .flags(pdf | cdf)
+        ;
+
+    for (int i=0; i<Num_OpClasses; ++i) {
+        stringstream subname;
+        subname << opClassStrings[i] << "_delay";
+        issue_delay_dist.subname(i, subname.str());
+    }
+
+    issue_rate
+        .name(name() + ".ISSUE:rate")
+        .desc("Inst issue rate")
+        .flags(total)
+        ;
+    issue_rate = iqInstsIssued / cpu->numCycles;
+/*
+    issue_stores
+        .name(name() + ".ISSUE:stores")
+        .desc("Number of stores issued")
+        .flags(total)
+        ;
+    issue_stores = exe_refs - exe_loads;
+*/
+/*
+    issue_op_rate
+        .name(name() + ".ISSUE:op_rate")
+        .desc("Operation issue rate")
+        .flags(total)
+        ;
+    issue_op_rate = issued_ops / numCycles;
+*/
+    stat_fu_busy
+        .init(Num_OpClasses)
+        .name(name() + ".ISSUE:fu_full")
+        .desc("attempts to use FU when none available")
+        .flags(pdf | dist)
+        ;
+    for (int i=0; i < Num_OpClasses; ++i) {
+        stat_fu_busy.subname(i, opClassStrings[i]);
+    }
+
+    fu_busy
+        .init(numThreads)
+        .name(name() + ".ISSUE:fu_busy_cnt")
+        .desc("FU busy when requested")
+        .flags(total)
+        ;
+
+    fu_busy_rate
+        .name(name() + ".ISSUE:fu_busy_rate")
+        .desc("FU busy rate (busy events/executed inst)")
+        .flags(total)
+        ;
+    fu_busy_rate = fu_busy / iqInstsIssued;
+
     for ( int i=0; i < numThreads; i++) {
         // Tell mem dependence unit to reg stats as well.
         memDepUnit[i].regStats();
@@ -658,6 +761,8 @@ InstructionQueue<Impl>::scheduleReadyInsts()
 
         int idx = fuPool->getUnit(op_class);
 
+        int tid = issuing_inst->threadNumber;
+
         if (idx == -2) {
             assert(op_class == No_OpClass);
 
@@ -666,7 +771,7 @@ InstructionQueue<Impl>::scheduleReadyInsts()
 
             DPRINTF(IQ, "Thread %i: Issuing instruction PC that needs no FU"
                     " %#x [sn:%lli]\n",
-                    issuing_inst->threadNumber, issuing_inst->readPC(),
+                    tid, issuing_inst->readPC(),
                     issuing_inst->seqNum);
 
             readyInsts[op_class].pop();
@@ -685,14 +790,15 @@ InstructionQueue<Impl>::scheduleReadyInsts()
                 // Memory instructions can not be freed from the IQ until they
                 // complete.
                 ++freeEntries;
-                count[issuing_inst->threadNumber]--;
+                count[tid]--;
                 issuing_inst->removeInIQ();
             } else {
-                memDepUnit[issuing_inst->threadNumber].issue(issuing_inst);
+                memDepUnit[tid].issue(issuing_inst);
             }
 
             listOrder.erase(order_it++);
 
+            stat_issued_inst_type[tid][op_class]++;
         } else if (idx != -1) {
             int op_latency = fuPool->getOpLatency(op_class);
 
@@ -722,7 +828,7 @@ InstructionQueue<Impl>::scheduleReadyInsts()
 
             DPRINTF(IQ, "Thread %i: Issuing instruction PC %#x "
                     "[sn:%lli]\n",
-                    issuing_inst->threadNumber, issuing_inst->readPC(),
+                    tid, issuing_inst->readPC(),
                     issuing_inst->seqNum);
 
             readyInsts[op_class].pop();
@@ -741,14 +847,17 @@ InstructionQueue<Impl>::scheduleReadyInsts()
                 // Memory instructions can not be freed from the IQ until they
                 // complete.
                 ++freeEntries;
-                count[issuing_inst->threadNumber]--;
+                count[tid]--;
                 issuing_inst->removeInIQ();
             } else {
-                memDepUnit[issuing_inst->threadNumber].issue(issuing_inst);
+                memDepUnit[tid].issue(issuing_inst);
             }
 
             listOrder.erase(order_it++);
+            stat_issued_inst_type[tid][op_class]++;
         } else {
+            stat_fu_busy[op_class]++;
+            fu_busy[tid]++;
             ++order_it;
         }
     }
@@ -808,9 +917,11 @@ InstructionQueue<Impl>::commit(const InstSeqNum &inst, unsigned tid)
 }
 
 template <class Impl>
-void
+int
 InstructionQueue<Impl>::wakeDependents(DynInstPtr &completed_inst)
 {
+    int dependents = 0;
+
     DPRINTF(IQ, "Waking dependents of completed instruction.\n");
 
     assert(!completed_inst->isSquashed());
@@ -875,6 +986,8 @@ InstructionQueue<Impl>::wakeDependents(DynInstPtr &completed_inst)
             curr = prev->next;
             prev->inst = NULL;
 
+            ++dependents;
+
             delete prev;
         }
 
@@ -886,6 +999,7 @@ InstructionQueue<Impl>::wakeDependents(DynInstPtr &completed_inst)
         // Mark the scoreboard as having that register ready.
         regScoreboard[dest_reg] = true;
     }
+    return dependents;
 }
 
 template <class Impl>
diff --git a/cpu/o3/rename.hh b/cpu/o3/rename.hh
index d5beccde9..c6f8f97aa 100644
--- a/cpu/o3/rename.hh
+++ b/cpu/o3/rename.hh
@@ -90,7 +90,7 @@ class DefaultRename
         Squashing,
         Blocked,
         Unblocking,
-        BarrierStall
+        SerializeStall
     };
 
   private:
@@ -359,8 +359,8 @@ class DefaultRename
     /** Tracks which stages are telling decode to stall. */
     Stalls stalls[Impl::MaxThreads];
 
-    /** The barrier instruction that rename has stalled on. */
-    DynInstPtr barrierInst[Impl::MaxThreads];
+    /** The serialize instruction that rename has stalled on. */
+    DynInstPtr serializeInst[Impl::MaxThreads];
 
     /** Records if rename needs to serialize on the next instruction for any
      * thread.
@@ -419,8 +419,8 @@ class DefaultRename
     Stats::Scalar<> renameIdleCycles;
     /** Stat for total number of cycles spent blocking. */
     Stats::Scalar<> renameBlockCycles;
-    /** Stat for total number of cycles spent stalling for a barrier. */
-    Stats::Scalar<> renameBarrierCycles;
+    /** Stat for total number of cycles spent stalling for a serializing inst. */
+    Stats::Scalar<> renameSerializeStallCycles;
     /** Stat for total number of cycles spent running normally. */
     Stats::Scalar<> renameRunCycles;
     /** Stat for total number of cycles spent unblocking. */
@@ -446,6 +446,8 @@ class DefaultRename
     Stats::Scalar<> renameCommittedMaps;
     /** Stat for total number of mappings that were undone due to a squash. */
     Stats::Scalar<> renameUndoneMaps;
+    Stats::Scalar<> renamedSerializing;
+    Stats::Scalar<> renamedTempSerializing;
 };
 
 #endif // __CPU_O3_RENAME_HH__
diff --git a/cpu/o3/rename_impl.hh b/cpu/o3/rename_impl.hh
index 441118ef1..e29211921 100644
--- a/cpu/o3/rename_impl.hh
+++ b/cpu/o3/rename_impl.hh
@@ -53,7 +53,7 @@ DefaultRename<Impl>::DefaultRename(Params *params)
 
         stalls[i].iew = false;
         stalls[i].commit = false;
-        barrierInst[i] = NULL;
+        serializeInst[i] = NULL;
 
         instsInProgress[i] = 0;
 
@@ -78,69 +78,79 @@ void
 DefaultRename<Impl>::regStats()
 {
     renameSquashCycles
-        .name(name() + ".renameSquashCycles")
+        .name(name() + ".RENAME:SquashCycles")
         .desc("Number of cycles rename is squashing")
         .prereq(renameSquashCycles);
     renameIdleCycles
-        .name(name() + ".renameIdleCycles")
+        .name(name() + ".RENAME:IdleCycles")
         .desc("Number of cycles rename is idle")
         .prereq(renameIdleCycles);
     renameBlockCycles
-        .name(name() + ".renameBlockCycles")
+        .name(name() + ".RENAME:BlockCycles")
         .desc("Number of cycles rename is blocking")
         .prereq(renameBlockCycles);
-    renameBarrierCycles
-        .name(name() + ".renameBarrierCycles")
-        .desc("Number of cycles rename is blocking due to a barrier stall")
-        .prereq(renameBarrierCycles);
+    renameSerializeStallCycles
+        .name(name() + ".RENAME:serializeStallCycles")
+        .desc("count of cycles rename stalled for serializing inst")
+        .flags(Stats::total);
     renameRunCycles
-        .name(name() + ".renameRunCycles")
+        .name(name() + ".RENAME:RunCycles")
         .desc("Number of cycles rename is running")
         .prereq(renameIdleCycles);
     renameUnblockCycles
-        .name(name() + ".renameUnblockCycles")
+        .name(name() + ".RENAME:UnblockCycles")
         .desc("Number of cycles rename is unblocking")
         .prereq(renameUnblockCycles);
     renameRenamedInsts
-        .name(name() + ".renameRenamedInsts")
+        .name(name() + ".RENAME:RenamedInsts")
         .desc("Number of instructions processed by rename")
         .prereq(renameRenamedInsts);
     renameSquashedInsts
-        .name(name() + ".renameSquashedInsts")
+        .name(name() + ".RENAME:SquashedInsts")
         .desc("Number of squashed instructions processed by rename")
         .prereq(renameSquashedInsts);
     renameROBFullEvents
-        .name(name() + ".renameROBFullEvents")
+        .name(name() + ".RENAME:ROBFullEvents")
         .desc("Number of times rename has blocked due to ROB full")
         .prereq(renameROBFullEvents);
     renameIQFullEvents
-        .name(name() + ".renameIQFullEvents")
+        .name(name() + ".RENAME:IQFullEvents")
         .desc("Number of times rename has blocked due to IQ full")
         .prereq(renameIQFullEvents);
     renameLSQFullEvents
-        .name(name() + ".renameLSQFullEvents")
+        .name(name() + ".RENAME:LSQFullEvents")
         .desc("Number of times rename has blocked due to LSQ full")
         .prereq(renameLSQFullEvents);
     renameFullRegistersEvents
-        .name(name() + ".renameFullRegisterEvents")
+        .name(name() + ".RENAME:FullRegisterEvents")
         .desc("Number of times there has been no free registers")
         .prereq(renameFullRegistersEvents);
     renameRenamedOperands
-        .name(name() + ".renameRenamedOperands")
+        .name(name() + ".RENAME:RenamedOperands")
         .desc("Number of destination operands rename has renamed")
         .prereq(renameRenamedOperands);
     renameRenameLookups
-        .name(name() + ".renameRenameLookups")
+        .name(name() + ".RENAME:RenameLookups")
         .desc("Number of register rename lookups that rename has made")
         .prereq(renameRenameLookups);
     renameCommittedMaps
-        .name(name() + ".renameCommittedMaps")
+        .name(name() + ".RENAME:CommittedMaps")
         .desc("Number of HB maps that are committed")
         .prereq(renameCommittedMaps);
     renameUndoneMaps
-        .name(name() + ".renameUndoneMaps")
+        .name(name() + ".RENAME:UndoneMaps")
         .desc("Number of HB maps that are undone due to squashing")
         .prereq(renameUndoneMaps);
+    renamedSerializing
+        .name(name() + ".RENAME:serializingInsts")
+        .desc("count of serializing insts renamed")
+        .flags(Stats::total)
+        ;
+    renamedTempSerializing
+        .name(name() + ".RENAME:tempSerializingInsts")
+        .desc("count of temporary serializing insts renamed")
+        .flags(Stats::total)
+        ;
 }
 
 template <class Impl>
@@ -254,7 +264,7 @@ DefaultRename<Impl>::squash(unsigned tid)
     // cycle and there should be space to hold everything due to the squash.
     if (renameStatus[tid] == Blocked ||
         renameStatus[tid] == Unblocking ||
-        renameStatus[tid] == BarrierStall) {
+        renameStatus[tid] == SerializeStall) {
 #if !FULL_SYSTEM
         // In syscall emulation, we can have both a block and a squash due
         // to a syscall in the same cycle.  This would cause both signals to
@@ -267,7 +277,7 @@ DefaultRename<Impl>::squash(unsigned tid)
 #else
         toDecode->renameUnblock[tid] = 1;
 #endif
-        barrierInst[tid] = NULL;
+        serializeInst[tid] = NULL;
     }
 
     // Set the status to Squashing.
@@ -370,8 +380,8 @@ DefaultRename<Impl>::rename(bool &status_change, unsigned tid)
         ++renameBlockCycles;
     } else if (renameStatus[tid] == Squashing) {
         ++renameSquashCycles;
-    } else if (renameStatus[tid] == BarrierStall) {
-        ++renameBarrierCycles;
+    } else if (renameStatus[tid] == SerializeStall) {
+        ++renameSerializeStallCycles;
     }
 
     if (renameStatus[tid] == Running ||
@@ -535,14 +545,18 @@ DefaultRename<Impl>::renameInsts(unsigned tid)
         if (inst->isSerializeBefore() && !inst->isSerializeHandled()) {
             DPRINTF(Rename, "Serialize before instruction encountered.\n");
 
-            if (!inst->isTempSerializeBefore())
+            if (!inst->isTempSerializeBefore()) {
+                renamedSerializing++;
                 inst->setSerializeHandled();
+            } else {
+                renamedTempSerializing++;
+            }
 
-            // Change status over to BarrierStall so that other stages know
+            // Change status over to SerializeStall so that other stages know
             // what this is blocked on.
-            renameStatus[tid] = BarrierStall;
+            renameStatus[tid] = SerializeStall;
 
-            barrierInst[tid] = inst;
+            serializeInst[tid] = inst;
 
             blockThisCycle = true;
 
@@ -716,9 +730,9 @@ DefaultRename<Impl>::block(unsigned tid)
             wroteToTimeBuffer = true;
         }
 
-        // Rename can not go from BarrierStall to Blocked, otherwise it would
-        // not know to complete the barrier stall.
-        if (renameStatus[tid] != BarrierStall) {
+        // Rename can not go from SerializeStall to Blocked, otherwise it would
+        // not know to complete the serialize stall.
+        if (renameStatus[tid] != SerializeStall) {
             // Set status to Blocked.
             renameStatus[tid] = Blocked;
             return true;
@@ -735,7 +749,7 @@ DefaultRename<Impl>::unblock(unsigned tid)
     DPRINTF(Rename, "[tid:%u]: Trying to unblock.\n", tid);
 
     // Rename is done unblocking if the skid buffer is empty.
-    if (skidBuffer[tid].empty() && renameStatus[tid] != BarrierStall) {
+    if (skidBuffer[tid].empty() && renameStatus[tid] != SerializeStall) {
 
         DPRINTF(Rename, "[tid:%u]: Done unblocking.\n", tid);
 
@@ -1008,9 +1022,9 @@ DefaultRename<Impl>::checkStall(unsigned tid)
     } else if (renameMap[tid]->numFreeEntries() <= 0) {
         DPRINTF(Rename,"[tid:%i]: Stall: RenameMap has 0 free entries.\n", tid);
         ret_val = true;
-    } else if (renameStatus[tid] == BarrierStall &&
+    } else if (renameStatus[tid] == SerializeStall &&
                (!emptyROB[tid] || instsInProgress[tid])) {
-        DPRINTF(Rename,"[tid:%i]: Stall: Barrier stall and ROB is not "
+        DPRINTF(Rename,"[tid:%i]: Stall: Serialize stall and ROB is not "
                 "empty.\n",
                 tid);
         ret_val = true;
@@ -1064,7 +1078,7 @@ DefaultRename<Impl>::checkSignalsAndUpdate(unsigned tid)
     //         if so then go to unblocking
     // If status was Squashing
     //     check if squashing is not high.  Switch to running this cycle.
-    // If status was barrier stall
+    // If status was serialize stall
     //     check if ROB is empty and no insts are in flight to the ROB
 
     readFreeEntries(tid);
@@ -1113,12 +1127,12 @@ DefaultRename<Impl>::checkSignalsAndUpdate(unsigned tid)
         return false;
     }
 
-    if (renameStatus[tid] == BarrierStall) {
+    if (renameStatus[tid] == SerializeStall) {
         // Stall ends once the ROB is free.
-        DPRINTF(Rename, "[tid:%u]: Done with barrier stall, switching to "
+        DPRINTF(Rename, "[tid:%u]: Done with serialize stall, switching to "
                 "unblocking.\n", tid);
 
-        DynInstPtr barr_inst = barrierInst[tid];
+        DynInstPtr serial_inst = serializeInst[tid];
 
         renameStatus[tid] = Unblocking;
 
@@ -1126,21 +1140,21 @@ DefaultRename<Impl>::checkSignalsAndUpdate(unsigned tid)
 
         DPRINTF(Rename, "[tid:%u]: Processing instruction [%lli] with "
                 "PC %#x.\n",
-                tid, barr_inst->seqNum, barr_inst->readPC());
+                tid, serial_inst->seqNum, serial_inst->readPC());
 
         // Put instruction into queue here.
-        barr_inst->clearSerializeBefore();
+        serial_inst->clearSerializeBefore();
 
         if (!skidBuffer[tid].empty()) {
-            skidBuffer[tid].push_front(barr_inst);
+            skidBuffer[tid].push_front(serial_inst);
         } else {
-            insts[tid].push_front(barr_inst);
+            insts[tid].push_front(serial_inst);
         }
 
         DPRINTF(Rename, "[tid:%u]: Instruction must be processed by rename."
                 " Adding to front of list.", tid);
 
-        barrierInst[tid] = NULL;
+        serializeInst[tid] = NULL;
 
         return true;
     }
-- 
cgit v1.2.3


From 31e09892d750d0e6dc7de3d455e34808c159a420 Mon Sep 17 00:00:00 2001
From: Kevin Lim <ktlim@umich.edu>
Date: Mon, 24 Apr 2006 17:11:31 -0400
Subject: Include option for disabling PC symbols.

cpu/inst_seq.hh:
cpu/o3/cpu.cc:
cpu/ozone/cpu_builder.cc:
cpu/ozone/thread_state.hh:
    SE build fixes.

--HG--
extra : convert_revision : a4df6128533105f849b5469f62d83dffe299b7df
---
 cpu/o3/cpu.cc | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'cpu/o3')

diff --git a/cpu/o3/cpu.cc b/cpu/o3/cpu.cc
index d322037bc..ac8c4236e 100644
--- a/cpu/o3/cpu.cc
+++ b/cpu/o3/cpu.cc
@@ -123,7 +123,7 @@ FullO3CPU<Impl>::FullO3CPU(Params *params)
       physmem(system->physmem),
       mem(params->mem),
 #else
-      pTable(params->pTable),
+//      pTable(params->pTable),
 #endif // FULL_SYSTEM
 
       icacheInterface(params->icacheInterface),
@@ -238,8 +238,8 @@ FullO3CPU<Impl>::FullO3CPU(Params *params)
 
     // Setup the page table for whichever stages need it.
 #if !FULL_SYSTEM
-    fetch.setPageTable(pTable);
-    iew.setPageTable(pTable);
+//    fetch.setPageTable(pTable);
+//    iew.setPageTable(pTable);
 #endif
 
     // Setup the ROB for whichever stages need it.
@@ -885,11 +885,9 @@ template <class Impl>
 void
 FullO3CPU<Impl>::removeFrontInst(DynInstPtr &inst)
 {
-    unsigned tid = inst->threadNumber;
-
     DPRINTF(FullCPU, "FullCPU: Removing committed instruction [tid:%i] PC %#x "
             "[sn:%lli]\n",
-            tid, inst->readPC(), inst->seqNum);
+            inst->threadNumber, inst->readPC(), inst->seqNum);
 
     removeInstsThisCycle = true;
 
-- 
cgit v1.2.3


From f3358e5f7b6452f14a6df5106129ef0cb2ed8b65 Mon Sep 17 00:00:00 2001
From: Kevin Lim <ktlim@umich.edu>
Date: Thu, 4 May 2006 11:36:20 -0400
Subject: O3 CPU now handles being used with the sampler.

cpu/o3/2bit_local_pred.cc:
cpu/o3/2bit_local_pred.hh:
cpu/o3/bpred_unit.hh:
cpu/o3/bpred_unit_impl.hh:
cpu/o3/btb.cc:
cpu/o3/btb.hh:
cpu/o3/commit.hh:
cpu/o3/commit_impl.hh:
cpu/o3/cpu.cc:
cpu/o3/cpu.hh:
cpu/o3/decode.hh:
cpu/o3/decode_impl.hh:
cpu/o3/fetch.hh:
cpu/o3/fetch_impl.hh:
cpu/o3/fu_pool.cc:
cpu/o3/fu_pool.hh:
cpu/o3/iew.hh:
cpu/o3/iew_impl.hh:
cpu/o3/inst_queue.hh:
cpu/o3/inst_queue_impl.hh:
cpu/o3/lsq.hh:
cpu/o3/lsq_impl.hh:
cpu/o3/lsq_unit.hh:
cpu/o3/lsq_unit_impl.hh:
cpu/o3/mem_dep_unit.hh:
cpu/o3/mem_dep_unit_impl.hh:
cpu/o3/ras.cc:
cpu/o3/ras.hh:
cpu/o3/rename.hh:
cpu/o3/rename_impl.hh:
cpu/o3/rob.hh:
cpu/o3/rob_impl.hh:
cpu/o3/sat_counter.cc:
cpu/o3/sat_counter.hh:
cpu/o3/thread_state.hh:
    Handle switching out and taking over.  Needs to be able to reset all state.
cpu/o3/alpha_cpu_impl.hh:
    Handle taking over from another XC.

--HG--
extra : convert_revision : b936e826f0f8a18319bfa940ff35097b4192b449
---
 cpu/o3/2bit_local_pred.cc   |   8 +++
 cpu/o3/2bit_local_pred.hh   |   2 +
 cpu/o3/alpha_cpu_impl.hh    |  20 ++++++
 cpu/o3/bpred_unit.hh        |   4 ++
 cpu/o3/bpred_unit_impl.hh   |  21 ++++++
 cpu/o3/btb.cc               |   8 +++
 cpu/o3/btb.hh               |   2 +
 cpu/o3/commit.hh            |   6 ++
 cpu/o3/commit_impl.hh       |  38 ++++++++++-
 cpu/o3/cpu.cc               |  76 +++++++++++++++++----
 cpu/o3/cpu.hh               |   9 +--
 cpu/o3/decode.hh            |   5 ++
 cpu/o3/decode_impl.hh       |  50 +++++++++++---
 cpu/o3/fetch.hh             |  13 ++++
 cpu/o3/fetch_impl.hh        |  70 ++++++++++++++-----
 cpu/o3/fu_pool.cc           |  14 ++++
 cpu/o3/fu_pool.hh           |   3 +
 cpu/o3/iew.hh               |   8 +++
 cpu/o3/iew_impl.hh          |  56 ++++++++++++++--
 cpu/o3/inst_queue.hh        |  14 +++-
 cpu/o3/inst_queue_impl.hh   | 160 ++++++++++++++++++++++++++++----------------
 cpu/o3/lsq.hh               |  12 +---
 cpu/o3/lsq_impl.hh          |  19 +++++-
 cpu/o3/lsq_unit.hh          |  49 ++++++++++++--
 cpu/o3/lsq_unit_impl.hh     |  90 ++++++++++++++++++++++++-
 cpu/o3/mem_dep_unit.hh      |   4 ++
 cpu/o3/mem_dep_unit_impl.hh |  20 ++++++
 cpu/o3/ras.cc               |   9 +++
 cpu/o3/ras.hh               |   2 +
 cpu/o3/rename.hh            |   5 ++
 cpu/o3/rename_impl.hh       |  67 +++++++++++++++++--
 cpu/o3/rob.hh               |   4 ++
 cpu/o3/rob_impl.hh          |  25 +++++++
 cpu/o3/sat_counter.cc       |  24 ++-----
 cpu/o3/sat_counter.hh       |  19 +++++-
 cpu/o3/thread_state.hh      |   2 +-
 36 files changed, 786 insertions(+), 152 deletions(-)

(limited to 'cpu/o3')

diff --git a/cpu/o3/2bit_local_pred.cc b/cpu/o3/2bit_local_pred.cc
index 458fbd663..eab98531d 100644
--- a/cpu/o3/2bit_local_pred.cc
+++ b/cpu/o3/2bit_local_pred.cc
@@ -67,6 +67,14 @@ DefaultBP::DefaultBP(unsigned _localPredictorSize,
             instShiftAmt);
 }
 
+void
+DefaultBP::reset()
+{
+    for (int i = 0; i < localPredictorSets; ++i) {
+        localCtrs[i].reset();
+    }
+}
+
 bool
 DefaultBP::lookup(Addr &branch_addr)
 {
diff --git a/cpu/o3/2bit_local_pred.hh b/cpu/o3/2bit_local_pred.hh
index 38d3f4842..0dfe53819 100644
--- a/cpu/o3/2bit_local_pred.hh
+++ b/cpu/o3/2bit_local_pred.hh
@@ -62,6 +62,8 @@ class DefaultBP
      */
     void update(Addr &branch_addr, bool taken);
 
+    void reset();
+
   private:
 
     /**
diff --git a/cpu/o3/alpha_cpu_impl.hh b/cpu/o3/alpha_cpu_impl.hh
index 86f7d9f28..7a2d5d2b9 100644
--- a/cpu/o3/alpha_cpu_impl.hh
+++ b/cpu/o3/alpha_cpu_impl.hh
@@ -151,6 +151,26 @@ template <class Impl>
 void
 AlphaFullCPU<Impl>::AlphaXC::takeOverFrom(ExecContext *old_context)
 {
+    // some things should already be set up
+    assert(getMemPtr() == old_context->getMemPtr());
+#if FULL_SYSTEM
+    assert(getSystemPtr() == old_context->getSystemPtr());
+#else
+    assert(getProcessPtr() == old_context->getProcessPtr());
+#endif
+
+    // copy over functional state
+    setStatus(old_context->status());
+    copyArchRegs(old_context);
+    setCpuId(old_context->readCpuId());
+#if !FULL_SYSTEM
+    thread->funcExeInst = old_context->readFuncExeInst();
+#endif
+
+    old_context->setStatus(ExecContext::Unallocated);
+
+    thread->inSyscall = false;
+    thread->trapPending = false;
 }
 
 template <class Impl>
diff --git a/cpu/o3/bpred_unit.hh b/cpu/o3/bpred_unit.hh
index 67c300989..ee7ffc183 100644
--- a/cpu/o3/bpred_unit.hh
+++ b/cpu/o3/bpred_unit.hh
@@ -67,6 +67,10 @@ class TwobitBPredUnit
      */
     void regStats();
 
+    void switchOut();
+
+    void takeOverFrom();
+
     /**
      * Predicts whether or not the instruction is a taken branch, and the
      * target of the branch if it is taken.
diff --git a/cpu/o3/bpred_unit_impl.hh b/cpu/o3/bpred_unit_impl.hh
index f79b67b6c..872c0c62e 100644
--- a/cpu/o3/bpred_unit_impl.hh
+++ b/cpu/o3/bpred_unit_impl.hh
@@ -94,6 +94,26 @@ TwobitBPredUnit<Impl>::regStats()
         ;
 }
 
+template <class Impl>
+void
+TwobitBPredUnit<Impl>::switchOut()
+{
+    for (int i = 0; i < Impl::MaxThreads; ++i) {
+        predHist[i].clear();
+    }
+}
+
+template <class Impl>
+void
+TwobitBPredUnit<Impl>::takeOverFrom()
+{
+    for (int i = 0; i < Impl::MaxThreads; ++i)
+        RAS[i].reset();
+
+    BP.reset();
+    BTB.reset();
+}
+
 template <class Impl>
 bool
 TwobitBPredUnit<Impl>::predict(DynInstPtr &inst, Addr &PC, unsigned tid)
@@ -297,5 +317,6 @@ TwobitBPredUnit<Impl>::squash(const InstSeqNum &squashed_sn,
         BP.update(pred_hist.front().PC, actually_taken);
 
         BTB.update(pred_hist.front().PC, corr_target, tid);
+        pred_hist.pop_front();
     }
 }
diff --git a/cpu/o3/btb.cc b/cpu/o3/btb.cc
index e084142d7..e5f69043a 100644
--- a/cpu/o3/btb.cc
+++ b/cpu/o3/btb.cc
@@ -58,6 +58,14 @@ DefaultBTB::DefaultBTB(unsigned _numEntries,
     tagShiftAmt = instShiftAmt + floorLog2(numEntries);
 }
 
+void
+DefaultBTB::reset()
+{
+    for (int i = 0; i < numEntries; ++i) {
+        btb[i].valid = false;
+    }
+}
+
 inline
 unsigned
 DefaultBTB::getIndex(const Addr &inst_PC)
diff --git a/cpu/o3/btb.hh b/cpu/o3/btb.hh
index aaa9945f7..b9ff42573 100644
--- a/cpu/o3/btb.hh
+++ b/cpu/o3/btb.hh
@@ -65,6 +65,8 @@ class DefaultBTB
     DefaultBTB(unsigned numEntries, unsigned tagBits,
                unsigned instShiftAmt);
 
+    void reset();
+
     /** Looks up an address in the BTB. Must call valid() first on the address.
      *  @param inst_PC The address of the branch to look up.
      *  @param tid The thread id.
diff --git a/cpu/o3/commit.hh b/cpu/o3/commit.hh
index f374b8fb7..028bd5295 100644
--- a/cpu/o3/commit.hh
+++ b/cpu/o3/commit.hh
@@ -175,6 +175,10 @@ class DefaultCommit
     /** Initializes stage by sending back the number of free entries. */
     void initStage();
 
+    void switchOut();
+
+    void takeOverFrom();
+
     /** Ticks the commit stage, which tries to commit instructions. */
     void tick();
 
@@ -351,6 +355,8 @@ class DefaultCommit
     /** Number of Active Threads */
     unsigned numThreads;
 
+    bool switchedOut;
+
     Tick trapLatency;
 
     Tick fetchTrapLatency;
diff --git a/cpu/o3/commit_impl.hh b/cpu/o3/commit_impl.hh
index 157e688c7..7834460e2 100644
--- a/cpu/o3/commit_impl.hh
+++ b/cpu/o3/commit_impl.hh
@@ -54,6 +54,7 @@ template <class Impl>
 void
 DefaultCommit<Impl>::TrapEvent::process()
 {
+    // This will get reset if it was switched out.
     commit->trapSquash[tid] = true;
 }
 
@@ -75,7 +76,8 @@ DefaultCommit<Impl>::DefaultCommit(Params *params)
       renameWidth(params->renameWidth),
       iewWidth(params->executeWidth),
       commitWidth(params->commitWidth),
-      numThreads(params->numberOfThreads)
+      numThreads(params->numberOfThreads),
+      switchedOut(false)
 {
     _status = Active;
     _nextStatus = Inactive;
@@ -254,6 +256,9 @@ DefaultCommit<Impl>::setCPU(FullCPU *cpu_ptr)
     // Commit must broadcast the number of free entries it has at the start of
     // the simulation, so it starts as active.
     cpu->activateStage(FullCPU::CommitIdx);
+
+    trapLatency = cpu->cycles(6);
+    fetchTrapLatency = cpu->cycles(12);
 }
 
 template <class Impl>
@@ -360,6 +365,29 @@ DefaultCommit<Impl>::initStage()
     cpu->activityThisCycle();
 }
 
+template <class Impl>
+void
+DefaultCommit<Impl>::switchOut()
+{
+    rob->switchOut();
+}
+
+template <class Impl>
+void
+DefaultCommit<Impl>::takeOverFrom()
+{
+    _status = Active;
+    _nextStatus = Inactive;
+    for (int i=0; i < numThreads; i++) {
+        commitStatus[i] = Idle;
+        changedROBNumEntries[i] = false;
+        trapSquash[i] = false;
+        xcSquash[i] = false;
+    }
+    squashCounter = 0;
+    rob->takeOverFrom();
+}
+
 template <class Impl>
 void
 DefaultCommit<Impl>::updateStatus()
@@ -719,8 +747,9 @@ DefaultCommit<Impl>::commit()
     while (threads != (*activeThreads).end()) {
         unsigned tid = *threads++;
 
-        if (fromFetch->fetchFault) {
+        if (fromFetch->fetchFault && commitStatus[0] != TrapPending) {
             // Record the fault.  Wait until it's empty in the ROB.  Then handle the trap.
+            // Ignore it if there's already a trap pending as fetch will be redirected.
             fetchFault = fromFetch->fetchFault;
             fetchFaultSN = fromFetch->fetchFaultSN;
             fetchFaultTick = curTick + fetchTrapLatency;
@@ -975,6 +1004,7 @@ DefaultCommit<Impl>::commitInsts()
                 }
 
                 PC[tid] = nextPC[tid];
+                nextPC[tid] = nextPC[tid] + sizeof(TheISA::MachInst);
 #if FULL_SYSTEM
                 int count = 0;
                 Addr oldpc;
@@ -1002,6 +1032,10 @@ DefaultCommit<Impl>::commitInsts()
 
     DPRINTF(CommitRate, "%i\n", num_committed);
     numCommittedDist.sample(num_committed);
+
+    if (num_committed == commitWidth) {
+        commit_eligible[0]++;
+    }
 }
 
 template <class Impl>
diff --git a/cpu/o3/cpu.cc b/cpu/o3/cpu.cc
index ac8c4236e..fc8372026 100644
--- a/cpu/o3/cpu.cc
+++ b/cpu/o3/cpu.cc
@@ -124,6 +124,7 @@ FullO3CPU<Impl>::FullO3CPU(Params *params)
       mem(params->mem),
 #else
 //      pTable(params->pTable),
+      mem(params->workload[0]->getMemory()),
 #endif // FULL_SYSTEM
 
       icacheInterface(params->icacheInterface),
@@ -176,9 +177,9 @@ FullO3CPU<Impl>::FullO3CPU(Params *params)
     numThreads = number_of_threads;
 
 #if !FULL_SYSTEM
-    int activeThreads = params->workload.size();
+    int active_threads = params->workload.size();
 #else
-    int activeThreads = 1;
+    int active_threads = 1;
 #endif
 
     assert(params->numPhysIntRegs   >= numThreads * TheISA::NumIntRegs);
@@ -192,7 +193,7 @@ FullO3CPU<Impl>::FullO3CPU(Params *params)
     PhysRegIndex freg_idx = params->numPhysIntRegs; //Index to 1 after int regs
 
     for (int tid=0; tid < numThreads; tid++) {
-        bool bindRegs = (tid <= activeThreads - 1);
+        bool bindRegs = (tid <= active_threads - 1);
 
         commitRenameMap[tid].init(TheISA::NumIntRegs,
                                   params->numPhysIntRegs,
@@ -357,7 +358,7 @@ FullO3CPU<Impl>::tick()
     }
 
     if (activityCount && !tickEvent.scheduled()) {
-        tickEvent.schedule(curTick + 1);
+        tickEvent.schedule(curTick + cycles(1));
     }
 
 #if !FULL_SYSTEM
@@ -370,8 +371,8 @@ template <class Impl>
 void
 FullO3CPU<Impl>::init()
 {
-    if (deferRegistration) {
-        return;
+    if (!deferRegistration) {
+        registerExecContexts();
     }
 
     // Set inSyscall so that the CPU doesn't squash when initially
@@ -379,7 +380,6 @@ FullO3CPU<Impl>::init()
     for (int i = 0; i < number_of_threads; ++i)
         thread[i]->inSyscall = true;
 
-    registerExecContexts();
 
     // Need to do a copy of the xc->regs into the CPU's regfile so
     // that it can start properly.
@@ -388,7 +388,7 @@ FullO3CPU<Impl>::init()
         // Need to do a copy of the xc->regs into the CPU's regfile so
         // that it can start properly.
 #if FULL_SYSTEM
-        ExecContext *src_xc = system->execContexts[tid];
+        ExecContext *src_xc = execContexts[tid];
 #else
         ExecContext *src_xc = thread[tid]->getXCProxy();
 #endif
@@ -584,7 +584,7 @@ FullO3CPU<Impl>::activateContext(int tid, int delay)
         activeThreads.push_back(tid);
     }
 
-    assert(_status == Idle);
+    assert(_status == Idle || _status == SwitchedOut);
 
     scheduleTickEvent(delay);
 
@@ -658,21 +658,64 @@ FullO3CPU<Impl>::haltContext(int tid)
 
 template <class Impl>
 void
-FullO3CPU<Impl>::switchOut()
+FullO3CPU<Impl>::switchOut(Sampler *sampler)
 {
-    panic("FullO3CPU does not have a switch out function.\n");
+//    panic("FullO3CPU does not have a switch out function.\n");
+    fetch.switchOut();
+    decode.switchOut();
+    rename.switchOut();
+    iew.switchOut();
+    commit.switchOut();
+    if (tickEvent.scheduled())
+        tickEvent.squash();
+    sampler->signalSwitched();
+    _status = SwitchedOut;
 }
 
 template <class Impl>
 void
 FullO3CPU<Impl>::takeOverFrom(BaseCPU *oldCPU)
 {
+    for (int i = 0; i < 6; ++i) {
+        timeBuffer.advance();
+        fetchQueue.advance();
+        decodeQueue.advance();
+        renameQueue.advance();
+        iewQueue.advance();
+        activityBuffer.advance();
+    }
+
+    activityCount = 0;
+    bzero(&stageActive, sizeof(stageActive));
+
     BaseCPU::takeOverFrom(oldCPU);
 
+    fetch.takeOverFrom();
+    decode.takeOverFrom();
+    rename.takeOverFrom();
+    iew.takeOverFrom();
+    commit.takeOverFrom();
+
     assert(!tickEvent.scheduled());
 
+    // @todo: Figure out how to properly select the tid to put onto the active threads list.
+    int tid = 0;
+
+    list<unsigned>::iterator isActive = find(
+        activeThreads.begin(), activeThreads.end(), tid);
+
+    if (isActive == activeThreads.end()) {
+        //May Need to Re-code this if the delay variable is the
+        //delay needed for thread to activate
+        DPRINTF(FullCPU, "Adding Thread %i to active threads list\n",
+                tid);
+
+        activeThreads.push_back(tid);
+    }
+
     // Set all status's to active, schedule the
     // CPU's tick event.
+    // @todo: Fix up statuses so this is handled properly
     for (int i = 0; i < execContexts.size(); ++i) {
         ExecContext *xc = execContexts[i];
         if (xc->status() == ExecContext::Active && _status != Running) {
@@ -680,6 +723,8 @@ FullO3CPU<Impl>::takeOverFrom(BaseCPU *oldCPU)
             tickEvent.schedule(curTick);
         }
     }
+    if (!tickEvent.scheduled())
+        tickEvent.schedule(curTick);
 }
 
 template <class Impl>
@@ -758,7 +803,8 @@ template <class Impl>
 float
 FullO3CPU<Impl>::readArchFloatRegSingle(int reg_idx, unsigned tid)
 {
-    PhysRegIndex phys_reg = commitRenameMap[tid].lookup(reg_idx);
+    int idx = reg_idx + TheISA::FP_Base_DepTag;
+    PhysRegIndex phys_reg = commitRenameMap[tid].lookup(idx);
 
     return regFile.readFloatRegSingle(phys_reg);
 }
@@ -767,7 +813,8 @@ template <class Impl>
 double
 FullO3CPU<Impl>::readArchFloatRegDouble(int reg_idx, unsigned tid)
 {
-    PhysRegIndex phys_reg = commitRenameMap[tid].lookup(reg_idx);
+    int idx = reg_idx + TheISA::FP_Base_DepTag;
+    PhysRegIndex phys_reg = commitRenameMap[tid].lookup(idx);
 
     return regFile.readFloatRegDouble(phys_reg);
 }
@@ -776,7 +823,8 @@ template <class Impl>
 uint64_t
 FullO3CPU<Impl>::readArchFloatRegInt(int reg_idx, unsigned tid)
 {
-    PhysRegIndex phys_reg = commitRenameMap[tid].lookup(reg_idx);
+    int idx = reg_idx + TheISA::FP_Base_DepTag;
+    PhysRegIndex phys_reg = commitRenameMap[tid].lookup(idx);
 
     return regFile.readFloatRegInt(phys_reg);
 }
diff --git a/cpu/o3/cpu.hh b/cpu/o3/cpu.hh
index 91eaf9d6f..621ddf541 100644
--- a/cpu/o3/cpu.hh
+++ b/cpu/o3/cpu.hh
@@ -82,7 +82,8 @@ class FullO3CPU : public BaseFullCPU
         Running,
         Idle,
         Halted,
-        Blocked
+        Blocked,
+        SwitchedOut
     };
 
     /** Overall CPU status. */
@@ -112,9 +113,9 @@ class FullO3CPU : public BaseFullCPU
     void scheduleTickEvent(int delay)
     {
         if (tickEvent.squashed())
-            tickEvent.reschedule(curTick + delay);
+            tickEvent.reschedule(curTick + cycles(delay));
         else if (!tickEvent.scheduled())
-            tickEvent.schedule(curTick + delay);
+            tickEvent.schedule(curTick + cycles(delay));
     }
 
     /** Unschedule tick event, regardless of its current state. */
@@ -196,7 +197,7 @@ class FullO3CPU : public BaseFullCPU
     /** Switches out this CPU.
      *  @todo: Implement this.
      */
-    void switchOut();
+    void switchOut(Sampler *sampler);
 
     /** Takes over from another CPU.
      *  @todo: Implement this.
diff --git a/cpu/o3/decode.hh b/cpu/o3/decode.hh
index 279ff556e..3f3f68247 100644
--- a/cpu/o3/decode.hh
+++ b/cpu/o3/decode.hh
@@ -107,6 +107,9 @@ class DefaultDecode
     /** Sets pointer to list of active threads. */
     void setActiveThreads(std::list<unsigned> *at_ptr);
 
+    void switchOut();
+
+    void takeOverFrom();
     /** Ticks decode, processing all input signals and decoding as many
      * instructions as possible.
      */
@@ -272,6 +275,8 @@ class DefaultDecode
     Stats::Scalar<> decodeUnblockCycles;
     /** Stat for total number of squashing cycles. */
     Stats::Scalar<> decodeSquashCycles;
+    /** Stat for number of times a branch is resolved at decode. */
+    Stats::Scalar<> decodeBranchResolved;
     /** Stat for number of times a branch mispredict is detected. */
     Stats::Scalar<> decodeBranchMispred;
     /** Stat for number of times decode detected a non-control instruction
diff --git a/cpu/o3/decode_impl.hh b/cpu/o3/decode_impl.hh
index f1aea27b4..caa97067b 100644
--- a/cpu/o3/decode_impl.hh
+++ b/cpu/o3/decode_impl.hh
@@ -66,40 +66,44 @@ void
 DefaultDecode<Impl>::regStats()
 {
     decodeIdleCycles
-        .name(name() + ".decodeIdleCycles")
+        .name(name() + ".DECODE:IdleCycles")
         .desc("Number of cycles decode is idle")
         .prereq(decodeIdleCycles);
     decodeBlockedCycles
-        .name(name() + ".decodeBlockedCycles")
+        .name(name() + ".DECODE:BlockedCycles")
         .desc("Number of cycles decode is blocked")
         .prereq(decodeBlockedCycles);
     decodeRunCycles
-        .name(name() + ".decodeRunCycles")
+        .name(name() + ".DECODE:RunCycles")
         .desc("Number of cycles decode is running")
         .prereq(decodeRunCycles);
     decodeUnblockCycles
-        .name(name() + ".decodeUnblockCycles")
+        .name(name() + ".DECODE:UnblockCycles")
         .desc("Number of cycles decode is unblocking")
         .prereq(decodeUnblockCycles);
     decodeSquashCycles
-        .name(name() + ".decodeSquashCycles")
+        .name(name() + ".DECODE:SquashCycles")
         .desc("Number of cycles decode is squashing")
         .prereq(decodeSquashCycles);
+    decodeBranchResolved
+        .name(name() + ".DECODE:BranchResolved")
+        .desc("Number of times decode resolved a branch")
+        .prereq(decodeBranchResolved);
     decodeBranchMispred
-        .name(name() + ".decodeBranchMispred")
+        .name(name() + ".DECODE:BranchMispred")
         .desc("Number of times decode detected a branch misprediction")
         .prereq(decodeBranchMispred);
     decodeControlMispred
-        .name(name() + ".decodeControlMispred")
+        .name(name() + ".DECODE:ControlMispred")
         .desc("Number of times decode detected an instruction incorrectly"
               " predicted as a control")
         .prereq(decodeControlMispred);
     decodeDecodedInsts
-        .name(name() + ".decodeDecodedInsts")
+        .name(name() + ".DECODE:DecodedInsts")
         .desc("Number of instructions handled by decode")
         .prereq(decodeDecodedInsts);
     decodeSquashedInsts
-        .name(name() + ".decodeSquashedInsts")
+        .name(name() + ".DECODE:SquashedInsts")
         .desc("Number of squashed instructions handled by decode")
         .prereq(decodeSquashedInsts);
 }
@@ -158,6 +162,33 @@ DefaultDecode<Impl>::setActiveThreads(list<unsigned> *at_ptr)
     activeThreads = at_ptr;
 }
 
+template <class Impl>
+void
+DefaultDecode<Impl>::switchOut()
+{
+}
+
+template <class Impl>
+void
+DefaultDecode<Impl>::takeOverFrom()
+{
+    _status = Inactive;
+
+    for (int i = 0; i < numThreads; ++i) {
+        decodeStatus[i] = Idle;
+
+        stalls[i].rename = false;
+        stalls[i].iew = false;
+        stalls[i].commit = false;
+        while (!insts[i].empty())
+            insts[i].pop();
+        while (!skidBuffer[i].empty())
+            skidBuffer[i].pop();
+        branchCount[i] = 0;
+    }
+    wroteToTimeBuffer = false;
+}
+
 template<class Impl>
 bool
 DefaultDecode<Impl>::checkStall(unsigned tid) const
@@ -680,6 +711,7 @@ DefaultDecode<Impl>::decodeInsts(unsigned tid)
 
         // Go ahead and compute any PC-relative branches.
         if (inst->isDirectCtrl() && inst->isUncondCtrl()) {
+            ++decodeBranchResolved;
             inst->setNextPC(inst->branchTarget());
 
             if (inst->mispredicted()) {
diff --git a/cpu/o3/fetch.hh b/cpu/o3/fetch.hh
index f0b15cb86..6074831c6 100644
--- a/cpu/o3/fetch.hh
+++ b/cpu/o3/fetch.hh
@@ -35,6 +35,8 @@
 #include "mem/mem_interface.hh"
 #include "sim/eventq.hh"
 
+class Sampler;
+
 /**
  * DefaultFetch class handles both single threaded and SMT fetch. Its width is
  * specified by the parameters; each cycle it tries to fetch that many
@@ -81,6 +83,7 @@ class DefaultFetch
         Fetching,
         TrapPending,
         QuiescePending,
+        SwitchOut,
         IcacheMissStall,
         IcacheMissComplete
     };
@@ -160,6 +163,12 @@ class DefaultFetch
     /** Processes cache completion event. */
     void processCacheCompletion(MemReqPtr &req);
 
+    void switchOut();
+
+    void takeOverFrom();
+
+    bool isSwitchedOut() { return switchedOut; }
+
     void wakeFromQuiesce();
 
   private:
@@ -360,6 +369,8 @@ class DefaultFetch
 
     bool interruptPending;
 
+    bool switchedOut;
+
 #if !FULL_SYSTEM
     /** Page table pointer. */
 //    PageTable *pTable;
@@ -382,6 +393,8 @@ class DefaultFetch
      */
     Stats::Scalar<> fetchIdleCycles;
     Stats::Scalar<> fetchBlockedCycles;
+
+    Stats::Scalar<> fetchMiscStallCycles;
     /** Stat for total number of fetched cache lines. */
     Stats::Scalar<> fetchedCacheLines;
 
diff --git a/cpu/o3/fetch_impl.hh b/cpu/o3/fetch_impl.hh
index 563a767df..92f923c65 100644
--- a/cpu/o3/fetch_impl.hh
+++ b/cpu/o3/fetch_impl.hh
@@ -169,53 +169,59 @@ void
 DefaultFetch<Impl>::regStats()
 {
     icacheStallCycles
-        .name(name() + ".icacheStallCycles")
+        .name(name() + ".FETCH:icacheStallCycles")
         .desc("Number of cycles fetch is stalled on an Icache miss")
         .prereq(icacheStallCycles);
 
     fetchedInsts
-        .name(name() + ".fetchedInsts")
+        .name(name() + ".FETCH:Insts")
         .desc("Number of instructions fetch has processed")
         .prereq(fetchedInsts);
 
     fetchedBranches
-        .name(name() + ".fetchedBranches")
+        .name(name() + ".FETCH:Branches")
         .desc("Number of branches that fetch encountered")
         .prereq(fetchedBranches);
 
     predictedBranches
-        .name(name() + ".predictedBranches")
+        .name(name() + ".FETCH:predictedBranches")
         .desc("Number of branches that fetch has predicted taken")
         .prereq(predictedBranches);
 
     fetchCycles
-        .name(name() + ".fetchCycles")
+        .name(name() + ".FETCH:Cycles")
         .desc("Number of cycles fetch has run and was not squashing or"
               " blocked")
         .prereq(fetchCycles);
 
     fetchSquashCycles
-        .name(name() + ".fetchSquashCycles")
+        .name(name() + ".FETCH:SquashCycles")
         .desc("Number of cycles fetch has spent squashing")
         .prereq(fetchSquashCycles);
 
     fetchIdleCycles
-        .name(name() + ".fetchIdleCycles")
+        .name(name() + ".FETCH:IdleCycles")
         .desc("Number of cycles fetch was idle")
         .prereq(fetchIdleCycles);
 
     fetchBlockedCycles
-        .name(name() + ".fetchBlockedCycles")
+        .name(name() + ".FETCH:BlockedCycles")
         .desc("Number of cycles fetch has spent blocked")
         .prereq(fetchBlockedCycles);
 
     fetchedCacheLines
-        .name(name() + ".fetchedCacheLines")
+        .name(name() + ".FETCH:CacheLines")
         .desc("Number of cache lines fetched")
         .prereq(fetchedCacheLines);
 
+    fetchMiscStallCycles
+        .name(name() + ".FETCH:MiscStallCycles")
+        .desc("Number of cycles fetch has spent waiting on interrupts, or "
+              "bad addresses, or out of MSHRs")
+        .prereq(fetchMiscStallCycles);
+
     fetchIcacheSquashes
-        .name(name() + ".fetchIcacheSquashes")
+        .name(name() + ".FETCH:IcacheSquashes")
         .desc("Number of outstanding Icache misses that were squashed")
         .prereq(fetchIcacheSquashes);
 
@@ -223,24 +229,24 @@ DefaultFetch<Impl>::regStats()
         .init(/* base value */ 0,
               /* last value */ fetchWidth,
               /* bucket size */ 1)
-        .name(name() + ".rateDist")
+        .name(name() + ".FETCH:rateDist")
         .desc("Number of instructions fetched each cycle (Total)")
         .flags(Stats::pdf);
 
     idleRate
-        .name(name() + ".idleRate")
+        .name(name() + ".FETCH:idleRate")
         .desc("Percent of cycles fetch was idle")
         .prereq(idleRate);
     idleRate = fetchIdleCycles * 100 / cpu->numCycles;
 
     branchRate
-        .name(name() + ".branchRate")
+        .name(name() + ".FETCH:branchRate")
         .desc("Number of branch fetches per cycle")
         .flags(Stats::total);
     branchRate = predictedBranches / cpu->numCycles;
 
     fetchRate
-        .name(name() + ".rate")
+        .name(name() + ".FETCH:rate")
         .desc("Number of inst fetches per cycle")
         .flags(Stats::total);
     fetchRate = fetchedInsts / cpu->numCycles;
@@ -332,7 +338,8 @@ DefaultFetch<Impl>::processCacheCompletion(MemReqPtr &req)
     // Can keep track of how many cache accesses go unused due to
     // misspeculation here.
     if (fetchStatus[tid] != IcacheMissStall ||
-        req != memReq[tid]) {
+        req != memReq[tid] ||
+        isSwitchedOut()) {
         ++fetchIcacheSquashes;
         return;
     }
@@ -360,6 +367,35 @@ DefaultFetch<Impl>::processCacheCompletion(MemReqPtr &req)
 //    memReq[tid]->completionEvent = NULL;
 }
 
+template <class Impl>
+void
+DefaultFetch<Impl>::switchOut()
+{
+    switchedOut = true;
+    branchPred.switchOut();
+}
+
+template <class Impl>
+void
+DefaultFetch<Impl>::takeOverFrom()
+{
+    // Reset all state
+    for (int i = 0; i < Impl::MaxThreads; ++i) {
+        stalls[i].decode = 0;
+        stalls[i].rename = 0;
+        stalls[i].iew = 0;
+        stalls[i].commit = 0;
+        PC[i] = cpu->readPC(i);
+        nextPC[i] = cpu->readNextPC(i);
+        fetchStatus[i] = Running;
+    }
+    numInst = 0;
+    wroteToTimeBuffer = false;
+    _status = Inactive;
+    switchedOut = false;
+    branchPred.takeOverFrom();
+}
+
 template <class Impl>
 void
 DefaultFetch<Impl>::wakeFromQuiesce()
@@ -902,8 +938,10 @@ DefaultFetch<Impl>::fetch(bool &status_change)
                 tid, fetch_PC);
 
         bool fetch_success = fetchCacheLine(fetch_PC, fault, tid);
-        if (!fetch_success)
+        if (!fetch_success) {
+            ++fetchMiscStallCycles;
             return;
+        }
     } else {
         if (fetchStatus[tid] == Idle) {
             ++fetchIdleCycles;
diff --git a/cpu/o3/fu_pool.cc b/cpu/o3/fu_pool.cc
index 9b6ac15d9..cb7a15061 100644
--- a/cpu/o3/fu_pool.cc
+++ b/cpu/o3/fu_pool.cc
@@ -242,6 +242,20 @@ FUPool::dump()
     }
 }
 
+void
+FUPool::switchOut()
+{
+}
+
+void
+FUPool::takeOverFrom()
+{
+    for (int i = 0; i < numFU; i++) {
+        unitBusy[i] = false;
+    }
+    unitsToBeFreed.clear();
+}
+
 //
 
 ////////////////////////////////////////////////////////////////////////////
diff --git a/cpu/o3/fu_pool.hh b/cpu/o3/fu_pool.hh
index d7b7acadb..7df5ad5f3 100644
--- a/cpu/o3/fu_pool.hh
+++ b/cpu/o3/fu_pool.hh
@@ -154,6 +154,9 @@ class FUPool : public SimObject
     unsigned getIssueLatency(OpClass capability) {
         return maxIssueLatencies[capability];
     }
+
+    void switchOut();
+    void takeOverFrom();
 };
 
 #endif // __CPU_O3_FU_POOL_HH__
diff --git a/cpu/o3/iew.hh b/cpu/o3/iew.hh
index 58cd68b21..ae0ba6a21 100644
--- a/cpu/o3/iew.hh
+++ b/cpu/o3/iew.hh
@@ -157,6 +157,12 @@ class DefaultIEW
     /** Sets pointer to the scoreboard. */
     void setScoreboard(Scoreboard *sb_ptr);
 
+    void switchOut();
+
+    void takeOverFrom();
+
+    bool isSwitchedOut() { return switchedOut; }
+
     /** Sets page table pointer within LSQ. */
 //    void setPageTable(PageTable *pt_ptr);
 
@@ -420,6 +426,8 @@ class DefaultIEW
     /** Maximum size of the skid buffer. */
     unsigned skidBufferMax;
 
+    bool switchedOut;
+
     /** Stat for total number of idle cycles. */
     Stats::Scalar<> iewIdleCycles;
     /** Stat for total number of squashing cycles. */
diff --git a/cpu/o3/iew_impl.hh b/cpu/o3/iew_impl.hh
index 2ae2e1361..42d83ee72 100644
--- a/cpu/o3/iew_impl.hh
+++ b/cpu/o3/iew_impl.hh
@@ -55,13 +55,13 @@ DefaultIEW<Impl>::LdWritebackEvent::process()
 
     //iewStage->ldstQueue.removeMSHR(inst->threadNumber,inst->seqNum);
 
-    iewStage->wakeCPU();
-
-    if (inst->isSquashed()) {
+    if (inst->isSquashed() || iewStage->isSwitchedOut()) {
         inst = NULL;
         return;
     }
 
+    iewStage->wakeCPU();
+
     if (!inst->isExecuted()) {
         inst->setExecuted();
 
@@ -101,7 +101,8 @@ DefaultIEW<Impl>::DefaultIEW(Params *params)
       issueReadWidth(params->issueWidth),
       issueWidth(params->issueWidth),
       executeWidth(params->executeWidth),
-      numThreads(params->numberOfThreads)
+      numThreads(params->numberOfThreads),
+      switchedOut(false)
 {
     DPRINTF(IEW, "executeIntWidth: %i.\n", params->executeIntWidth);
     _status = Active;
@@ -436,6 +437,53 @@ DefaultIEW<Impl>::setPageTable(PageTable *pt_ptr)
 }
 #endif
 
+template <class Impl>
+void
+DefaultIEW<Impl>::switchOut()
+{
+    switchedOut = true;
+    instQueue.switchOut();
+    ldstQueue.switchOut();
+    fuPool->switchOut();
+
+    for (int i = 0; i < numThreads; i++) {
+        while (!insts[i].empty())
+            insts[i].pop();
+        while (!skidBuffer[i].empty())
+            skidBuffer[i].pop();
+    }
+}
+
+template <class Impl>
+void
+DefaultIEW<Impl>::takeOverFrom()
+{
+    _status = Active;
+    exeStatus = Running;
+    wbStatus = Idle;
+    switchedOut = false;
+
+    instQueue.takeOverFrom();
+    ldstQueue.takeOverFrom();
+    fuPool->takeOverFrom();
+
+    initStage();
+    cpu->activityThisCycle();
+
+    for (int i=0; i < numThreads; i++) {
+        dispatchStatus[i] = Running;
+        stalls[i].commit = false;
+        fetchRedirect[i] = false;
+    }
+
+    updateLSQNextCycle = false;
+
+    // @todo: Fix hardcoded number
+    for (int i = 0; i < 6; ++i) {
+        issueToExecQueue.advance();
+    }
+}
+
 template<class Impl>
 void
 DefaultIEW<Impl>::squash(unsigned tid)
diff --git a/cpu/o3/inst_queue.hh b/cpu/o3/inst_queue.hh
index 06d9937f2..982294b4f 100644
--- a/cpu/o3/inst_queue.hh
+++ b/cpu/o3/inst_queue.hh
@@ -112,6 +112,10 @@ class InstructionQueue
     /** Registers statistics. */
     void regStats();
 
+    void resetState();
+
+    void resetDependencyGraph();
+
     /** Sets CPU pointer. */
     void setCPU(FullCPU *_cpu) { cpu = _cpu; }
 
@@ -127,6 +131,12 @@ class InstructionQueue
     /** Sets the global time buffer. */
     void setTimeBuffer(TimeBuffer<TimeStruct> *tb_ptr);
 
+    void switchOut();
+
+    void takeOverFrom();
+
+    bool isSwitchedOut() { return switchedOut; }
+
     /** Number of entries needed for given amount of threads. */
     int entryAmount(int num_threads);
 
@@ -385,6 +395,8 @@ class InstructionQueue
      */
     unsigned commitToIEWDelay;
 
+    bool switchedOut;
+
     //////////////////////////////////
     // Variables needed for squashing
     //////////////////////////////////
@@ -507,7 +519,7 @@ class InstructionQueue
     Stats::Scalar<> iqSquashedNonSpecRemoved;
 
     Stats::VectorDistribution<> queue_res_dist;
-    Stats::Vector<> n_issued_dist;
+    Stats::Distribution<> n_issued_dist;
     Stats::VectorDistribution<> issue_delay_dist;
 
     Stats::Vector<> stat_fu_busy;
diff --git a/cpu/o3/inst_queue_impl.hh b/cpu/o3/inst_queue_impl.hh
index 804bc2472..0d9cc09f3 100644
--- a/cpu/o3/inst_queue_impl.hh
+++ b/cpu/o3/inst_queue_impl.hh
@@ -82,15 +82,9 @@ InstructionQueue<Impl>::InstructionQueue(Params *params)
 {
     assert(fuPool);
 
-    numThreads = params->numberOfThreads;
+    switchedOut = false;
 
-    //Initialize thread IQ counts
-    for (int i = 0; i <numThreads; i++) {
-        count[i] = 0;
-    }
-
-    // Initialize the number of free IQ entries.
-    freeEntries = numEntries;
+    numThreads = params->numberOfThreads;
 
     // Set the number of physical registers as the number of int + float
     numPhysRegs = numPhysIntRegs + numPhysFloatRegs;
@@ -101,37 +95,24 @@ InstructionQueue<Impl>::InstructionQueue(Params *params)
     //dependency graph.
     dependGraph = new DependencyEntry[numPhysRegs];
 
-    // Resize the register scoreboard.
-    regScoreboard.resize(numPhysRegs);
-
-    //Initialize Mem Dependence Units
-    for (int i = 0; i < numThreads; i++) {
-        memDepUnit[i].init(params,i);
-        memDepUnit[i].setIQ(this);
-    }
-
     // Initialize all the head pointers to point to NULL, and all the
     // entries as unready.
-    // Note that in actuality, the registers corresponding to the logical
-    // registers start off as ready.  However this doesn't matter for the
-    // IQ as the instruction should have been correctly told if those
-    // registers are ready in rename.  Thus it can all be initialized as
-    // unready.
     for (int i = 0; i < numPhysRegs; ++i) {
         dependGraph[i].next = NULL;
         dependGraph[i].inst = NULL;
-        regScoreboard[i] = false;
     }
 
-    for (int i = 0; i < numThreads; ++i) {
-        squashedSeqNum[i] = 0;
-    }
+    // Resize the register scoreboard.
+    regScoreboard.resize(numPhysRegs);
 
-    for (int i = 0; i < Num_OpClasses; ++i) {
-        queueOnList[i] = false;
-        readyIt[i] = listOrder.end();
+    //Initialize Mem Dependence Units
+    for (int i = 0; i < numThreads; i++) {
+        memDepUnit[i].init(params,i);
+        memDepUnit[i].setIQ(this);
     }
 
+    resetState();
+
     string policy = params->smtIQPolicy;
 
     //Convert string to lowercase
@@ -184,30 +165,7 @@ InstructionQueue<Impl>::InstructionQueue(Params *params)
 template <class Impl>
 InstructionQueue<Impl>::~InstructionQueue()
 {
-    // Clear the dependency graph
-    DependencyEntry *curr;
-    DependencyEntry *prev;
-
-    for (int i = 0; i < numPhysRegs; ++i) {
-        curr = dependGraph[i].next;
-
-        while (curr) {
-            DependencyEntry::mem_alloc_counter--;
-
-            prev = curr;
-            curr = prev->next;
-            prev->inst = NULL;
-
-            delete prev;
-        }
-
-        if (dependGraph[i].inst) {
-            dependGraph[i].inst = NULL;
-        }
-
-        dependGraph[i].next = NULL;
-    }
-
+    resetDependencyGraph();
     assert(DependencyEntry::mem_alloc_counter == 0);
 
     delete [] dependGraph;
@@ -307,10 +265,10 @@ InstructionQueue<Impl>::regStats()
         queue_res_dist.subname(i, opClassStrings[i]);
     }
     n_issued_dist
-        .init(totalWidth + 1)
+        .init(0,totalWidth,1)
         .name(name() + ".ISSUE:issued_per_cycle")
         .desc("Number of insts issued each cycle")
-        .flags(total | pdf | dist)
+        .flags(pdf)
         ;
 /*
     dist_unissued
@@ -400,6 +358,71 @@ InstructionQueue<Impl>::regStats()
     }
 }
 
+template <class Impl>
+void
+InstructionQueue<Impl>::resetState()
+{
+    //Initialize thread IQ counts
+    for (int i = 0; i <numThreads; i++) {
+        count[i] = 0;
+        instList[i].clear();
+    }
+
+    // Initialize the number of free IQ entries.
+    freeEntries = numEntries;
+
+    // Note that in actuality, the registers corresponding to the logical
+    // registers start off as ready.  However this doesn't matter for the
+    // IQ as the instruction should have been correctly told if those
+    // registers are ready in rename.  Thus it can all be initialized as
+    // unready.
+    for (int i = 0; i < numPhysRegs; ++i) {
+        regScoreboard[i] = false;
+    }
+
+    for (int i = 0; i < numThreads; ++i) {
+        squashedSeqNum[i] = 0;
+    }
+
+    for (int i = 0; i < Num_OpClasses; ++i) {
+        while (!readyInsts[i].empty())
+            readyInsts[i].pop();
+        queueOnList[i] = false;
+        readyIt[i] = listOrder.end();
+    }
+    nonSpecInsts.clear();
+    listOrder.clear();
+}
+
+template <class Impl>
+void
+InstructionQueue<Impl>::resetDependencyGraph()
+{
+    // Clear the dependency graph
+    DependencyEntry *curr;
+    DependencyEntry *prev;
+
+    for (int i = 0; i < numPhysRegs; ++i) {
+        curr = dependGraph[i].next;
+
+        while (curr) {
+            DependencyEntry::mem_alloc_counter--;
+
+            prev = curr;
+            curr = prev->next;
+            prev->inst = NULL;
+
+            delete prev;
+        }
+
+        if (dependGraph[i].inst) {
+            dependGraph[i].inst = NULL;
+        }
+
+        dependGraph[i].next = NULL;
+    }
+}
+
 template <class Impl>
 void
 InstructionQueue<Impl>::setActiveThreads(list<unsigned> *at_ptr)
@@ -426,6 +449,25 @@ InstructionQueue<Impl>::setTimeBuffer(TimeBuffer<TimeStruct> *tb_ptr)
     fromCommit = timeBuffer->getWire(-commitToIEWDelay);
 }
 
+template <class Impl>
+void
+InstructionQueue<Impl>::switchOut()
+{
+    resetState();
+    resetDependencyGraph();
+    switchedOut = true;
+    for (int i = 0; i < numThreads; ++i) {
+        memDepUnit[i].switchOut();
+    }
+}
+
+template <class Impl>
+void
+InstructionQueue<Impl>::takeOverFrom()
+{
+    switchedOut = false;
+}
+
 template <class Impl>
 int
 InstructionQueue<Impl>::entryAmount(int num_threads)
@@ -685,6 +727,10 @@ InstructionQueue<Impl>::processFUCompletion(DynInstPtr &inst, int fu_idx)
 {
     // The CPU could have been sleeping until this op completed (*extremely*
     // long latency op).  Wake it if it was.  This may be overkill.
+    if (isSwitchedOut()) {
+        return;
+    }
+
     iewStage->wakeCPU();
 
     fuPool->freeUnit(fu_idx);
@@ -816,7 +862,7 @@ InstructionQueue<Impl>::scheduleReadyInsts()
                     FUCompletion *execution = new FUCompletion(issuing_inst,
                                                                idx, this);
 
-                    execution->schedule(curTick + issue_latency - 1);
+                    execution->schedule(curTick + cpu->cycles(issue_latency - 1));
                 } else {
                     i2e_info->insts[exec_queue_slot++] = issuing_inst;
                     i2e_info->size++;
@@ -862,6 +908,8 @@ InstructionQueue<Impl>::scheduleReadyInsts()
         }
     }
 
+    n_issued_dist.sample(total_issued);
+
     if (total_issued) {
         cpu->activityThisCycle();
     } else {
diff --git a/cpu/o3/lsq.hh b/cpu/o3/lsq.hh
index c59b5f13b..d5f893e57 100644
--- a/cpu/o3/lsq.hh
+++ b/cpu/o3/lsq.hh
@@ -71,6 +71,9 @@ class LSQ {
     /** Sets the page table pointer. */
 //    void setPageTable(PageTable *pt_ptr);
 
+    void switchOut();
+    void takeOverFrom();
+
     /** Number of entries needed for the given amount of threads.*/
     int entryAmount(int num_threads);
     void removeEntries(unsigned tid);
@@ -271,15 +274,6 @@ class LSQ {
     /** Max SQ Size - Used to Enforce Sharing Policies. */
     unsigned maxSQEntries;
 
-    /** Global Load Count. */
-    int loads;
-
-    /** Global Store Count */
-    int stores;
-
-    /** Global Store To WB Count */
-    int storesToWB;
-
     /** Number of Threads. */
     unsigned numThreads;
 };
diff --git a/cpu/o3/lsq_impl.hh b/cpu/o3/lsq_impl.hh
index 523517869..c43c19619 100644
--- a/cpu/o3/lsq_impl.hh
+++ b/cpu/o3/lsq_impl.hh
@@ -33,7 +33,6 @@ using namespace std;
 template <class Impl>
 LSQ<Impl>::LSQ(Params *params)
     : LQEntries(params->LQEntries), SQEntries(params->SQEntries),
-      loads(0), stores(0), storesToWB(0),
       numThreads(params->numberOfThreads)
 {
     DPRINTF(LSQ, "Creating LSQ object.\n");
@@ -143,6 +142,24 @@ LSQ<Impl>::setPageTable(PageTable *pt_ptr)
 }
 #endif
 
+template <class Impl>
+void
+LSQ<Impl>::switchOut()
+{
+    for (int tid = 0; tid < numThreads; tid++) {
+        thread[tid].switchOut();
+    }
+}
+
+template <class Impl>
+void
+LSQ<Impl>::takeOverFrom()
+{
+    for (int tid = 0; tid < numThreads; tid++) {
+        thread[tid].takeOverFrom();
+    }
+}
+
 template <class Impl>
 int
 LSQ<Impl>::entryAmount(int num_threads)
diff --git a/cpu/o3/lsq_unit.hh b/cpu/o3/lsq_unit.hh
index ba8b1d2e2..d17efe96a 100644
--- a/cpu/o3/lsq_unit.hh
+++ b/cpu/o3/lsq_unit.hh
@@ -38,6 +38,7 @@
 #include "cpu/inst_seq.hh"
 #include "mem/mem_interface.hh"
 //#include "mem/page_table.hh"
+#include "sim/debug.hh"
 #include "sim/sim_object.hh"
 #include "arch/faults.hh"
 
@@ -110,6 +111,12 @@ class LSQUnit {
     /** Sets the page table pointer. */
 //    void setPageTable(PageTable *pt_ptr);
 
+    void switchOut();
+
+    void takeOverFrom();
+
+    bool isSwitchedOut() { return switchedOut; }
+
     /** Ticks the LSQ unit, which in this case only resets the number of
      * used cache ports.
      * @todo: Move the number of used ports up to the LSQ level so it can
@@ -278,20 +285,20 @@ class LSQUnit {
         /** Whether or not the store is completed. */
         bool completed;
     };
-
+/*
     enum Status {
         Running,
         Idle,
         DcacheMissStall,
         DcacheMissSwitch
     };
-
+*/
   private:
     /** The LSQUnit thread id. */
     unsigned lsqID;
 
     /** The status of the LSQ unit. */
-    Status _status;
+//    Status _status;
 
     /** The store queue. */
     std::vector<SQEntry> storeQueue;
@@ -335,6 +342,8 @@ class LSQUnit {
     /** The number of used cache ports in this cycle. */
     int usedPorts;
 
+    bool switchedOut;
+
     //list<InstSeqNum> mshrSeqNums;
 
      //Stats::Scalar<> dcacheStallCycles;
@@ -373,7 +382,25 @@ class LSQUnit {
     // Will also need how many read/write ports the Dcache has.  Or keep track
     // of that in stage that is one level up, and only call executeLoad/Store
     // the appropriate number of times.
+/*
+    // total number of loads forwarded from LSQ stores
+    Stats::Vector<> lsq_forw_loads;
+
+    // total number of loads ignored due to invalid addresses
+    Stats::Vector<> inv_addr_loads;
+
+    // total number of software prefetches ignored due to invalid addresses
+    Stats::Vector<> inv_addr_swpfs;
+
+    // total non-speculative bogus addresses seen (debug var)
+    Counter sim_invalid_addrs;
+    Stats::Vector<> fu_busy;  //cumulative fu busy
 
+    // ready loads blocked due to memory disambiguation
+    Stats::Vector<> lsq_blocked_loads;
+
+    Stats::Scalar<> lsqInversion;
+*/
   public:
     /** Executes the load at the given index. */
     template <class T>
@@ -590,7 +617,12 @@ LSQUnit<Impl>::read(MemReqPtr &req, T &data, int load_idx)
         }
         DPRINTF(LSQUnit, "Doing timing access for inst PC %#x\n",
                 loadQueue[load_idx]->readPC());
-
+/*
+        Addr debug_addr = ULL(0xfffffc0000be81a8);
+        if (req->vaddr == debug_addr) {
+            debug_break();
+        }
+*/
         assert(!req->completionEvent);
         req->completionEvent =
             new typename IEW::LdWritebackEvent(loadQueue[load_idx], iewStage);
@@ -608,7 +640,7 @@ LSQUnit<Impl>::read(MemReqPtr &req, T &data, int load_idx)
 
             lastDcacheStall = curTick;
 
-            _status = DcacheMissStall;
+//            _status = DcacheMissStall;
 
         } else {
             DPRINTF(Activity, "Activity: ld accessing mem hit [sn:%lli]\n",
@@ -694,7 +726,12 @@ LSQUnit<Impl>::write(MemReqPtr &req, T &data, int store_idx)
     storeQueue[store_idx].req = req;
     storeQueue[store_idx].size = sizeof(T);
     storeQueue[store_idx].data = data;
-
+/*
+    Addr debug_addr = ULL(0xfffffc0000be81a8);
+    if (req->vaddr == debug_addr) {
+        debug_break();
+    }
+*/
     // This function only writes the data to the store queue, so no fault
     // can happen here.
     return NoFault;
diff --git a/cpu/o3/lsq_unit_impl.hh b/cpu/o3/lsq_unit_impl.hh
index d9a118b0e..c5ce34c70 100644
--- a/cpu/o3/lsq_unit_impl.hh
+++ b/cpu/o3/lsq_unit_impl.hh
@@ -50,6 +50,9 @@ LSQUnit<Impl>::StoreCompletionEvent::process()
 
     //lsqPtr->removeMSHR(lsqPtr->storeQueue[storeIdx].inst->seqNum);
 
+    if (lsqPtr->isSwitchedOut())
+        return;
+
     lsqPtr->cpu->wakeCPU();
     if (wbEvent)
         wbEvent->process();
@@ -78,6 +81,8 @@ LSQUnit<Impl>::init(Params *params, unsigned maxLQEntries,
 {
     DPRINTF(LSQUnit, "Creating LSQUnit%i object.\n",id);
 
+    switchedOut = false;
+
     lsqID = id;
 
     LQEntries = maxLQEntries;
@@ -138,6 +143,89 @@ LSQUnit<Impl>::setPageTable(PageTable *pt_ptr)
 }
 #endif
 
+template<class Impl>
+void
+LSQUnit<Impl>::switchOut()
+{
+    switchedOut = true;
+    for (int i = 0; i < loadQueue.size(); ++i)
+        loadQueue[i] = NULL;
+
+    while (storesToWB > 0 &&
+           storeWBIdx != storeTail &&
+           storeQueue[storeWBIdx].inst &&
+           storeQueue[storeWBIdx].canWB) {
+
+        if (storeQueue[storeWBIdx].size == 0 ||
+            storeQueue[storeWBIdx].inst->isDataPrefetch() ||
+            storeQueue[storeWBIdx].committed ||
+            storeQueue[storeWBIdx].req->flags & LOCKED) {
+            incrStIdx(storeWBIdx);
+
+            continue;
+        }
+
+        assert(storeQueue[storeWBIdx].req);
+        assert(!storeQueue[storeWBIdx].committed);
+
+        MemReqPtr req = storeQueue[storeWBIdx].req;
+        storeQueue[storeWBIdx].committed = true;
+
+        req->cmd = Write;
+        req->completionEvent = NULL;
+        req->time = curTick;
+        assert(!req->data);
+        req->data = new uint8_t[64];
+        memcpy(req->data, (uint8_t *)&storeQueue[storeWBIdx].data, req->size);
+
+        DPRINTF(LSQUnit, "D-Cache: Writing back store idx:%i PC:%#x "
+                "to Addr:%#x, data:%#x [sn:%lli]\n",
+                storeWBIdx,storeQueue[storeWBIdx].inst->readPC(),
+                req->paddr, *(req->data),
+                storeQueue[storeWBIdx].inst->seqNum);
+
+        switch(storeQueue[storeWBIdx].size) {
+          case 1:
+            cpu->write(req, (uint8_t &)storeQueue[storeWBIdx].data);
+            break;
+          case 2:
+            cpu->write(req, (uint16_t &)storeQueue[storeWBIdx].data);
+            break;
+          case 4:
+            cpu->write(req, (uint32_t &)storeQueue[storeWBIdx].data);
+            break;
+          case 8:
+            cpu->write(req, (uint64_t &)storeQueue[storeWBIdx].data);
+            break;
+          default:
+            panic("Unexpected store size!\n");
+        }
+        incrStIdx(storeWBIdx);
+    }
+}
+
+template<class Impl>
+void
+LSQUnit<Impl>::takeOverFrom()
+{
+    switchedOut = false;
+    loads = stores = storesToWB = 0;
+
+    loadHead = loadTail = 0;
+
+    storeHead = storeWBIdx = storeTail = 0;
+
+    usedPorts = 0;
+
+    loadFaultInst = storeFaultInst = memDepViolator = NULL;
+
+    blockedLoadSeqNum = 0;
+
+    stalled = false;
+    isLoadBlocked = false;
+    loadBlockedHandled = false;
+}
+
 template<class Impl>
 void
 LSQUnit<Impl>::resizeLQ(unsigned size)
@@ -647,7 +735,7 @@ LSQUnit<Impl>::writebackStores()
 
                 lastDcacheStall = curTick;
 
-                _status = DcacheMissStall;
+//                _status = DcacheMissStall;
 
                 //mshrSeqNums.push_back(storeQueue[storeWBIdx].inst->seqNum);
 
diff --git a/cpu/o3/mem_dep_unit.hh b/cpu/o3/mem_dep_unit.hh
index 32ce9f768..141e0fdc4 100644
--- a/cpu/o3/mem_dep_unit.hh
+++ b/cpu/o3/mem_dep_unit.hh
@@ -84,6 +84,10 @@ class MemDepUnit {
     /** Registers statistics. */
     void regStats();
 
+    void switchOut();
+
+    void takeOverFrom();
+
     /** Sets the pointer to the IQ. */
     void setIQ(InstructionQueue<Impl> *iq_ptr);
 
diff --git a/cpu/o3/mem_dep_unit_impl.hh b/cpu/o3/mem_dep_unit_impl.hh
index 771a0505e..05a33685d 100644
--- a/cpu/o3/mem_dep_unit_impl.hh
+++ b/cpu/o3/mem_dep_unit_impl.hh
@@ -101,6 +101,26 @@ MemDepUnit<MemDepPred, Impl>::regStats()
         .desc("Number of conflicting stores.");
 }
 
+template <class MemDepPred, class Impl>
+void
+MemDepUnit<MemDepPred, Impl>::switchOut()
+{
+    for (int i = 0; i < Impl::MaxThreads; ++i) {
+        instList[i].clear();
+    }
+    instsToReplay.clear();
+    memDepHash.clear();
+}
+
+template <class MemDepPred, class Impl>
+void
+MemDepUnit<MemDepPred, Impl>::takeOverFrom()
+{
+    loadBarrier = storeBarrier = false;
+    loadBarrierSN = storeBarrierSN = 0;
+    depPred.clear();
+}
+
 template <class MemDepPred, class Impl>
 void
 MemDepUnit<MemDepPred, Impl>::setIQ(InstructionQueue<Impl> *iq_ptr)
diff --git a/cpu/o3/ras.cc b/cpu/o3/ras.cc
index 5e7ef38ae..0b3ea4918 100644
--- a/cpu/o3/ras.cc
+++ b/cpu/o3/ras.cc
@@ -41,6 +41,15 @@ ReturnAddrStack::init(unsigned _numEntries)
          addrStack[i] = 0;
 }
 
+void
+ReturnAddrStack::reset()
+{
+    usedEntries = 0;
+    tos = 0;
+    for (int i = 0; i < numEntries; ++i)
+        addrStack[i] = 0;
+}
+
 void
 ReturnAddrStack::push(const Addr &return_addr)
 {
diff --git a/cpu/o3/ras.hh b/cpu/o3/ras.hh
index 5aa4fc05f..27e7c2df4 100644
--- a/cpu/o3/ras.hh
+++ b/cpu/o3/ras.hh
@@ -47,6 +47,8 @@ class ReturnAddrStack
      */
     void init(unsigned numEntries);
 
+    void reset();
+
     /** Returns the top address on the RAS. */
     Addr top()
     { return addrStack[tos]; }
diff --git a/cpu/o3/rename.hh b/cpu/o3/rename.hh
index c6f8f97aa..4c5c46356 100644
--- a/cpu/o3/rename.hh
+++ b/cpu/o3/rename.hh
@@ -153,6 +153,10 @@ class DefaultRename
     /** Sets pointer to the scoreboard. */
     void setScoreboard(Scoreboard *_scoreboard);
 
+    void switchOut();
+
+    void takeOverFrom();
+
     /** Squashes all instructions in a thread. */
     void squash(unsigned tid);
 
@@ -448,6 +452,7 @@ class DefaultRename
     Stats::Scalar<> renameUndoneMaps;
     Stats::Scalar<> renamedSerializing;
     Stats::Scalar<> renamedTempSerializing;
+    Stats::Scalar<> renameSkidInsts;
 };
 
 #endif // __CPU_O3_RENAME_HH__
diff --git a/cpu/o3/rename_impl.hh b/cpu/o3/rename_impl.hh
index e29211921..d41058deb 100644
--- a/cpu/o3/rename_impl.hh
+++ b/cpu/o3/rename_impl.hh
@@ -151,6 +151,11 @@ DefaultRename<Impl>::regStats()
         .desc("count of temporary serializing insts renamed")
         .flags(Stats::total)
         ;
+    renameSkidInsts
+        .name(name() + ".RENAME:skidInsts")
+        .desc("count of insts added to the skid buffer")
+        .flags(Stats::total)
+        ;
 }
 
 template <class Impl>
@@ -213,8 +218,8 @@ DefaultRename<Impl>::initStage()
 
     // Clear these pointers so they are not accidentally used in
     // non-initialization code.
-    iew_ptr = NULL;
-    commit_ptr = NULL;
+//    iew_ptr = NULL;
+//    commit_ptr = NULL;
 }
 
 template<class Impl>
@@ -253,6 +258,55 @@ DefaultRename<Impl>::setScoreboard(Scoreboard *_scoreboard)
     scoreboard = _scoreboard;
 }
 
+template <class Impl>
+void
+DefaultRename<Impl>::switchOut()
+{
+    for (int i = 0; i < numThreads; i++) {
+        typename list<RenameHistory>::iterator hb_it = historyBuffer[i].begin();
+
+        while (!historyBuffer[i].empty()) {
+            assert(hb_it != historyBuffer[i].end());
+
+            DPRINTF(Rename, "[tid:%u]: Removing history entry with sequence "
+                    "number %i.\n", i, (*hb_it).instSeqNum);
+
+            // Tell the rename map to set the architected register to the
+            // previous physical register that it was renamed to.
+            renameMap[i]->setEntry(hb_it->archReg, hb_it->prevPhysReg);
+
+            // Put the renamed physical register back on the free list.
+            freeList->addReg(hb_it->newPhysReg);
+
+            historyBuffer[i].erase(hb_it++);
+        }
+        insts[i].clear();
+        skidBuffer[i].clear();
+    }
+}
+
+template <class Impl>
+void
+DefaultRename<Impl>::takeOverFrom()
+{
+    _status = Inactive;
+    initStage();
+
+    for (int i=0; i< numThreads; i++) {
+        renameStatus[i] = Idle;
+
+        stalls[i].iew = false;
+        stalls[i].commit = false;
+        serializeInst[i] = NULL;
+
+        instsInProgress[i] = 0;
+
+        emptyROB[i] = true;
+
+        serializeOnNextInst[i] = false;
+    }
+}
+
 template <class Impl>
 void
 DefaultRename<Impl>::squash(unsigned tid)
@@ -393,7 +447,7 @@ DefaultRename<Impl>::rename(bool &status_change, unsigned tid)
     } else if (renameStatus[tid] == Unblocking) {
         renameInsts(tid);
 
-        ++renameUnblockCycles;
+//        ++renameUnblockCycles;
 
         if (validInsts()) {
             // Add the current inputs to the skid buffer so they can be
@@ -564,6 +618,8 @@ DefaultRename<Impl>::renameInsts(unsigned tid)
         } else if (inst->isSerializeAfter() && !inst->isSerializeHandled()) {
             DPRINTF(Rename, "Serialize after instruction encountered.\n");
 
+            renamedSerializing++;
+
             inst->setSerializeHandled();
 
             serializeAfter(insts_to_rename, tid);
@@ -594,13 +650,12 @@ DefaultRename<Impl>::renameInsts(unsigned tid)
         // Increment which instruction we're on.
         ++toIEWIndex;
 
-        ++renameRenamedInsts;
-
         // Decrement how many instructions are available.
         --insts_available;
     }
 
     instsInProgress[tid] += renamed_insts;
+    renameRenamedInsts += renamed_insts;
 
     // If we wrote to the time buffer, record this.
     if (toIEWIndex) {
@@ -635,6 +690,8 @@ DefaultRename<Impl>::skidInsert(unsigned tid)
         DPRINTF(Rename, "[tid:%u]: Inserting [sn:%lli] PC:%#x into Rename "
                 "skidBuffer\n", tid, inst->seqNum, inst->readPC());
 
+        ++renameSkidInsts;
+
         skidBuffer[tid].push_back(inst);
     }
 
diff --git a/cpu/o3/rob.hh b/cpu/o3/rob.hh
index 48199915f..0748850ea 100644
--- a/cpu/o3/rob.hh
+++ b/cpu/o3/rob.hh
@@ -97,6 +97,10 @@ class ROB
      */
     void setActiveThreads(std::list<unsigned>* at_ptr);
 
+    void switchOut();
+
+    void takeOverFrom();
+
     /** Function to insert an instruction into the ROB. Note that whatever
      *  calls this function must ensure that there is enough space within the
      *  ROB for the new instruction.
diff --git a/cpu/o3/rob_impl.hh b/cpu/o3/rob_impl.hh
index 96d907cda..02a4bfbee 100644
--- a/cpu/o3/rob_impl.hh
+++ b/cpu/o3/rob_impl.hh
@@ -121,6 +121,31 @@ ROB<Impl>::setActiveThreads(list<unsigned> *at_ptr)
     activeThreads = at_ptr;
 }
 
+template <class Impl>
+void
+ROB<Impl>::switchOut()
+{
+    for (int tid = 0; tid < numThreads; tid++) {
+        instList[tid].clear();
+    }
+}
+
+template <class Impl>
+void
+ROB<Impl>::takeOverFrom()
+{
+    for (int tid=0; tid  < numThreads; tid++) {
+        doneSquashing[tid] = true;
+        threadEntries[tid] = 0;
+        squashIt[tid] = instList[tid].end();
+    }
+    numInstsInROB = 0;
+
+    // Initialize the "universal" ROB head & tail to point to invalid
+    // entries (the end() of thread 0's instruction list).
+    head = instList[0].end();
+    tail = instList[0].end();
+}
 
 template <class Impl>
 void
diff --git a/cpu/o3/sat_counter.cc b/cpu/o3/sat_counter.cc
index a6e131483..b481b4ad2 100644
--- a/cpu/o3/sat_counter.cc
+++ b/cpu/o3/sat_counter.cc
@@ -30,17 +30,17 @@
 #include "cpu/o3/sat_counter.hh"
 
 SatCounter::SatCounter()
-    : maxVal(0), counter(0)
+    : initialVal(0), maxVal(0), counter(0)  // maxVal is 0 until setBits()
 {
 }
 
 SatCounter::SatCounter(unsigned bits)
-    : maxVal((1 << bits) - 1), counter(0)
+    : initialVal(0), maxVal((1 << bits) - 1), counter(0)
 {
 }
 
-SatCounter::SatCounter(unsigned bits, unsigned initial_val)
-    : maxVal((1 << bits) - 1), counter(initial_val)
+SatCounter::SatCounter(unsigned bits, uint8_t initial_val)
+    : initialVal(initial_val), maxVal((1 << bits) - 1), counter(initial_val)
 {
     // Check to make sure initial value doesn't exceed the max counter value.
     if (initial_val > maxVal) {
@@ -53,19 +53,3 @@ SatCounter::setBits(unsigned bits)
 {
     maxVal = (1 << bits) - 1;
 }
-
-void
-SatCounter::increment()
-{
-    if (counter < maxVal) {
-        ++counter;
-    }
-}
-
-void
-SatCounter::decrement()
-{
-    if (counter > 0) {
-        --counter;
-    }
-}
diff --git a/cpu/o3/sat_counter.hh b/cpu/o3/sat_counter.hh
index 952f1f86d..1d20a8a8f 100644
--- a/cpu/o3/sat_counter.hh
+++ b/cpu/o3/sat_counter.hh
@@ -57,22 +57,34 @@ class SatCounter
      * @param bits How many bits the counter will have.
      * @param initial_val Starting value for each counter.
      */
-    SatCounter(unsigned bits, unsigned initial_val);
+    SatCounter(unsigned bits, uint8_t initial_val);
 
     /**
      * Sets the number of bits.
      */
     void setBits(unsigned bits);
 
+    void reset() { counter = initialVal; }
+
     /**
      * Increments the counter's current value.
      */
-    void increment();
+    void increment()
+    {
+        if (counter < maxVal) {
+            ++counter;
+        }
+    }
 
     /**
      * Decrements the counter's current value.
      */
-    void decrement();
+    void decrement()
+    {
+        if (counter > 0) {
+            --counter;
+        }
+    }
 
     /**
      * Read the counter's value.
@@ -81,6 +93,7 @@ class SatCounter
     { return counter; }
 
   private:
+    uint8_t initialVal;
     uint8_t maxVal;
     uint8_t counter;
 };
diff --git a/cpu/o3/thread_state.hh b/cpu/o3/thread_state.hh
index 846f44176..17719bdeb 100644
--- a/cpu/o3/thread_state.hh
+++ b/cpu/o3/thread_state.hh
@@ -60,7 +60,7 @@ struct O3ThreadState : public ThreadState {
     { }
 #else
     O3ThreadState(FullCPU *_cpu, int _thread_num, Process *_process, int _asid)
-        : ThreadState(-1, _thread_num, NULL, _process, _asid),
+        : ThreadState(-1, _thread_num, _process->getMemory(), _process, _asid),
           cpu(_cpu), inSyscall(0), trapPending(0)
     { }
 
-- 
cgit v1.2.3


From 8a9416ef8df05c24231a063680f61d2313cf5c32 Mon Sep 17 00:00:00 2001
From: Kevin Lim <ktlim@umich.edu>
Date: Thu, 11 May 2006 15:39:02 -0400
Subject: Small fixes to O3 model.

cpu/o3/alpha_dyn_inst.hh:
    Set the instResult using a function on the base dyn inst.
cpu/o3/bpred_unit_impl.hh:
    Don't need to reset the state.
cpu/o3/commit_impl.hh:
    Mark instructions as completed.

    Wait until all stores are written back to handle a fault.
cpu/o3/cpu.cc:
    Clear instruction lists when switching out.
cpu/o3/lsq_unit.hh:
    Allow wbEvent to be set externally.
cpu/o3/lsq_unit_impl.hh:
    Mark instructions as completed properly.  Also use events for writing back stores even if there is a hit in the dcache.

--HG--
extra : convert_revision : 172ad088b75ac31e848a5040633152b5c051444c
---
 cpu/o3/alpha_dyn_inst.hh  |  8 ++++----
 cpu/o3/bpred_unit_impl.hh |  2 ++
 cpu/o3/commit_impl.hh     |  9 +++++++++
 cpu/o3/cpu.cc             |  6 ++++++
 cpu/o3/lsq_unit.hh        |  2 ++
 cpu/o3/lsq_unit_impl.hh   | 39 +++++++++++++++------------------------
 6 files changed, 38 insertions(+), 28 deletions(-)

(limited to 'cpu/o3')

diff --git a/cpu/o3/alpha_dyn_inst.hh b/cpu/o3/alpha_dyn_inst.hh
index e0b73f17e..24774bd0a 100644
--- a/cpu/o3/alpha_dyn_inst.hh
+++ b/cpu/o3/alpha_dyn_inst.hh
@@ -183,25 +183,25 @@ class AlphaDynInst : public BaseDynInst<Impl>
     void setIntReg(const StaticInst *si, int idx, uint64_t val)
     {
         this->cpu->setIntReg(_destRegIdx[idx], val);
-        this->instResult.integer = val;
+        BaseDynInst<Impl>::setIntReg(si, idx, val);
     }
 
     void setFloatRegSingle(const StaticInst *si, int idx, float val)
     {
         this->cpu->setFloatRegSingle(_destRegIdx[idx], val);
-        this->instResult.fp = val;
+        BaseDynInst<Impl>::setFloatRegSingle(si, idx, val);
     }
 
     void setFloatRegDouble(const StaticInst *si, int idx, double val)
     {
         this->cpu->setFloatRegDouble(_destRegIdx[idx], val);
-        this->instResult.dbl = val;
+        BaseDynInst<Impl>::setFloatRegDouble(si, idx, val);
     }
 
     void setFloatRegInt(const StaticInst *si, int idx, uint64_t val)
     {
         this->cpu->setFloatRegInt(_destRegIdx[idx], val);
-        this->instResult.integer = val;
+        BaseDynInst<Impl>::setFloatRegInt(si, idx, val);
     }
 
     /** Returns the physical register index of the i'th destination
diff --git a/cpu/o3/bpred_unit_impl.hh b/cpu/o3/bpred_unit_impl.hh
index 872c0c62e..d20b31e55 100644
--- a/cpu/o3/bpred_unit_impl.hh
+++ b/cpu/o3/bpred_unit_impl.hh
@@ -107,11 +107,13 @@ template <class Impl>
 void
 TwobitBPredUnit<Impl>::takeOverFrom()
 {
+/*
     for (int i = 0; i < Impl::MaxThreads; ++i)
         RAS[i].reset();
 
     BP.reset();
     BTB.reset();
+*/
 }
 
 template <class Impl>
diff --git a/cpu/o3/commit_impl.hh b/cpu/o3/commit_impl.hh
index 7834460e2..034565f90 100644
--- a/cpu/o3/commit_impl.hh
+++ b/cpu/o3/commit_impl.hh
@@ -1117,6 +1117,10 @@ head_inst->isWriteBarrier())*/
         panic("Barrier instructions are not handled yet.\n");
     }
 
+    if (!head_inst->isStore()) {
+        head_inst->setCompleted();
+    }
+
     // Check if the instruction caused a fault.  If so, trap.
     Fault inst_fault = head_inst->getFault();
 
@@ -1126,6 +1130,11 @@ head_inst->isWriteBarrier())*/
             DPRINTF(Commit, "Inst [sn:%lli] PC %#x has a fault\n",
                     head_inst->seqNum, head_inst->readPC());
 
+            if (iewStage->hasStoresToWB()) {
+                DPRINTF(Commit, "Stores outstanding, fault must wait.\n");
+                return false;
+            }
+
             assert(!thread[tid]->inSyscall);
 
             thread[tid]->inSyscall = true;
diff --git a/cpu/o3/cpu.cc b/cpu/o3/cpu.cc
index fc8372026..59308d6a9 100644
--- a/cpu/o3/cpu.cc
+++ b/cpu/o3/cpu.cc
@@ -666,6 +666,12 @@ FullO3CPU<Impl>::switchOut(Sampler *sampler)
     rename.switchOut();
     iew.switchOut();
     commit.switchOut();
+
+    instList.clear();
+    while (!removeList.empty()) {
+        removeList.pop();
+    }
+
     if (tickEvent.scheduled())
         tickEvent.squash();
     sampler->signalSwitched();
diff --git a/cpu/o3/lsq_unit.hh b/cpu/o3/lsq_unit.hh
index d17efe96a..623dbdb4b 100644
--- a/cpu/o3/lsq_unit.hh
+++ b/cpu/o3/lsq_unit.hh
@@ -82,7 +82,9 @@ class LSQUnit {
         /** The writeback event for the store.  Needed for store
          * conditionals.
          */
+      public:
         Event *wbEvent;
+      private:
         /** The pointer to the LSQ unit that issued the store. */
         LSQUnit<Impl> *lsqPtr;
     };
diff --git a/cpu/o3/lsq_unit_impl.hh b/cpu/o3/lsq_unit_impl.hh
index c5ce34c70..3bb9a81f8 100644
--- a/cpu/o3/lsq_unit_impl.hh
+++ b/cpu/o3/lsq_unit_impl.hh
@@ -672,11 +672,6 @@ LSQUnit<Impl>::writebackStores()
                 req->paddr, *(req->data),
                 storeQueue[storeWBIdx].inst->seqNum);
 
-//        if (fault != NoFault) {
-            //What should we do if there is a fault???
-            //for now panic
-//            panic("Page Table Fault!!!!!\n");
-//        }
         switch(storeQueue[storeWBIdx].size) {
           case 1:
             cpu->write(req, (uint8_t &)storeQueue[storeWBIdx].data);
@@ -693,8 +688,16 @@ LSQUnit<Impl>::writebackStores()
           default:
             panic("Unexpected store size!\n");
         }
+        if (!(req->flags & LOCKED)) {
+            storeQueue[storeWBIdx].inst->setCompleted();
+        }
 
         if (dcacheInterface) {
+            assert(!req->completionEvent);
+            StoreCompletionEvent *store_event = new
+                StoreCompletionEvent(storeWBIdx, NULL, this);
+            req->completionEvent = store_event;
+
             MemAccessResult result = dcacheInterface->access(req);
 
             if (isStalled() &&
@@ -710,16 +713,12 @@ LSQUnit<Impl>::writebackStores()
             if (result != MA_HIT && dcacheInterface->doEvents()) {
                 typename IEW::LdWritebackEvent *wb = NULL;
                 if (req->flags & LOCKED) {
-                    // Stx_C does not generate a system port transaction.
-/*
-                    if (cpu->lockFlag && cpu->lockAddr == req->paddr) {
-                        req->result=1;
-                    } else {
-                        req->result = 0;
-                    }
-*/
-                    wb = new typename IEW::LdWritebackEvent(storeQueue[storeWBIdx].inst,
-                                                            iewStage);
+                    // Stx_C should not generate a system port transaction,
+                    // but that might be hard to accomplish.
+                    wb = new typename
+                        IEW::LdWritebackEvent(storeQueue[storeWBIdx].inst,
+                                              iewStage);
+                    store_event->wbEvent = wb;
                 }
 
                 DPRINTF(LSQUnit,"D-Cache Write Miss!\n");
@@ -727,12 +726,6 @@ LSQUnit<Impl>::writebackStores()
                 DPRINTF(Activity, "Active st accessing mem miss [sn:%lli]\n",
                         storeQueue[storeWBIdx].inst->seqNum);
 
-                // Will stores need their own kind of writeback events?
-                // Do stores even need writeback events?
-                assert(!req->completionEvent);
-                req->completionEvent = new
-                    StoreCompletionEvent(storeWBIdx, wb, this);
-
                 lastDcacheStall = curTick;
 
 //                _status = DcacheMissStall;
@@ -766,10 +759,8 @@ LSQUnit<Impl>::writebackStores()
                     typename IEW::LdWritebackEvent *wb =
                         new typename IEW::LdWritebackEvent(storeQueue[storeWBIdx].inst,
                                                            iewStage);
-                    wb->schedule(curTick);
+                    store_event->wbEvent = wb;
                 }
-
-                completeStore(storeWBIdx);
             }
 
             incrStIdx(storeWBIdx);
-- 
cgit v1.2.3


From ef6e2eb3c4dbf337df7380ae93360c13140f11f6 Mon Sep 17 00:00:00 2001
From: Kevin Lim <ktlim@umich.edu>
Date: Tue, 16 May 2006 14:06:35 -0400
Subject: Updates for sampler, checker, and general correctness.

cpu/o3/alpha_cpu.hh:
    Update for sampler to work properly.  Also code cleanup.
cpu/o3/alpha_cpu_builder.cc:
cpu/o3/alpha_dyn_inst.hh:
    Updates to support the checker.
cpu/o3/alpha_cpu_impl.hh:
    Updates to support the checker.  Also general code cleanup.
cpu/o3/alpha_dyn_inst_impl.hh:
    Code cleanup.
cpu/o3/alpha_params.hh:
    Updates to support the checker.  Also supports trap latencies set through the parameters.
cpu/o3/commit.hh:
    Supports sampler, checker.  Code cleanup.
cpu/o3/commit_impl.hh:
    Updates to support the sampler and checker, as well as general code cleanup.
cpu/o3/cpu.cc:
cpu/o3/cpu.hh:
    Support sampler and checker.
cpu/o3/decode_impl.hh:
    Supports sampler.
cpu/o3/fetch.hh:
    Supports sampler.  Also update to hold the youngest valid SN fetch has seen to ensure that the entire pipeline has been drained.
cpu/o3/fetch_impl.hh:
    Sampler updates.  Also be sure not to issue fetches to uncached space (bad path).
cpu/o3/iew.hh:
cpu/o3/iew_impl.hh:
    Sampler updates.
cpu/o3/lsq_unit_impl.hh:
    Supports checker.
cpu/o3/regfile.hh:
    No need for accessing xcProxies directly.
cpu/o3/rename.hh:
cpu/o3/rename_impl.hh:
    Sampler support.

--HG--
extra : convert_revision : 03881885dd50ebbca13ef31f31492fd4ef59121c
---
 cpu/o3/alpha_cpu.hh           |  79 +++----
 cpu/o3/alpha_cpu_builder.cc   |  16 +-
 cpu/o3/alpha_cpu_impl.hh      | 187 +++++++---------
 cpu/o3/alpha_dyn_inst.hh      |  30 ++-
 cpu/o3/alpha_dyn_inst_impl.hh |   8 +-
 cpu/o3/alpha_params.hh        |   6 +-
 cpu/o3/commit.hh              | 105 ++++-----
 cpu/o3/commit_impl.hh         | 486 ++++++++++++++++++------------------------
 cpu/o3/cpu.cc                 |  86 ++++----
 cpu/o3/cpu.hh                 |  23 +-
 cpu/o3/decode_impl.hh         |   1 +
 cpu/o3/fetch.hh               |   7 +
 cpu/o3/fetch_impl.hh          |  20 +-
 cpu/o3/iew.hh                 |   2 +
 cpu/o3/iew_impl.hh            |  14 +-
 cpu/o3/lsq_unit_impl.hh       |   9 +
 cpu/o3/regfile.hh             |   4 +-
 cpu/o3/rename.hh              |   2 +
 cpu/o3/rename_impl.hh         |   7 +
 19 files changed, 522 insertions(+), 570 deletions(-)

(limited to 'cpu/o3')

diff --git a/cpu/o3/alpha_cpu.hh b/cpu/o3/alpha_cpu.hh
index dfdf092ed..f70793aaa 100644
--- a/cpu/o3/alpha_cpu.hh
+++ b/cpu/o3/alpha_cpu.hh
@@ -34,6 +34,8 @@
 #include "cpu/o3/cpu.hh"
 #include "sim/byteswap.hh"
 
+class EndQuiesceEvent;
+
 template <class Impl>
 class AlphaFullCPU : public FullO3CPU<Impl>
 {
@@ -61,7 +63,7 @@ class AlphaFullCPU : public FullO3CPU<Impl>
         Tick lastActivate;
         Tick lastSuspend;
 
-        Event *quiesceEvent;
+        EndQuiesceEvent *quiesceEvent;
 
         virtual BaseCPU *getCpuPtr() { return cpu; }
 
@@ -112,10 +114,8 @@ class AlphaFullCPU : public FullO3CPU<Impl>
         virtual void unserialize(Checkpoint *cp, const std::string &section);
 
 #if FULL_SYSTEM
-        virtual Event *getQuiesceEvent();
+        virtual EndQuiesceEvent *getQuiesceEvent();
 
-        // Not necessarily the best location for these...
-        // Having an extra function just to read these is obnoxious
         virtual Tick readLastActivate();
         virtual Tick readLastSuspend();
 
@@ -125,17 +125,12 @@ class AlphaFullCPU : public FullO3CPU<Impl>
 
         virtual int getThreadNum() { return thread->tid; }
 
-        // Also somewhat obnoxious.  Really only used for the TLB fault.
-        // However, may be quite useful in SPARC.
         virtual TheISA::MachInst getInst();
 
         virtual void copyArchRegs(ExecContext *xc);
 
         virtual void clearArchRegs();
 
-        //
-        // New accessors for new decoder.
-        //
         virtual uint64_t readIntReg(int reg_idx);
 
         virtual float readFloatRegSingle(int reg_idx);
@@ -172,9 +167,7 @@ class AlphaFullCPU : public FullO3CPU<Impl>
 
         virtual Fault setMiscRegWithEffect(int misc_reg, const MiscReg &val);
 
-        // Also not necessarily the best location for these two.
-        // Hopefully will go away once we decide upon where st cond
-        // failures goes.
+        // @todo: Figure out where these store cond failures should go.
         virtual unsigned readStCondFailures() { return thread->storeCondFailures; }
 
         virtual void setStCondFailures(unsigned sc_failures) { thread->storeCondFailures = sc_failures; }
@@ -183,27 +176,27 @@ class AlphaFullCPU : public FullO3CPU<Impl>
         virtual bool inPalMode() { return TheISA::PcPAL(cpu->readPC(thread->tid)); }
 #endif
 
-        // Only really makes sense for old CPU model.  Still could be useful though.
+        // Only really makes sense for old CPU model.  Lots of code
+        // outside the CPU still checks this function, so it will
+        // always return false to keep everything working.
         virtual bool misspeculating() { return false; }
 
 #if !FULL_SYSTEM
         virtual IntReg getSyscallArg(int i);
 
-        // used to shift args for indirect syscall
         virtual void setSyscallArg(int i, IntReg val);
 
         virtual void setSyscallReturn(SyscallReturn return_value);
 
         virtual void syscall() { return cpu->syscall(thread->tid); }
 
-        // Same with st cond failures.
         virtual Counter readFuncExeInst() { return thread->funcExeInst; }
 #endif
     };
 
-    friend class AlphaXC;
+//    friend class AlphaXC;
 
-    std::vector<AlphaXC *> xcProxies;
+//    std::vector<ExecContext *> xcProxies;
 
 #if FULL_SYSTEM
     /** ITB pointer. */
@@ -216,13 +209,6 @@ class AlphaFullCPU : public FullO3CPU<Impl>
     void regStats();
 
 #if FULL_SYSTEM
-    //Note that the interrupt stuff from the base CPU might be somewhat
-    //ISA specific (ie NumInterruptLevels).  These functions might not
-    //be needed in FullCPU though.
-//    void post_interrupt(int int_num, int index);
-//    void clear_interrupt(int int_num, int index);
-//    void clear_interrupts();
-
     /** Translates instruction requestion. */
     Fault translateInstReq(MemReqPtr &req)
     {
@@ -273,11 +259,6 @@ class AlphaFullCPU : public FullO3CPU<Impl>
     }
 
 #endif
-
-    // Later on may want to remove this misc stuff from the regfile and
-    // have it handled at this level.  This would be similar to moving certain
-    // IPRs into the devices themselves.  Might prove to be an issue when
-    // trying to rename source/destination registers...
     MiscReg readMiscReg(int misc_reg, unsigned tid);
 
     MiscReg readMiscRegWithEffect(int misc_reg, Fault &fault, unsigned tid);
@@ -302,18 +283,21 @@ class AlphaFullCPU : public FullO3CPU<Impl>
 
     /** Traps to handle given fault. */
     void trap(Fault fault, unsigned tid);
-    bool simPalCheck(int palFunc);
+    bool simPalCheck(int palFunc, unsigned tid);
 
     /** Processes any interrupts. */
     void processInterrupts();
+
+    /** Halts the CPU. */
+    void halt() { panic("Halt not implemented!\n"); }
 #endif
 
 
 #if !FULL_SYSTEM
-    // Need to change these into regfile calls that directly set a certain
-    // register.  Actually, these functions should handle most of this
-    // functionality by themselves; should look up the rename and then
-    // set the register.
+    /** Executes a syscall.
+     * @todo: Determine if this needs to be virtual.
+     */
+    void syscall(int thread_num);
     /** Gets a syscall argument. */
     IntReg getSyscallArg(int i, int tid);
 
@@ -322,25 +306,12 @@ class AlphaFullCPU : public FullO3CPU<Impl>
 
     /** Sets the return value of a syscall. */
     void setSyscallReturn(SyscallReturn return_value, int tid);
-
-    /** Executes a syscall.
-     * @todo: Determine if this needs to be virtual.
-     */
-    virtual void syscall(int thread_num);
-
-#endif
-
-  public:
-#if FULL_SYSTEM
-    /** Halts the CPU. */
-    void halt() { panic("Halt not implemented!\n"); }
 #endif
 
-    /** Old CPU read from memory function. No longer used. */
+    /** Read from memory function. */
     template <class T>
     Fault read(MemReqPtr &req, T &data)
     {
-//	panic("CPU READ NOT IMPLEMENTED W/NEW MEMORY\n");
 #if 0
 #if FULL_SYSTEM && defined(TARGET_ALPHA)
         if (req->flags & LOCKED) {
@@ -350,10 +321,14 @@ class AlphaFullCPU : public FullO3CPU<Impl>
 #endif
 #endif
         Fault error;
+
+#if FULL_SYSTEM
+        // @todo: Fix this LL/SC hack.
         if (req->flags & LOCKED) {
             lockAddr = req->paddr;
             lockFlag = true;
         }
+#endif
 
         error = this->mem->read(req, data);
         data = gtoh(data);
@@ -367,7 +342,7 @@ class AlphaFullCPU : public FullO3CPU<Impl>
         return this->iew.ldstQueue.read(req, data, load_idx);
     }
 
-    /** Old CPU write to memory function. No longer used. */
+    /** Write to memory function. */
     template <class T>
     Fault write(MemReqPtr &req, T &data)
     {
@@ -420,11 +395,13 @@ class AlphaFullCPU : public FullO3CPU<Impl>
 #endif
 #endif
 
+#if FULL_SYSTEM
+        // @todo: Fix this LL/SC hack.
         if (req->flags & LOCKED) {
             if (req->flags & UNCACHEABLE) {
                 req->result = 2;
             } else {
-                if (this->lockFlag/* && this->lockAddr == req->paddr*/) {
+                if (this->lockFlag) {
                     req->result = 1;
                 } else {
                     req->result = 0;
@@ -432,6 +409,7 @@ class AlphaFullCPU : public FullO3CPU<Impl>
                 }
             }
         }
+#endif
 
         return this->mem->write(req, (T)htog(data));
     }
@@ -444,6 +422,7 @@ class AlphaFullCPU : public FullO3CPU<Impl>
     }
 
     Addr lockAddr;
+
     bool lockFlag;
 };
 
diff --git a/cpu/o3/alpha_cpu_builder.cc b/cpu/o3/alpha_cpu_builder.cc
index d676a69c1..0f9116d71 100644
--- a/cpu/o3/alpha_cpu_builder.cc
+++ b/cpu/o3/alpha_cpu_builder.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -61,6 +61,8 @@ SimObjectVectorParam<Process *> workload;
 
 SimObjectParam<FunctionalMemory *> mem;
 
+SimObjectParam<BaseCPU *> checker;
+
 Param<Counter> max_insts_any_thread;
 Param<Counter> max_insts_all_threads;
 Param<Counter> max_loads_any_thread;
@@ -103,6 +105,8 @@ Param<unsigned> iewToCommitDelay;
 Param<unsigned> renameToROBDelay;
 Param<unsigned> commitWidth;
 Param<unsigned> squashWidth;
+Param<Tick> trapLatency;
+Param<Tick> fetchTrapLatency;
 
 Param<unsigned> localPredictorSize;
 Param<unsigned> localCtrBits;
@@ -165,6 +169,8 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivAlphaFullCPU)
 
     INIT_PARAM_DFLT(mem, "Memory", NULL),
 
+    INIT_PARAM_DFLT(checker, "Checker CPU", NULL),
+
     INIT_PARAM_DFLT(max_insts_any_thread,
                     "Terminate when any thread reaches this inst count",
                     0),
@@ -223,6 +229,8 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivAlphaFullCPU)
     INIT_PARAM(renameToROBDelay, "Rename to reorder buffer delay"),
     INIT_PARAM(commitWidth, "Commit width"),
     INIT_PARAM(squashWidth, "Squash width"),
+    INIT_PARAM_DFLT(trapLatency, "Number of cycles before the trap is handled", 6),
+    INIT_PARAM_DFLT(fetchTrapLatency, "Number of cycles before the fetch trap is handled", 12),
 
     INIT_PARAM(localPredictorSize, "Size of local predictor"),
     INIT_PARAM(localCtrBits, "Bits per counter"),
@@ -301,12 +309,13 @@ CREATE_SIM_OBJECT(DerivAlphaFullCPU)
     params->dtb = dtb;
 #else
     params->workload = workload;
-    //@todo: change to pageTable
 //    params->pTable = page_table;
 #endif // FULL_SYSTEM
 
     params->mem = mem;
 
+    params->checker = checker;
+
     params->max_insts_any_thread = max_insts_any_thread;
     params->max_insts_all_threads = max_insts_all_threads;
     params->max_loads_any_thread = max_loads_any_thread;
@@ -351,7 +360,8 @@ CREATE_SIM_OBJECT(DerivAlphaFullCPU)
     params->renameToROBDelay = renameToROBDelay;
     params->commitWidth = commitWidth;
     params->squashWidth = squashWidth;
-
+    params->trapLatency = trapLatency;
+    params->fetchTrapLatency = fetchTrapLatency;
 
     params->localPredictorSize = localPredictorSize;
     params->localCtrBits = localCtrBits;
diff --git a/cpu/o3/alpha_cpu_impl.hh b/cpu/o3/alpha_cpu_impl.hh
index 7a2d5d2b9..856fcb1c8 100644
--- a/cpu/o3/alpha_cpu_impl.hh
+++ b/cpu/o3/alpha_cpu_impl.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -30,10 +30,9 @@
 #include "base/cprintf.hh"
 #include "base/statistics.hh"
 #include "base/timebuf.hh"
+#include "cpu/checker/exec_context.hh"
 #include "cpu/quiesce_event.hh"
-#include "mem/cache/cache.hh" // for dynamic cast
 #include "mem/mem_interface.hh"
-#include "sim/builder.hh"
 #include "sim/sim_events.hh"
 #include "sim/stats.hh"
 
@@ -63,11 +62,9 @@ AlphaFullCPU<Impl>::AlphaFullCPU(Params *params)
 
     for (int i = 0; i < this->numThreads; ++i) {
 #if FULL_SYSTEM
-        assert(i == 0);
+        assert(this->numThreads == 1);
         this->thread[i] = new Thread(this, 0, params->mem);
-//        this->system->execContexts[i] = this->thread[i]->getXCProxy();
         this->thread[i]->setStatus(ExecContext::Suspended);
-
 #else
         if (i < params->workload.size()) {
             DPRINTF(FullCPU, "FullCPU: Workload[%i]'s starting PC is %#x, "
@@ -91,19 +88,27 @@ AlphaFullCPU<Impl>::AlphaFullCPU(Params *params)
 
         this->thread[i]->numInst = 0;
 
-        xcProxies.push_back(new AlphaXC);
+        ExecContext *xc_proxy;
 
-        xcProxies[i]->cpu = this;
-        xcProxies[i]->thread = this->thread[i];
+        AlphaXC *alpha_xc_proxy = new AlphaXC;
 
-        xcProxies[i]->quiesceEvent = new EndQuiesceEvent(xcProxies[i]);
-        xcProxies[i]->lastActivate = 0;
-        xcProxies[i]->lastSuspend = 0;
+        if (params->checker) {
+            xc_proxy = new CheckerExecContext<AlphaXC>(alpha_xc_proxy, this->checker);
+        } else {
+            xc_proxy = alpha_xc_proxy;
+        }
 
+        alpha_xc_proxy->cpu = this;
+        alpha_xc_proxy->thread = this->thread[i];
 
-        this->thread[i]->xcProxy = xcProxies[i];
+        alpha_xc_proxy->quiesceEvent =
+            new EndQuiesceEvent(xc_proxy);
+        alpha_xc_proxy->lastActivate = 0;
+        alpha_xc_proxy->lastSuspend = 0;
 
-        this->execContexts.push_back(this->thread[i]->getXCProxy());
+        this->thread[i]->xcProxy = xc_proxy;
+
+        this->execContexts.push_back(xc_proxy);
     }
 
 
@@ -144,6 +149,7 @@ template <class Impl>
 void
 AlphaFullCPU<Impl>::AlphaXC::dumpFuncProfile()
 {
+    // Currently not supported
 }
 #endif
 
@@ -167,6 +173,18 @@ AlphaFullCPU<Impl>::AlphaXC::takeOverFrom(ExecContext *old_context)
     thread->funcExeInst = old_context->readFuncExeInst();
 #endif
 
+    EndQuiesceEvent *other_quiesce = old_context->getQuiesceEvent();
+    if (other_quiesce) {
+        // Point the quiesce event's XC at this XC so that it wakes up
+        // the proper CPU.
+        other_quiesce->xc = this;
+    }
+    if (thread->quiesceEvent) {
+        thread->quiesceEvent->xc = this;
+    }
+//    storeCondFailures = 0;
+    cpu->lockFlag = false;
+
     old_context->setStatus(ExecContext::Unallocated);
 
     thread->inSyscall = false;
@@ -178,7 +196,7 @@ void
 AlphaFullCPU<Impl>::AlphaXC::activate(int delay)
 {
     DPRINTF(FullCPU, "Calling activate on AlphaXC\n");
-//    warn("Calling activate on AlphaXC");
+
     if (thread->status() == ExecContext::Active)
         return;
 
@@ -200,7 +218,7 @@ void
 AlphaFullCPU<Impl>::AlphaXC::suspend()
 {
     DPRINTF(FullCPU, "Calling suspend on AlphaXC\n");
-//    warn("Calling suspend on AlphaXC");
+
     if (thread->status() == ExecContext::Suspended)
         return;
 
@@ -224,7 +242,7 @@ void
 AlphaFullCPU<Impl>::AlphaXC::deallocate()
 {
     DPRINTF(FullCPU, "Calling deallocate on AlphaXC\n");
-//    warn("Calling deallocate on AlphaXC");
+
     if (thread->status() == ExecContext::Unallocated)
         return;
 
@@ -237,7 +255,7 @@ void
 AlphaFullCPU<Impl>::AlphaXC::halt()
 {
     DPRINTF(FullCPU, "Calling halt on AlphaXC\n");
-//    warn("Calling halt on AlphaXC");
+
     if (thread->status() == ExecContext::Halted)
         return;
 
@@ -254,6 +272,7 @@ template <class Impl>
 void
 AlphaFullCPU<Impl>::AlphaXC::serialize(std::ostream &os)
 {}
+
 template <class Impl>
 void
 AlphaFullCPU<Impl>::AlphaXC::unserialize(Checkpoint *cp, const std::string &section)
@@ -261,7 +280,7 @@ AlphaFullCPU<Impl>::AlphaXC::unserialize(Checkpoint *cp, const std::string &sect
 
 #if FULL_SYSTEM
 template <class Impl>
-Event *
+EndQuiesceEvent *
 AlphaFullCPU<Impl>::AlphaXC::getQuiesceEvent()
 {
     return quiesceEvent;
@@ -345,9 +364,6 @@ void
 AlphaFullCPU<Impl>::AlphaXC::clearArchRegs()
 {}
 
-//
-// New accessors for new decoder.
-//
 template <class Impl>
 uint64_t
 AlphaFullCPU<Impl>::AlphaXC::readIntReg(int reg_idx)
@@ -503,26 +519,6 @@ AlphaFullCPU<Impl>::AlphaXC::setSyscallReturn(SyscallReturn return_value)
     cpu->setSyscallReturn(return_value, thread->tid);
 }
 
-template <class Impl>
-void
-AlphaFullCPU<Impl>::syscall(int tid)
-{
-    DPRINTF(FullCPU, "AlphaFullCPU: [tid:%i] Executing syscall().\n\n", tid);
-
-    DPRINTF(Activity,"Activity: syscall() called.\n");
-
-    // Temporarily increase this by one to account for the syscall
-    // instruction.
-    ++(this->thread[tid]->funcExeInst);
-
-    // Execute the actual syscall.
-    this->thread[tid]->syscall();
-
-    // Decrease funcExeInst by one as the normal commit will handle
-    // incrementing it.
-    --(this->thread[tid]->funcExeInst);
-}
-
 #endif // FULL_SYSTEM
 
 template <class Impl>
@@ -544,14 +540,7 @@ template <class Impl>
 Fault
 AlphaFullCPU<Impl>::setMiscReg(int misc_reg, const MiscReg &val, unsigned tid)
 {
-    // I think that these registers should always be set, regardless of what
-    // mode the thread is in.  The main difference is if the thread needs to
-    // squash as a result of the write, which is controlled by the AlphaXC.
-//    if (!this->thread[tid]->trapPending) {
-        return this->regFile.setMiscReg(misc_reg, val, tid);
-//    } else {
-//        return NoFault;
-//    }
+    return this->regFile.setMiscReg(misc_reg, val, tid);
 }
 
 template <class Impl>
@@ -559,18 +548,13 @@ Fault
 AlphaFullCPU<Impl>::setMiscRegWithEffect(int misc_reg, const MiscReg &val,
                                          unsigned tid)
 {
-//    if (!this->thread[tid]->trapPending) {
-        return this->regFile.setMiscRegWithEffect(misc_reg, val, tid);
-//    } else {
-//        return NoFault;
-//    }
+    return this->regFile.setMiscRegWithEffect(misc_reg, val, tid);
 }
 
 template <class Impl>
 void
 AlphaFullCPU<Impl>::squashFromXC(unsigned tid)
 {
-//    this->thread[tid]->trapPending = true;
     this->thread[tid]->inSyscall = true;
     this->commit.generateXCEvent(tid);
 }
@@ -585,7 +569,8 @@ AlphaFullCPU<Impl>::post_interrupt(int int_num, int index)
 
     if (this->thread[0]->status() == ExecContext::Suspended) {
         DPRINTF(IPI,"Suspended Processor awoke\n");
-        xcProxies[0]->activate();
+//	xcProxies[0]->activate();
+        this->execContexts[0]->activate();
     }
 }
 
@@ -607,31 +592,24 @@ template <class Impl>
 Fault
 AlphaFullCPU<Impl>::hwrei(unsigned tid)
 {
-#if 0
-    if (!inPalMode(this->readPC(tid)))
-        return new AlphaISA::UnimplementedOpcodeFault;
-
-    setNextPC(cpu->readMiscReg(AlphaISA::IPR_EXC_ADDR, tid), tid);
+    // Need to clear the lock flag upon returning from an interrupt.
+    this->lockFlag = false;
 
-    cpu->kernelStats->hwrei();
+    this->kernelStats->hwrei();
 
-//    if ((this->regFile.miscRegs[tid].readReg(AlphaISA::IPR_EXC_ADDR) & 1) == 0)
-//        AlphaISA::swap_palshadow(&regs, false);
+    this->checkInterrupts = true;
 
-    cpu->checkInterrupts = true;
-#endif
-//    panic("Do not call this function!");
-    // Need to clear the lock flag upon returning from an interrupt.
-    this->lockFlag = false;
     // FIXME: XXX check for interrupts? XXX
     return NoFault;
 }
 
 template <class Impl>
 bool
-AlphaFullCPU<Impl>::simPalCheck(int palFunc)
+AlphaFullCPU<Impl>::simPalCheck(int palFunc, unsigned tid)
 {
-//    kernelStats.callpal(palFunc);
+    if (this->kernelStats)
+        this->kernelStats->callpal(palFunc,
+                                   this->execContexts[tid]);
 
     switch (palFunc) {
       case PAL::halt:
@@ -650,47 +628,11 @@ AlphaFullCPU<Impl>::simPalCheck(int palFunc)
     return true;
 }
 
-// Probably shouldn't be able to switch to the trap handler as quickly as
-// this.  Also needs to get the exception restart address from the commit
-// stage.
 template <class Impl>
 void
 AlphaFullCPU<Impl>::trap(Fault fault, unsigned tid)
 {
-
-    fault->invoke(this->xcProxies[tid]);
-/*    // Keep in mind that a trap may be initiated by fetch if there's a TLB
-    // miss
-    uint64_t PC = this->commit.readCommitPC();
-
-    DPRINTF(Fault, "Fault %s\n", fault->name());
-    this->recordEvent(csprintf("Fault %s", fault->name()));
-
-    //kernelStats.fault(fault);
-
-    if (fault->isA<ArithmeticFault>())
-        panic("Arithmetic traps are unimplemented!");
-
-    // exception restart address - Get the commit PC
-    if (!fault->isA<InterruptFault>() || !inPalMode(PC))
-        this->regFile.miscRegs.setReg(AlphaISA::IPR_EXC_ADDR, PC);
-
-    if (fault->isA<PalFault>() || fault->isA<ArithmeticFault>())
-    //    || fault == InterruptFault && !PC_PAL(regs.pc)
-        {
-        // traps...  skip faulting instruction
-        AlphaISA::MiscReg ipr_exc_addr =
-            this->regFile.miscRegs.readReg(AlphaISA::IPR_EXC_ADDR);
-        this->regFile.miscRegs.setReg(AlphaISA::IPR_EXC_ADDR,
-                                      ipr_exc_addr + 4);
-    }
-
-    if (!inPalMode(PC))
-        swapPALShadow(true);
-
-    this->regFile.setPC(this->regFile.miscRegs.readReg(AlphaISA::IPR_PAL_BASE) +
-                         (dynamic_cast<AlphaFault *>(fault.get()))->vect(), 0);
-    this->regFile.setNextPC(PC + sizeof(MachInst), 0);*/
+    fault->invoke(this->execContexts[tid]);
 }
 
 template <class Impl>
@@ -700,6 +642,8 @@ AlphaFullCPU<Impl>::processInterrupts()
     // Check for interrupts here.  For now can copy the code that
     // exists within isa_fullsys_traits.hh.  Also assume that thread 0
     // is the one that handles the interrupts.
+    // @todo: Possibly consolidate the interrupt checking code.
+    // @todo: Allow other threads to handle interrupts.
 
     // Check if there are any outstanding interrupts
     //Handle the interrupts
@@ -738,6 +682,10 @@ AlphaFullCPU<Impl>::processInterrupts()
     if (ipl && ipl > this->readMiscReg(IPR_IPLR, 0)) {
         this->setMiscReg(IPR_ISR, summary, 0);
         this->setMiscReg(IPR_INTID, ipl, 0);
+        if (this->checker) {
+            this->checker->cpuXCBase()->setMiscReg(IPR_ISR, summary);
+            this->checker->cpuXCBase()->setMiscReg(IPR_INTID, ipl);
+        }
         this->trap(Fault(new InterruptFault), 0);
         DPRINTF(Flow, "Interrupt! IPLR=%d ipl=%d summary=%x\n",
                 this->readMiscReg(IPR_IPLR, 0), ipl, summary);
@@ -747,6 +695,27 @@ AlphaFullCPU<Impl>::processInterrupts()
 #endif // FULL_SYSTEM
 
 #if !FULL_SYSTEM
+
+template <class Impl>
+void
+AlphaFullCPU<Impl>::syscall(int tid)
+{
+    DPRINTF(FullCPU, "AlphaFullCPU: [tid:%i] Executing syscall().\n\n", tid);
+
+    DPRINTF(Activity,"Activity: syscall() called.\n");
+
+    // Temporarily increase this by one to account for the syscall
+    // instruction.
+    ++(this->thread[tid]->funcExeInst);
+
+    // Execute the actual syscall.
+    this->thread[tid]->syscall();
+
+    // Decrease funcExeInst by one as the normal commit will handle
+    // incrementing it.
+    --(this->thread[tid]->funcExeInst);
+}
+
 template <class Impl>
 TheISA::IntReg
 AlphaFullCPU<Impl>::getSyscallArg(int i, int tid)
diff --git a/cpu/o3/alpha_dyn_inst.hh b/cpu/o3/alpha_dyn_inst.hh
index 24774bd0a..1c5b738aa 100644
--- a/cpu/o3/alpha_dyn_inst.hh
+++ b/cpu/o3/alpha_dyn_inst.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -35,14 +35,11 @@
 #include "cpu/o3/alpha_impl.hh"
 
 /**
- * Mostly implementation & ISA specific AlphaDynInst. As with most other classes
- * in the new CPU model, it is templated on the Impl to allow for passing in of
- * all types, such as the CPU type and the ISA type. The AlphaDynInst serves
- * as the primary interface to the CPU; it plays the role that the ExecContext
- * does for the old CPU and the SimpleCPU. The goal is to abstract ExecContext
- * purely into an interface, and have it forward calls to the appropriate
- * CPU interface, which in the new CPU model's case would be this AlphaDynInst,
- * or any other high level implementation specific DynInst.
+ * Mostly implementation & ISA specific AlphaDynInst. As with most
+ * other classes in the new CPU model, it is templated on the Impl to
+ * allow for passing in of all types, such as the CPU type and the ISA
+ * type. The AlphaDynInst serves as the primary interface to the CPU
+ * for instructions that are executing.
  */
 template <class Impl>
 class AlphaDynInst : public BaseDynInst<Impl>
@@ -78,8 +75,10 @@ class AlphaDynInst : public BaseDynInst<Impl>
     /** Executes the instruction.*/
     Fault execute();
 
+    /** Initiates the access.  Only valid for memory operations. */
     Fault initiateAcc();
 
+    /** Completes the access.  Only valid for memory operations. */
     Fault completeAcc();
 
   private:
@@ -100,6 +99,7 @@ class AlphaDynInst : public BaseDynInst<Impl>
 
     Fault setMiscReg(int misc_reg, const MiscReg &val)
     {
+        this->instResult.integer = val;
         return this->cpu->setMiscReg(misc_reg, val, this->threadNumber);
     }
 
@@ -126,8 +126,6 @@ class AlphaDynInst : public BaseDynInst<Impl>
     void syscall();
 #endif
 
-
-
   private:
     /** Physical register index of the destination registers of this
      *  instruction.
@@ -247,9 +245,9 @@ class AlphaDynInst : public BaseDynInst<Impl>
     }
 
   public:
-    /** Calculates EA part of a memory instruction. Currently unused, though
-     * it may be useful in the future when memory instructions aren't
-     * executed with the EA calculation and the memory access being atomic.
+    /** Calculates EA part of a memory instruction. Currently unused,
+     * though it may be useful in the future if we want to split
+     * memory operations into EA calculation and memory access parts.
      */
     Fault calcEA()
     {
@@ -257,8 +255,8 @@ class AlphaDynInst : public BaseDynInst<Impl>
     }
 
     /** Does the memory access part of a memory instruction. Currently unused,
-     * though it may be useful in the future when memory instructions aren't
-     * executed with the EA calculation and the memory access being atomic.
+     * though it may be useful in the future if we want to split
+     * memory operations into EA calculation and memory access parts.
      */
     Fault memAccess()
     {
diff --git a/cpu/o3/alpha_dyn_inst_impl.hh b/cpu/o3/alpha_dyn_inst_impl.hh
index b5999f8d1..541d5ab82 100644
--- a/cpu/o3/alpha_dyn_inst_impl.hh
+++ b/cpu/o3/alpha_dyn_inst_impl.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -124,13 +124,9 @@ AlphaDynInst<Impl>::hwrei()
     this->setNextPC(this->cpu->readMiscReg(AlphaISA::IPR_EXC_ADDR,
                                            this->threadNumber));
 
-    this->cpu->kernelStats->hwrei();
-
     // Tell CPU to clear any state it needs to if a hwrei is taken.
     this->cpu->hwrei(this->threadNumber);
 
-    this->cpu->checkInterrupts = true;
-
     // FIXME: XXX check for interrupts? XXX
     return NoFault;
 }
@@ -167,7 +163,7 @@ template <class Impl>
 bool
 AlphaDynInst<Impl>::simPalCheck(int palFunc)
 {
-    return this->cpu->simPalCheck(palFunc);
+    return this->cpu->simPalCheck(palFunc, this->threadNumber);
 }
 #else
 template <class Impl>
diff --git a/cpu/o3/alpha_params.hh b/cpu/o3/alpha_params.hh
index 04b790815..b8ebae21e 100644
--- a/cpu/o3/alpha_params.hh
+++ b/cpu/o3/alpha_params.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -62,6 +62,8 @@ class AlphaSimpleParams : public BaseFullCPU::Params
 
     FunctionalMemory *mem;
 
+    BaseCPU *checker;
+
     //
     // Caches
     //
@@ -117,6 +119,8 @@ class AlphaSimpleParams : public BaseFullCPU::Params
     unsigned renameToROBDelay;
     unsigned commitWidth;
     unsigned squashWidth;
+    Tick trapLatency;
+    Tick fetchTrapLatency;
 
     //
     // Branch predictor (BP & BTB)
diff --git a/cpu/o3/commit.hh b/cpu/o3/commit.hh
index 028bd5295..73eccd2b0 100644
--- a/cpu/o3/commit.hh
+++ b/cpu/o3/commit.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -40,25 +40,27 @@ template <class>
 class O3ThreadState;
 
 /**
- * DefaultCommit handles single threaded and SMT commit. Its width is specified
- * by the parameters; each cycle it tries to commit that many instructions. The
- * SMT policy decides which thread it tries to commit instructions from. Non-
- * speculative instructions must reach the head of the ROB before they are
- * ready to execute; once they reach the head, commit will broadcast the
- * instruction's sequence number to the previous stages so that they can issue/
- * execute the instruction. Only one non-speculative instruction is handled per
- * cycle. Commit is responsible for handling all back-end initiated redirects.
- * It receives the redirect, and then broadcasts it to all stages, indicating
- * the sequence number they should squash until, and any necessary branch mis-
- * prediction information as well. It priortizes redirects by instruction's age,
- * only broadcasting a redirect if it corresponds to an instruction that should
- * currently be in the ROB. This is done by tracking the sequence number of the
- * youngest instruction in the ROB, which gets updated to any squashing
- * instruction's sequence number, and only broadcasting a redirect if it
- * corresponds to an older instruction. Commit also supports multiple cycle
- * squashing, to model a ROB that can only remove a certain number of
- * instructions per cycle. Eventually traps and interrupts will most likely
- * be handled here as well.
+ * DefaultCommit handles single threaded and SMT commit. Its width is
+ * specified by the parameters; each cycle it tries to commit that
+ * many instructions. The SMT policy decides which thread it tries to
+ * commit instructions from. Non-speculative instructions must reach
+ * the head of the ROB before they are ready to execute; once they
+ * reach the head, commit will broadcast the instruction's sequence
+ * number to the previous stages so that they can issue/execute the
+ * instruction. Only one non-speculative instruction is handled per
+ * cycle. Commit is responsible for handling all back-end initiated
+ * redirects.  It receives the redirect, and then broadcasts it to all
+ * stages, indicating the sequence number they should squash until,
+ * and any necessary branch misprediction information as well. It
+ * prioritizes redirects by instruction's age, only broadcasting a
+ * redirect if it corresponds to an instruction that should currently
+ * be in the ROB. This is done by tracking the sequence number of the
+ * youngest instruction in the ROB, which gets updated to any
+ * squashing instruction's sequence number, and only broadcasting a
+ * redirect if it corresponds to an older instruction. Commit also
+ * supports multiple cycle squashing, to model a ROB that can only
+ * remove a certain number of instructions per cycle. Eventually traps
+ * and interrupts will most likely be handled here as well.
  */
 template<class Impl>
 class DefaultCommit
@@ -78,6 +80,7 @@ class DefaultCommit
     typedef typename CPUPol::IEWStruct IEWStruct;
     typedef typename CPUPol::RenameStruct RenameStruct;
 
+    typedef typename CPUPol::Fetch Fetch;
     typedef typename CPUPol::IEW IEW;
 
     typedef O3ThreadState<Impl> Thread;
@@ -155,11 +158,16 @@ class DefaultCommit
     /** Sets the pointer to the queue coming from IEW. */
     void setIEWQueue(TimeBuffer<IEWStruct> *iq_ptr);
 
+    void setFetchStage(Fetch *fetch_stage);
+
+    Fetch *fetchStage;
+
     /** Sets the poitner to the IEW stage. */
     void setIEWStage(IEW *iew_stage);
 
-    /** The pointer to the IEW stage. Used solely to ensure that syscalls do
-     * not execute until all stores have written back.
+    /** The pointer to the IEW stage. Used solely to ensure that
+     * various events (traps, interrupts, syscalls) do not occur until
+     * all stores have written back.
      */
     IEW *iewStage;
 
@@ -177,6 +185,8 @@ class DefaultCommit
 
     void switchOut();
 
+    void doSwitchOut();
+
     void takeOverFrom();
 
     /** Ticks the commit stage, which tries to commit instructions. */
@@ -213,13 +223,12 @@ class DefaultCommit
      */
     bool changedROBEntries();
 
+    void squashAll(unsigned tid);
+
     void squashFromTrap(unsigned tid);
 
     void squashFromXC(unsigned tid);
 
-    void squashInFlightInsts(unsigned tid);
-
-  private:
     /** Commits as many instructions as possible. */
     void commitInsts();
 
@@ -246,8 +255,10 @@ class DefaultCommit
     int oldestReady();
 
   public:
-    /** Returns the PC of the head instruction of the ROB. */
-    uint64_t readPC();
+    /** Returns the PC of the head instruction of the ROB.
+     * @todo: Probably remove this function as it returns only thread 0.
+     */
+    uint64_t readPC() { return PC[0]; }
 
     uint64_t readPC(unsigned tid) { return PC[tid]; }
 
@@ -257,9 +268,6 @@ class DefaultCommit
 
     void setNextPC(uint64_t val, unsigned tid) { nextPC[tid] = val; }
 
-    /** Sets that the ROB is currently squashing. */
-    void setSquashing(unsigned tid);
-
   private:
     /** Time buffer interface. */
     TimeBuffer<TimeStruct> *timeBuffer;
@@ -299,10 +307,10 @@ class DefaultCommit
 
     std::vector<Thread *> thread;
 
-  private:
     Fault fetchFault;
-    InstSeqNum fetchFaultSN;
+
     int fetchTrapWait;
+
     /** Records that commit has written to the time buffer this cycle. Used for
      * the CPU to determine if it can deschedule itself if there is no activity.
      */
@@ -355,11 +363,13 @@ class DefaultCommit
     /** Number of Active Threads */
     unsigned numThreads;
 
+    bool switchPending;
     bool switchedOut;
 
     Tick trapLatency;
 
     Tick fetchTrapLatency;
+
     Tick fetchFaultTick;
 
     Addr PC[Impl::MaxThreads];
@@ -390,27 +400,26 @@ class DefaultCommit
      * speculative instruction reaching the head of the ROB.
      */
     Stats::Scalar<> commitNonSpecStalls;
-    /** Stat for the total number of committed branches. */
-//    Stats::Scalar<> commitCommittedBranches;
-    /** Stat for the total number of committed loads. */
-//    Stats::Scalar<> commitCommittedLoads;
-    /** Stat for the total number of committed memory references. */
-//    Stats::Scalar<> commitCommittedMemRefs;
     /** Stat for the total number of branch mispredicts that caused a squash. */
     Stats::Scalar<> branchMispredicts;
     /** Distribution of the number of committed instructions each cycle. */
     Stats::Distribution<> numCommittedDist;
 
-    // total number of instructions committed
-    Stats::Vector<> stat_com_inst;
-    Stats::Vector<> stat_com_swp;
-    Stats::Vector<> stat_com_refs;
-    Stats::Vector<> stat_com_loads;
-    Stats::Vector<> stat_com_membars;
-    Stats::Vector<> stat_com_branches;
-
-    Stats::Scalar<> commit_eligible_samples;
-    Stats::Vector<> commit_eligible;
+    /** Total number of instructions committed. */
+    Stats::Vector<> statComInst;
+    /** Total number of software prefetches committed. */
+    Stats::Vector<> statComSwp;
+    /** Stat for the total number of committed memory references. */
+    Stats::Vector<> statComRefs;
+    /** Stat for the total number of committed loads. */
+    Stats::Vector<> statComLoads;
+    /** Total number of committed memory barriers. */
+    Stats::Vector<> statComMembars;
+    /** Total number of committed branches. */
+    Stats::Vector<> statComBranches;
+
+    Stats::Scalar<> commitEligibleSamples;
+    Stats::Vector<> commitEligible;
 };
 
 #endif // __CPU_O3_COMMIT_HH__
diff --git a/cpu/o3/commit_impl.hh b/cpu/o3/commit_impl.hh
index 034565f90..170f5b01f 100644
--- a/cpu/o3/commit_impl.hh
+++ b/cpu/o3/commit_impl.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -36,6 +36,7 @@
 
 #include "base/loader/symtab.hh"
 #include "base/timebuf.hh"
+#include "cpu/checker/cpu.hh"
 #include "cpu/exetrace.hh"
 #include "cpu/o3/commit.hh"
 #include "cpu/o3/thread_state.hh"
@@ -54,7 +55,8 @@ template <class Impl>
 void
 DefaultCommit<Impl>::TrapEvent::process()
 {
-    // This will get reset if it was switched out.
+    // This will get reset by commit if it was switched out at the
+    // time of this event processing.
     commit->trapSquash[tid] = true;
 }
 
@@ -77,7 +79,9 @@ DefaultCommit<Impl>::DefaultCommit(Params *params)
       iewWidth(params->executeWidth),
       commitWidth(params->commitWidth),
       numThreads(params->numberOfThreads),
-      switchedOut(false)
+      switchedOut(false),
+      trapLatency(params->trapLatency),
+      fetchTrapLatency(params->fetchTrapLatency)
 {
     _status = Active;
     _nextStatus = Inactive;
@@ -117,9 +121,6 @@ DefaultCommit<Impl>::DefaultCommit(Params *params)
         xcSquash[i] = false;
     }
 
-    // Hardcoded trap latency.
-    trapLatency = 6;
-    fetchTrapLatency = 12;
     fetchFaultTick = 0;
     fetchTrapWait = 0;
 }
@@ -153,20 +154,6 @@ DefaultCommit<Impl>::regStats()
         .desc("The number of times commit has been forced to stall to "
               "communicate backwards")
         .prereq(commitNonSpecStalls);
-/*
-    commitCommittedBranches
-        .name(name() + ".commitCommittedBranches")
-        .desc("The number of committed branches")
-        .prereq(commitCommittedBranches);
-    commitCommittedLoads
-        .name(name() + ".commitCommittedLoads")
-        .desc("The number of committed loads")
-        .prereq(commitCommittedLoads);
-    commitCommittedMemRefs
-        .name(name() + ".commitCommittedMemRefs")
-        .desc("The number of committed memory references")
-        .prereq(commitCommittedMemRefs);
-*/
     branchMispredicts
         .name(name() + ".branchMispredicts")
         .desc("The number of times a branch was mispredicted")
@@ -178,42 +165,42 @@ DefaultCommit<Impl>::regStats()
         .flags(Stats::pdf)
         ;
 
-    stat_com_inst
+    statComInst
         .init(cpu->number_of_threads)
         .name(name() + ".COM:count")
         .desc("Number of instructions committed")
         .flags(total)
         ;
 
-    stat_com_swp
+    statComSwp
         .init(cpu->number_of_threads)
         .name(name() + ".COM:swp_count")
         .desc("Number of s/w prefetches committed")
         .flags(total)
         ;
 
-    stat_com_refs
+    statComRefs
         .init(cpu->number_of_threads)
         .name(name() +  ".COM:refs")
         .desc("Number of memory references committed")
         .flags(total)
         ;
 
-    stat_com_loads
+    statComLoads
         .init(cpu->number_of_threads)
         .name(name() +  ".COM:loads")
         .desc("Number of loads committed")
         .flags(total)
         ;
 
-    stat_com_membars
+    statComMembars
         .init(cpu->number_of_threads)
         .name(name() +  ".COM:membars")
         .desc("Number of memory barriers committed")
         .flags(total)
         ;
 
-    stat_com_branches
+    statComBranches
         .init(cpu->number_of_threads)
         .name(name() + ".COM:branches")
         .desc("Number of branches committed")
@@ -233,14 +220,14 @@ DefaultCommit<Impl>::regStats()
     //  -> The standard deviation is computed only over cycles where
     //  we reached the BW limit
     //
-    commit_eligible
+    commitEligible
         .init(cpu->number_of_threads)
         .name(name() + ".COM:bw_limited")
         .desc("number of insts not committed due to BW limits")
         .flags(total)
         ;
 
-    commit_eligible_samples
+    commitEligibleSamples
         .name(name() + ".COM:bw_lim_events")
         .desc("number cycles where commit BW limit reached")
         ;
@@ -257,8 +244,8 @@ DefaultCommit<Impl>::setCPU(FullCPU *cpu_ptr)
     // the simulation, so it starts as active.
     cpu->activateStage(FullCPU::CommitIdx);
 
-    trapLatency = cpu->cycles(6);
-    fetchTrapLatency = cpu->cycles(12);
+    trapLatency = cpu->cycles(trapLatency);
+    fetchTrapLatency = cpu->cycles(fetchTrapLatency);
 }
 
 template <class Impl>
@@ -315,6 +302,13 @@ DefaultCommit<Impl>::setIEWQueue(TimeBuffer<IEWStruct> *iq_ptr)
     fromIEW = iewQueue->getWire(-iewToCommitDelay);
 }
 
+template <class Impl>
+void
+DefaultCommit<Impl>::setFetchStage(Fetch *fetch_stage)
+{
+    fetchStage = fetch_stage;
+}
+
 template <class Impl>
 void
 DefaultCommit<Impl>::setIEWStage(IEW *iew_stage)
@@ -369,6 +363,15 @@ template <class Impl>
 void
 DefaultCommit<Impl>::switchOut()
 {
+    switchPending = true;
+}
+
+template <class Impl>
+void
+DefaultCommit<Impl>::doSwitchOut()
+{
+    switchedOut = true;
+    switchPending = false;
     rob->switchOut();
 }
 
@@ -376,6 +379,7 @@ template <class Impl>
 void
 DefaultCommit<Impl>::takeOverFrom()
 {
+    switchedOut = false;
     _status = Active;
     _nextStatus = Inactive;
     for (int i=0; i < numThreads; i++) {
@@ -392,9 +396,17 @@ template <class Impl>
 void
 DefaultCommit<Impl>::updateStatus()
 {
-    if (commitStatus[0] == TrapPending ||
-        commitStatus[0] == FetchTrapPending) {
-        _nextStatus = Active;
+    // reset ROB changed variable
+    list<unsigned>::iterator threads = (*activeThreads).begin();
+    while (threads != (*activeThreads).end()) {
+        unsigned tid = *threads++;
+        changedROBNumEntries[tid] = false;
+
+        // Also check if any of the threads has a trap pending
+        if (commitStatus[tid] == TrapPending ||
+            commitStatus[tid] == FetchTrapPending) {
+            _nextStatus = Active;
+        }
     }
 
     if (_nextStatus == Inactive && _status == Active) {
@@ -406,13 +418,6 @@ DefaultCommit<Impl>::updateStatus()
     }
 
     _status = _nextStatus;
-
-    // reset ROB changed variable
-    list<unsigned>::iterator threads = (*activeThreads).begin();
-    while (threads != (*activeThreads).end()) {
-        unsigned tid = *threads++;
-        changedROBNumEntries[tid] = false;
-    }
 }
 
 template <class Impl>
@@ -488,14 +493,14 @@ DefaultCommit<Impl>::generateXCEvent(unsigned tid)
 
 template <class Impl>
 void
-DefaultCommit<Impl>::squashFromTrap(unsigned tid)
+DefaultCommit<Impl>::squashAll(unsigned tid)
 {
     // If we want to include the squashing instruction in the squash,
     // then use one older sequence number.
     // Hopefully this doesn't mess things up.  Basically I want to squash
     // all instructions of this thread.
     InstSeqNum squashed_inst = rob->isEmpty() ?
-        0 : rob->readHeadInst(tid)->seqNum - 1;
+        0 : rob->readHeadInst(tid)->seqNum - 1;;
 
     // All younger instructions will be squashed. Set the sequence
     // number as the youngest instruction in the ROB (0 in this case.
@@ -518,21 +523,22 @@ DefaultCommit<Impl>::squashFromTrap(unsigned tid)
 
     toIEW->commitInfo[tid].branchMispredict = false;
 
-//    toIEW->commitInfo[tid].branchTaken = fromIEW->branchTaken[tid];
-
     toIEW->commitInfo[tid].nextPC = PC[tid];
+}
+
+template <class Impl>
+void
+DefaultCommit<Impl>::squashFromTrap(unsigned tid)
+{
+    squashAll(tid);
 
     DPRINTF(Commit, "Squashing from trap, restarting at PC %#x\n", PC[tid]);
-    // Hopefully nobody tries to use the mispredPC becuase I said there
-    // wasn't a branch mispredict.
-//    toIEW->commitInfo[tid].mispredPC = fromIEW->mispredPC[tid];
 
     thread[tid]->trapPending = false;
     thread[tid]->inSyscall = false;
 
     trapSquash[tid] = false;
 
-    // Not sure what to set this to...
     commitStatus[tid] = ROBSquashing;
     cpu->activityThisCycle();
 
@@ -543,49 +549,13 @@ template <class Impl>
 void
 DefaultCommit<Impl>::squashFromXC(unsigned tid)
 {
-    // For now these are identical.  In the future, the squash from trap
-    // might execute the trap prior to the squash.
-
-    // If we want to include the squashing instruction in the squash,
-    // then use one older sequence number.
-    // Hopefully this doesn't mess things up.  Basically I want to squash
-    // all instructions of this thread.
-    InstSeqNum squashed_inst = rob->isEmpty() ?
-        0 : rob->readHeadInst(tid)->seqNum - 1;;
-
-    // All younger instructions will be squashed. Set the sequence
-    // number as the youngest instruction in the ROB (0 in this case.
-    // Hopefully nothing breaks.)
-    youngestSeqNum[tid] = 0;
-
-    rob->squash(squashed_inst, tid);
-    changedROBNumEntries[tid] = true;
-
-    // Send back the sequence number of the squashed instruction.
-    toIEW->commitInfo[tid].doneSeqNum = squashed_inst;
-
-    // Send back the squash signal to tell stages that they should
-    // squash.
-    toIEW->commitInfo[tid].squash = true;
-
-    // Send back the rob squashing signal so other stages know that
-    // the ROB is in the process of squashing.
-    toIEW->commitInfo[tid].robSquashing = true;
-
-    toIEW->commitInfo[tid].branchMispredict = false;
-
-//    toIEW->commitInfo[tid].branchTaken = fromIEW->branchTaken[tid];
-
-    toIEW->commitInfo[tid].nextPC = PC[tid];
+    squashAll(tid);
 
     DPRINTF(Commit, "Squashing from XC, restarting at PC %#x\n", PC[tid]);
-    // Hopefully nobody tries to use the mispredPC becuase I said there
-    // wasn't a branch mispredict.
-//    toIEW->commitInfo[tid].mispredPC = fromIEW->mispredPC[tid];
 
     thread[tid]->inSyscall = false;
     assert(!thread[tid]->trapPending);
-    // Not sure what to set this to...
+
     commitStatus[tid] = ROBSquashing;
     cpu->activityThisCycle();
 
@@ -594,22 +564,6 @@ DefaultCommit<Impl>::squashFromXC(unsigned tid)
     ++squashCounter;
 }
 
-template <class Impl>
-void
-DefaultCommit<Impl>::squashInFlightInsts(unsigned tid)
-{
-    // @todo: Fix this hardcoded number.
-    for (int i = 0; i < -5; ++i) {
-        for (int j = 0; j < (*iewQueue)[i].size; ++j) {
-            DynInstPtr inst = (*iewQueue)[i].insts[j];
-            if (inst->threadNumber == tid &&
-                !inst->isSquashed()) {
-                inst->setSquashed();
-            }
-        }
-    }
-}
-
 template <class Impl>
 void
 DefaultCommit<Impl>::tick()
@@ -617,13 +571,15 @@ DefaultCommit<Impl>::tick()
     wroteToTimeBuffer = false;
     _nextStatus = Inactive;
 
-    // If the ROB is currently in its squash sequence, then continue
-    // to squash.  In this case, commit does not do anything.  Otherwise
-    // run commit.
+    if (switchPending && rob->isEmpty() && !iewStage->hasStoresToWB()) {
+        cpu->signalSwitched();
+        return;
+    }
+
     list<unsigned>::iterator threads = (*activeThreads).begin();
 
-    // Maybe this should be dependent upon any of the commits actually
-    // squashing.
+    // Check if any of the threads are done squashing.  Change the
+    // status if they are done.
     while (threads != (*activeThreads).end()) {
         unsigned tid = *threads++;
 
@@ -673,7 +629,7 @@ DefaultCommit<Impl>::tick()
 
 
     if (wroteToTimeBuffer) {
-        DPRINTF(Activity,"Activity This Cycle.\n");
+        DPRINTF(Activity, "Activity This Cycle.\n");
         cpu->activityThisCycle();
     }
 
@@ -689,28 +645,23 @@ DefaultCommit<Impl>::commit()
     // Check for interrupts
     //////////////////////////////////////
 
-    // Process interrupts if interrupts are enabled and not in PAL mode.
-    // Take the PC from commit and write it to the IPR, then squash.  The
-    // interrupt completing will take care of restoring the PC from that value
-    // in the IPR.  Look at IPR[EXC_ADDR];
-    // hwrei() is what resets the PC to the place where instruction execution
-    // beings again.
 #if FULL_SYSTEM
-//#if 0
+    // Process interrupts if interrupts are enabled, not in PAL mode,
+    // and no other traps or external squashes are currently pending.
+    // @todo: Allow other threads to handle interrupts.
     if (cpu->checkInterrupts &&
         cpu->check_interrupts() &&
         !cpu->inPalMode(readPC()) &&
         !trapSquash[0] &&
         !xcSquash[0]) {
-//        commitStatus[0] = TrapPending;
+        // Tell fetch that there is an interrupt pending.  This will
+        // make fetch wait until it sees a non PAL-mode PC, at which
+        // point it stops fetching instructions.
         toIEW->commitInfo[0].interruptPending = true;
-        if (rob->isEmpty() && !iewStage->hasStoresToWB()) {
-            // Will need to squash all instructions currently in flight and have
-            // the interrupt handler restart at the last non-committed inst.
-            // Most of that can be handled through the trap() function.  The
-            // processInterrupts() function really just checks for interrupts
-            // and then calls trap() if there is an interrupt present.
 
+        // Wait until the ROB is empty and all stores have drained in
+        // order to enter the interrupt.
+        if (rob->isEmpty() && !iewStage->hasStoresToWB()) {
             // Not sure which thread should be the one to interrupt.  For now
             // always do thread 0.
             assert(!thread[0]->inSyscall);
@@ -738,26 +689,27 @@ DefaultCommit<Impl>::commit()
 #endif // FULL_SYSTEM
 
     ////////////////////////////////////
-    // Check for squash signal, handle that first
+    // Check for any possible squashes, handle them first
     ////////////////////////////////////
 
-    // Check if the IEW stage is telling the ROB to squash.
     list<unsigned>::iterator threads = (*activeThreads).begin();
 
     while (threads != (*activeThreads).end()) {
         unsigned tid = *threads++;
 
         if (fromFetch->fetchFault && commitStatus[0] != TrapPending) {
-            // Record the fault.  Wait until it's empty in the ROB.  Then handle the trap.
-            // Ignore it if there's already a trap pending as fetch will be redirected.
+            // Record the fault.  Wait until the ROB is empty.
+            // Then handle the trap.  Ignore it if there's already a
+            // trap pending as fetch will be redirected.
             fetchFault = fromFetch->fetchFault;
-            fetchFaultSN = fromFetch->fetchFaultSN;
             fetchFaultTick = curTick + fetchTrapLatency;
             commitStatus[0] = FetchTrapPending;
             DPRINTF(Commit, "Fault from fetch recorded.  Will trap if the "
                     "ROB empties without squashing the fault.\n");
             fetchTrapWait = 0;
         }
+
+        // Fetch may tell commit to clear the trap if it's been squashed.
         if (fromFetch->clearFetchFault) {
             DPRINTF(Commit, "Received clear fetch fault signal\n");
             fetchTrapWait = 0;
@@ -783,10 +735,6 @@ DefaultCommit<Impl>::commit()
             commitStatus[tid] != TrapPending &&
             fromIEW->squashedSeqNum[tid] <= youngestSeqNum[tid]) {
 
-            DPRINTF(Commit, "[tid:%u]: Squashing instructions in the "
-                    "ROB.\n",
-                    tid);
-
             DPRINTF(Commit, "[tid:%i]: Squashing due to PC %#x [sn:%i]\n",
                     tid,
                     fromIEW->mispredPC[tid],
@@ -814,11 +762,8 @@ DefaultCommit<Impl>::commit()
             rob->squash(squashed_inst, tid);
             changedROBNumEntries[tid] = true;
 
-            // Send back the sequence number of the squashed instruction.
             toIEW->commitInfo[tid].doneSeqNum = squashed_inst;
 
-            // Send back the squash signal to tell stages that they should
-            // squash.
             toIEW->commitInfo[tid].squash = true;
 
             // Send back the rob squashing signal so other stages know that
@@ -833,11 +778,7 @@ DefaultCommit<Impl>::commit()
 
             toIEW->commitInfo[tid].nextPC = fromIEW->nextPC[tid];
 
-            DPRINTF(Commit, "Squashing from IEW, restarting at PC %#x\n",
-                    fromIEW->nextPC[tid]);
-
-            toIEW->commitInfo[tid].mispredPC =
-                fromIEW->mispredPC[tid];
+            toIEW->commitInfo[tid].mispredPC = fromIEW->mispredPC[tid];
 
             if (toIEW->commitInfo[tid].branchMispredict) {
                 ++branchMispredicts;
@@ -882,10 +823,11 @@ DefaultCommit<Impl>::commitInsts()
 {
     ////////////////////////////////////
     // Handle commit
-    // Note that commit will be handled prior to the ROB so that the ROB
-    // only tries to commit instructions it has in this current cycle, and
-    // not instructions it is writing in during this cycle.
-    // Can't commit and squash things at the same time...
+    // Note that commit will be handled prior to putting new
+    // instructions in the ROB so that the ROB only tries to commit
+    // instructions it has in this current cycle, and not instructions
+    // it is writing in during this cycle.  Can't commit and squash
+    // things at the same time...
     ////////////////////////////////////
 
     DPRINTF(Commit, "Trying to commit instructions in the ROB.\n");
@@ -894,51 +836,58 @@ DefaultCommit<Impl>::commitInsts()
 
     DynInstPtr head_inst;
 #if FULL_SYSTEM
-    if (commitStatus[0] == FetchTrapPending) {
+    // Not the best way to check if the front end is empty, but it should
+    // work.
+    // @todo: Try to avoid directly accessing fetch.
+    if (commitStatus[0] == FetchTrapPending && rob->isEmpty()) {
         DPRINTF(Commit, "Fault from fetch is pending.\n");
-        if (rob->isEmpty()) {
-            fetchTrapWait++;
-            if (fetchTrapWait > 10000000) {
-                panic("Fetch trap has been pending for a long time!");
-            }
-            if (fetchFaultTick > curTick) {
-                DPRINTF(Commit, "Not enough cycles since fault, fault will "
-                        "happen on %lli\n",
-                        fetchFaultTick);
-                cpu->activityThisCycle();
-                return;
-            } else if (iewStage->hasStoresToWB()) {
-                DPRINTF(Commit, "IEW still has stores to WB.  Waiting until "
-                        "they are completed. fetchTrapWait:%i\n",
-                        fetchTrapWait);
-                cpu->activityThisCycle();
-                return;
-            } else if (cpu->inPalMode(readPC())) {
-                DPRINTF(Commit, "In pal mode right now. fetchTrapWait:%i\n",
-                        fetchTrapWait);
-                return;
-            }
-            fetchTrapWait = 0;
-            DPRINTF(Commit, "ROB is empty, handling fetch trap.\n");
 
-            assert(!thread[0]->inSyscall);
+        fetchTrapWait++;
+        if (fetchTrapWait > 10000000) {
+            panic("Fetch trap has been pending for a long time!");
+        }
+        if (fetchFaultTick > curTick) {
+            DPRINTF(Commit, "Not enough cycles since fault, fault will "
+                    "happen on %lli\n",
+                    fetchFaultTick);
+            cpu->activityThisCycle();
+            return;
+        } else if (iewStage->hasStoresToWB()) {
+            DPRINTF(Commit, "IEW still has stores to WB.  Waiting until "
+                    "they are completed. fetchTrapWait:%i\n",
+                    fetchTrapWait);
+            cpu->activityThisCycle();
+            return;
+        } else if (cpu->inPalMode(readPC())) {
+            DPRINTF(Commit, "In pal mode right now. fetchTrapWait:%i\n",
+                    fetchTrapWait);
+            return;
+        } else if (fetchStage->getYoungestSN() > youngestSeqNum[0]) {
+            DPRINTF(Commit, "Waiting for front end to drain. fetchTrapWait:%i\n",
+                    fetchTrapWait);
+            return;
+        }
+        fetchTrapWait = 0;
+        DPRINTF(Commit, "ROB is empty, handling fetch trap.\n");
 
-            thread[0]->inSyscall = true;
+        assert(!thread[0]->inSyscall);
 
-            // Consider holding onto the trap and waiting until the trap event
-            // happens for this to be executed.
-            cpu->trap(fetchFault, 0);
+        thread[0]->inSyscall = true;
 
-            // Exit state update mode to avoid accidental updating.
-            thread[0]->inSyscall = false;
+        // Consider holding onto the trap and waiting until the trap event
+        // happens for this to be executed.
+        cpu->trap(fetchFault, 0);
 
-            commitStatus[0] = TrapPending;
-            // Set it up so that we squash next cycle
-            trapSquash[0] = true;
-            return;
-        }
+        // Exit state update mode to avoid accidental updating.
+        thread[0]->inSyscall = false;
+
+        commitStatus[0] = TrapPending;
+        // Set it up so that we squash next cycle
+        trapSquash[0] = true;
+        return;
     }
 #endif
+
     // Commit as many instructions as possible until the commit bandwidth
     // limit is reached, or it becomes impossible to commit any more.
     while (num_committed < commitWidth) {
@@ -956,16 +905,13 @@ DefaultCommit<Impl>::commitInsts()
         DPRINTF(Commit, "Trying to commit head instruction, [sn:%i] [tid:%i]\n",
                 head_inst->seqNum, tid);
 
-        // If the head instruction is squashed, it is ready to retire at any
-        // time.  However, we need to avoid updating any other state
-        // incorrectly if it's already been squashed.
+        // If the head instruction is squashed, it is ready to retire
+        // (be removed from the ROB) at any time.
         if (head_inst->isSquashed()) {
 
             DPRINTF(Commit, "Retiring squashed instruction from "
                     "ROB.\n");
 
-            // Tell ROB to retire head instruction.  This retires the head
-            // inst in the ROB without affecting any other stages.
             rob->retireHead(commit_thread);
 
             ++commitSquashedInsts;
@@ -989,7 +935,6 @@ DefaultCommit<Impl>::commitInsts()
             if (commit_success) {
                 ++num_committed;
 
-                // Record that the number of ROB entries has changed.
                 changedROBNumEntries[tid] = true;
 
                 // Set the doneSeqNum to the youngest committed instruction.
@@ -1009,8 +954,11 @@ DefaultCommit<Impl>::commitInsts()
                 int count = 0;
                 Addr oldpc;
                 do {
+                    // Debug statement.  Checks to make sure we're not
+                    // currently updating state while handling PC events.
                     if (count == 0)
-                        assert(!thread[tid]->inSyscall && !thread[tid]->trapPending);
+                        assert(!thread[tid]->inSyscall &&
+                               !thread[tid]->trapPending);
                     oldpc = PC[tid];
                     cpu->system->pcEventQueue.service(
                         thread[tid]->getXCProxy());
@@ -1034,7 +982,7 @@ DefaultCommit<Impl>::commitInsts()
     numCommittedDist.sample(num_committed);
 
     if (num_committed == commitWidth) {
-        commit_eligible[0]++;
+        commitEligible[0]++;
     }
 }
 
@@ -1042,13 +990,12 @@ template <class Impl>
 bool
 DefaultCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num)
 {
-    // Make sure instruction is valid
     assert(head_inst);
 
     int tid = head_inst->threadNumber;
 
-    // If the instruction is not executed yet, then it is a non-speculative
-    // or store inst.  Signal backwards that it should be executed.
+    // If the instruction is not executed yet, then it will need extra
+    // handling.  Signal backwards that it should be executed.
     if (!head_inst->isExecuted()) {
         // Keep this number correct.  We have not yet actually executed
         // and committed this instruction.
@@ -1059,10 +1006,16 @@ DefaultCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num)
         if (head_inst->isNonSpeculative() ||
             head_inst->isMemBarrier() ||
             head_inst->isWriteBarrier()) {
+
+            DPRINTF(Commit, "Encountered a barrier or non-speculative "
+                    "instruction [sn:%lli] at the head of the ROB, PC %#x.\n",
+                    head_inst->seqNum, head_inst->readPC());
+
 #if !FULL_SYSTEM
-            // Hack to make sure syscalls aren't executed until all stores
-            // write back their data.  This direct communication shouldn't
-            // be used for anything other than this.
+            // Hack to make sure syscalls/memory barriers/quiesces
+            // aren't executed until all stores write back their data.
+            // This direct communication shouldn't be used for
+            // anything other than this.
             if (inst_num > 0 || iewStage->hasStoresToWB())
 #else
             if ((head_inst->isMemBarrier() || head_inst->isWriteBarrier() ||
@@ -1074,11 +1027,6 @@ DefaultCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num)
                 return false;
             }
 
-            DPRINTF(Commit, "Encountered a barrier or non-speculative "
-                    "instruction [sn:%lli] at the head of the ROB, PC %#x.\n",
-                    head_inst->seqNum, head_inst->readPC());
-
-            // Send back the non-speculative instruction's sequence number.
             toIEW->commitInfo[tid].nonSpecSeqNum = head_inst->seqNum;
 
             // Change the instruction so it won't try to commit again until
@@ -1093,7 +1041,7 @@ DefaultCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num)
                     head_inst->seqNum, head_inst->readPC());
 
             // Send back the non-speculative instruction's sequence
-            // number.  Maybe just tell the lsq to re-execute the load.
+            // number.  Tell the lsq to re-execute the load.
             toIEW->commitInfo[tid].nonSpecSeqNum = head_inst->seqNum;
             toIEW->commitInfo[tid].uncached = true;
             toIEW->commitInfo[tid].uncachedLoad = head_inst;
@@ -1107,76 +1055,77 @@ DefaultCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num)
         }
     }
 
-    // Now check if it's one of the special trap or barrier or
-    // serializing instructions.
-    if (head_inst->isThreadSync())/*  ||
-//        head_inst->isMemBarrier()  ||
-head_inst->isWriteBarrier())*/
-    {
+    if (head_inst->isThreadSync()) {
         // Not handled for now.
-        panic("Barrier instructions are not handled yet.\n");
+        panic("Thread sync instructions are not handled yet.\n");
     }
 
+    // Stores mark themselves as completed.
     if (!head_inst->isStore()) {
         head_inst->setCompleted();
     }
 
+    // Use checker prior to updating anything due to traps or PC
+    // based events.
+    if (cpu->checker) {
+        cpu->checker->tick(head_inst);
+    }
+
     // Check if the instruction caused a fault.  If so, trap.
     Fault inst_fault = head_inst->getFault();
 
     if (inst_fault != NoFault) {
-        if (!head_inst->isNop()) {
+        head_inst->setCompleted();
 #if FULL_SYSTEM
-            DPRINTF(Commit, "Inst [sn:%lli] PC %#x has a fault\n",
-                    head_inst->seqNum, head_inst->readPC());
+        DPRINTF(Commit, "Inst [sn:%lli] PC %#x has a fault\n",
+                head_inst->seqNum, head_inst->readPC());
 
-            if (iewStage->hasStoresToWB()) {
-                DPRINTF(Commit, "Stores outstanding, fault must wait.\n");
-                return false;
-            }
+        if (iewStage->hasStoresToWB() || inst_num > 0) {
+            DPRINTF(Commit, "Stores outstanding, fault must wait.\n");
+            return false;
+        }
 
-            assert(!thread[tid]->inSyscall);
+        if (cpu->checker && head_inst->isStore()) {
+            cpu->checker->tick(head_inst);
+        }
 
-            thread[tid]->inSyscall = true;
+        assert(!thread[tid]->inSyscall);
 
-            // Hack for now; DTB will sometimes need the machine instruction
-            // for when faults happen.  So we will set it here, prior to the
-            // DTB possibly needing it for this translation.
-            thread[tid]->setInst(
-                static_cast<TheISA::MachInst>(head_inst->staticInst->machInst));
+        // Mark that we're in state update mode so that the trap's
+        // execution doesn't generate extra squashes.
+        thread[tid]->inSyscall = true;
 
-            // Consider holding onto the trap and waiting until the trap event
-            // happens for this to be executed.
-            cpu->trap(inst_fault, tid);
+        // DTB will sometimes need the machine instruction for when
+        // faults happen.  So we will set it here, prior to the DTB
+        // possibly needing it for its fault.
+        thread[tid]->setInst(
+            static_cast<TheISA::MachInst>(head_inst->staticInst->machInst));
 
-            // Exit state update mode to avoid accidental updating.
-            thread[tid]->inSyscall = false;
+        // Execute the trap.  Although it's slightly unrealistic in
+        // terms of timing (as it doesn't wait for the full timing of
+        // the trap event to complete before updating state), it's
+        // needed to update the state as soon as possible.  This
+        // prevents external agents from changing any specific state
+        // that the trap needs.
+        cpu->trap(inst_fault, tid);
 
-            commitStatus[tid] = TrapPending;
+        // Exit state update mode to avoid accidental updating.
+        thread[tid]->inSyscall = false;
 
-            // Generate trap squash event.
-            generateTrapEvent(tid);
+        commitStatus[tid] = TrapPending;
 
-            return false;
-#else // !FULL_SYSTEM
-            panic("fault (%d) detected @ PC %08p", inst_fault,
-                  head_inst->PC);
-#endif // FULL_SYSTEM
-        }
-    }
+        // Generate trap squash event.
+        generateTrapEvent(tid);
 
-    // Check if we're really ready to commit.  If not then return false.
-    // I'm pretty sure all instructions should be able to commit if they've
-    // reached this far.  For now leave this in as a check.
-    if (!rob->isHeadReady(tid)) {
-        panic("Unable to commit head instruction!\n");
         return false;
+#else // !FULL_SYSTEM
+        panic("fault (%d) detected @ PC %08p", inst_fault,
+              head_inst->PC);
+#endif // FULL_SYSTEM
     }
 
     updateComInstStats(head_inst);
 
-    // Now that the instruction is going to be committed, finalize its
-    // trace data.
     if (head_inst->traceData) {
         head_inst->traceData->setFetchSeq(head_inst->seqNum);
         head_inst->traceData->setCPSeq(thread[tid]->numInst);
@@ -1201,13 +1150,7 @@ template <class Impl>
 void
 DefaultCommit<Impl>::getInsts()
 {
-    //////////////////////////////////////
-    // Handle ROB functions
-    //////////////////////////////////////
-
-    // Read any renamed instructions and place them into the ROB.  Do this
-    // prior to squashing to avoid having instructions in the ROB that
-    // don't get squashed properly.
+    // Read any renamed instructions and place them into the ROB.
     int insts_to_process = min((int)renameWidth, fromRename->size);
 
     for (int inst_num = 0; inst_num < insts_to_process; ++inst_num)
@@ -1246,7 +1189,8 @@ DefaultCommit<Impl>::markCompletedInsts()
          ++inst_num)
     {
         if (!fromIEW->insts[inst_num]->isSquashed()) {
-            DPRINTF(Commit, "[tid:%i]: Marking PC %#x, SN %i ready within ROB.\n",
+            DPRINTF(Commit, "[tid:%i]: Marking PC %#x, [sn:%lli] ready "
+                    "within ROB.\n",
                     fromIEW->insts[inst_num]->threadNumber,
                     fromIEW->insts[inst_num]->readPC(),
                     fromIEW->insts[inst_num]->seqNum);
@@ -1257,30 +1201,6 @@ DefaultCommit<Impl>::markCompletedInsts()
     }
 }
 
-template <class Impl>
-uint64_t
-DefaultCommit<Impl>::readPC()
-{
-    // @todo: Fix this single thread hack.
-    return PC[0];
-}
-
-template <class Impl>
-void
-DefaultCommit<Impl>::setSquashing(unsigned tid)
-{
-    if (_status == Inactive) {
-        DPRINTF(Activity, "Activating stage.\n");
-        _status = Active;
-        cpu->activateStage(FullCPU::CommitIdx);
-    }
-
-    if (commitStatus[tid] != ROBSquashing) {
-        commitStatus[tid] = ROBSquashing;
-        ++squashCounter;
-    }
-}
-
 template <class Impl>
 bool
 DefaultCommit<Impl>::robDoneSquashing()
@@ -1308,39 +1228,39 @@ DefaultCommit<Impl>::updateComInstStats(DynInstPtr &inst)
     //
 #ifdef TARGET_ALPHA
     if (inst->isDataPrefetch()) {
-        stat_com_swp[thread]++;
+        statComSwp[thread]++;
     } else {
-        stat_com_inst[thread]++;
+        statComInst[thread]++;
     }
 #else
-    stat_com_inst[thread]++;
+    statComInst[thread]++;
 #endif
 
     //
     //  Control Instructions
     //
     if (inst->isControl())
-        stat_com_branches[thread]++;
+        statComBranches[thread]++;
 
     //
     //  Memory references
     //
     if (inst->isMemRef()) {
-        stat_com_refs[thread]++;
+        statComRefs[thread]++;
 
         if (inst->isLoad()) {
-            stat_com_loads[thread]++;
+            statComLoads[thread]++;
         }
     }
 
     if (inst->isMemBarrier()) {
-        stat_com_membars[thread]++;
+        statComMembars[thread]++;
     }
 }
 
 ////////////////////////////////////////
 //                                    //
-//   SMT COMMIT POLICY MAITAINED HERE //
+//  SMT COMMIT POLICY MAINTAINED HERE //
 //                                    //
 ////////////////////////////////////////
 template <class Impl>
diff --git a/cpu/o3/cpu.cc b/cpu/o3/cpu.cc
index 59308d6a9..9a46f2e7c 100644
--- a/cpu/o3/cpu.cc
+++ b/cpu/o3/cpu.cc
@@ -35,6 +35,7 @@
 #endif
 #include "sim/root.hh"
 
+#include "cpu/checker/cpu.hh"
 #include "cpu/cpu_exec_context.hh"
 #include "cpu/exec_context.hh"
 #include "cpu/o3/alpha_dyn_inst.hh"
@@ -76,7 +77,6 @@ FullO3CPU<Impl>::TickEvent::description()
     return "FullO3CPU tick event";
 }
 
-//Call constructor to all the pipeline stages here
 template <class Impl>
 FullO3CPU<Impl>::FullO3CPU(Params *params)
     : BaseFullCPU(params),
@@ -126,13 +126,25 @@ FullO3CPU<Impl>::FullO3CPU(Params *params)
 //      pTable(params->pTable),
       mem(params->workload[0]->getMemory()),
 #endif // FULL_SYSTEM
-
+      switchCount(0),
       icacheInterface(params->icacheInterface),
       dcacheInterface(params->dcacheInterface),
-      deferRegistration(params->deferRegistration)
+      deferRegistration(params->deferRegistration),
+      numThreads(number_of_threads)
 {
     _status = Idle;
 
+    if (params->checker) {
+        BaseCPU *temp_checker = params->checker;
+        checker = dynamic_cast<Checker<DynInstPtr> *>(temp_checker);
+        checker->setMemory(mem);
+#if FULL_SYSTEM
+        checker->setSystem(params->system);
+#endif
+    } else {
+        checker = NULL;
+    }
+
 #if !FULL_SYSTEM
     thread.resize(number_of_threads);
     tids.resize(number_of_threads);
@@ -168,20 +180,18 @@ FullO3CPU<Impl>::FullO3CPU(Params *params)
     commit.setIEWQueue(&iewQueue);
     commit.setRenameQueue(&renameQueue);
 
+    commit.setFetchStage(&fetch);
     commit.setIEWStage(&iew);
     rename.setIEWStage(&iew);
     rename.setCommitStage(&commit);
 
-    //Make Sure That this a Valid Architeture
-    //@todo: move this up in constructor
-    numThreads = number_of_threads;
-
 #if !FULL_SYSTEM
     int active_threads = params->workload.size();
 #else
     int active_threads = 1;
 #endif
 
+    // Make sure that this is a valid architecture.
     assert(params->numPhysIntRegs   >= numThreads * TheISA::NumIntRegs);
     assert(params->numPhysFloatRegs >= numThreads * TheISA::NumFloatRegs);
 
@@ -357,7 +367,7 @@ FullO3CPU<Impl>::tick()
         cleanUpRemovedInsts();
     }
 
-    if (activityCount && !tickEvent.scheduled()) {
+    if (_status != SwitchedOut && activityCount && !tickEvent.scheduled()) {
         tickEvent.schedule(curTick + cycles(1));
     }
 
@@ -380,13 +390,7 @@ FullO3CPU<Impl>::init()
     for (int i = 0; i < number_of_threads; ++i)
         thread[i]->inSyscall = true;
 
-
-    // Need to do a copy of the xc->regs into the CPU's regfile so
-    // that it can start properly.
-
     for (int tid=0; tid < number_of_threads; tid++) {
-        // Need to do a copy of the xc->regs into the CPU's regfile so
-        // that it can start properly.
 #if FULL_SYSTEM
         ExecContext *src_xc = execContexts[tid];
 #else
@@ -406,8 +410,7 @@ FullO3CPU<Impl>::init()
     for (int i = 0; i < number_of_threads; ++i)
         thread[i]->inSyscall = false;
 
-    // Probably should just make a call to all the stages to init stage,
-    // regardless of whether or not they need it.  Keeps it more independent.
+    // Initialize stages.
     fetch.initStage();
     iew.initStage();
     rename.initStage();
@@ -570,7 +573,6 @@ template <class Impl>
 void
 FullO3CPU<Impl>::activateContext(int tid, int delay)
 {
-
     // Needs to set each stage to running as well.
     list<unsigned>::iterator isActive = find(
         activeThreads.begin(), activeThreads.end(), tid);
@@ -658,30 +660,46 @@ FullO3CPU<Impl>::haltContext(int tid)
 
 template <class Impl>
 void
-FullO3CPU<Impl>::switchOut(Sampler *sampler)
+FullO3CPU<Impl>::switchOut(Sampler *_sampler)
 {
-//    panic("FullO3CPU does not have a switch out function.\n");
+    sampler = _sampler;
+    switchCount = 0;
     fetch.switchOut();
     decode.switchOut();
     rename.switchOut();
     iew.switchOut();
     commit.switchOut();
+}
 
-    instList.clear();
-    while (!removeList.empty()) {
-        removeList.pop();
-    }
+template <class Impl>
+void
+FullO3CPU<Impl>::signalSwitched()
+{
+    if (++switchCount == 5) {
+        fetch.doSwitchOut();
+        rename.doSwitchOut();
+        commit.doSwitchOut();
+        instList.clear();
+        while (!removeList.empty()) {
+            removeList.pop();
+        }
 
-    if (tickEvent.scheduled())
-        tickEvent.squash();
-    sampler->signalSwitched();
-    _status = SwitchedOut;
+        if (checker)
+            checker->switchOut(sampler);
+
+        if (tickEvent.scheduled())
+            tickEvent.squash();
+        sampler->signalSwitched();
+        _status = SwitchedOut;
+    }
+    assert(switchCount <= 5);
 }
 
 template <class Impl>
 void
 FullO3CPU<Impl>::takeOverFrom(BaseCPU *oldCPU)
 {
+    // Flush out any old data from the activity buffers.
     for (int i = 0; i < 6; ++i) {
         timeBuffer.advance();
         fetchQueue.advance();
@@ -733,13 +751,6 @@ FullO3CPU<Impl>::takeOverFrom(BaseCPU *oldCPU)
         tickEvent.schedule(curTick);
 }
 
-template <class Impl>
-InstSeqNum
-FullO3CPU<Impl>::getAndIncrementInstSeq()
-{
-    return globalSeqNum++;
-}
-
 template <class Impl>
 uint64_t
 FullO3CPU<Impl>::readIntReg(int reg_idx)
@@ -982,14 +993,9 @@ FullO3CPU<Impl>::removeInstsNotInROB(unsigned tid)
     while (inst_it != end_it) {
         assert(!instList.empty());
 
-        bool break_loop = (inst_it == instList.begin());
-
         squashInstIt(inst_it, tid);
 
         inst_it--;
-
-        if (break_loop)
-            break;
     }
 
     // If the ROB was empty, then we actually need to remove the first
@@ -1095,8 +1101,6 @@ FullO3CPU<Impl>::dumpInsts()
         inst_list_it++;
         ++num;
     }
-
-
 }
 
 template <class Impl>
diff --git a/cpu/o3/cpu.hh b/cpu/o3/cpu.hh
index 621ddf541..789729e61 100644
--- a/cpu/o3/cpu.hh
+++ b/cpu/o3/cpu.hh
@@ -46,6 +46,8 @@
 #include "cpu/o3/thread_state.hh"
 #include "sim/process.hh"
 
+template <class>
+class Checker;
 class ExecContext;
 class MemInterface;
 class Process;
@@ -199,13 +201,16 @@ class FullO3CPU : public BaseFullCPU
      */
     void switchOut(Sampler *sampler);
 
+    void signalSwitched();
+
     /** Takes over from another CPU.
      *  @todo: Implement this.
      */
     void takeOverFrom(BaseCPU *oldCPU);
 
     /** Get the current instruction sequence number, and increment it. */
-    InstSeqNum getAndIncrementInstSeq();
+    InstSeqNum getAndIncrementInstSeq()
+    { return globalSeqNum++; }
 
 #if FULL_SYSTEM
     /** Check if this address is a valid instruction address. */
@@ -333,9 +338,9 @@ class FullO3CPU : public BaseFullCPU
      */
     std::queue<ListIt> removeList;
 
-#ifdef DEBUG
+//#ifdef DEBUG
     std::set<InstSeqNum> snList;
-#endif
+//#endif
 
     /** Records if instructions need to be removed this cycle due to being
      *  retired or squashed.
@@ -474,6 +479,8 @@ class FullO3CPU : public BaseFullCPU
     /** The global sequence number counter. */
     InstSeqNum globalSeqNum;
 
+    Checker<DynInstPtr> *checker;
+
 #if FULL_SYSTEM
     /** Pointer to the system. */
     System *system;
@@ -484,12 +491,16 @@ class FullO3CPU : public BaseFullCPU
     PhysicalMemory *physmem;
 #endif
 
-    // List of all ExecContexts.
-    std::vector<Thread *> thread;
-
     /** Pointer to memory. */
     FunctionalMemory *mem;
 
+    Sampler *sampler;
+
+    int switchCount;
+
+    // List of all ExecContexts.
+    std::vector<Thread *> thread;
+
 #if 0
     /** Page table pointer. */
     PageTable *pTable;
diff --git a/cpu/o3/decode_impl.hh b/cpu/o3/decode_impl.hh
index caa97067b..a419a8932 100644
--- a/cpu/o3/decode_impl.hh
+++ b/cpu/o3/decode_impl.hh
@@ -166,6 +166,7 @@ template <class Impl>
 void
 DefaultDecode<Impl>::switchOut()
 {
+    cpu->signalSwitched();
 }
 
 template <class Impl>
diff --git a/cpu/o3/fetch.hh b/cpu/o3/fetch.hh
index 6074831c6..b03d4afe3 100644
--- a/cpu/o3/fetch.hh
+++ b/cpu/o3/fetch.hh
@@ -165,6 +165,8 @@ class DefaultFetch
 
     void switchOut();
 
+    void doSwitchOut();
+
     void takeOverFrom();
 
     bool isSwitchedOut() { return switchedOut; }
@@ -371,6 +373,11 @@ class DefaultFetch
 
     bool switchedOut;
 
+  public:
+    InstSeqNum &getYoungestSN() { return youngestSN; }
+  private:
+    InstSeqNum youngestSN;
+
 #if !FULL_SYSTEM
     /** Page table pointer. */
 //    PageTable *pTable;
diff --git a/cpu/o3/fetch_impl.hh b/cpu/o3/fetch_impl.hh
index 92f923c65..b4ff69d89 100644
--- a/cpu/o3/fetch_impl.hh
+++ b/cpu/o3/fetch_impl.hh
@@ -372,6 +372,13 @@ void
 DefaultFetch<Impl>::switchOut()
 {
     switchedOut = true;
+    cpu->signalSwitched();
+}
+
+template <class Impl>
+void
+DefaultFetch<Impl>::doSwitchOut()
+{
     branchPred.switchOut();
 }
 
@@ -474,7 +481,7 @@ DefaultFetch<Impl>::fetchCacheLine(Addr fetch_PC, Fault &ret_fault, unsigned tid
     unsigned flags = 0;
 #endif // FULL_SYSTEM
 
-    if (interruptPending && flags == 0) {
+    if (interruptPending && flags == 0 || switchedOut) {
         // Hold off fetch from getting new instructions while an interrupt
         // is pending.
         return false;
@@ -508,7 +515,8 @@ DefaultFetch<Impl>::fetchCacheLine(Addr fetch_PC, Fault &ret_fault, unsigned tid
     // instruction.
     if (fault == NoFault) {
 #if FULL_SYSTEM
-        if (cpu->system->memctrl->badaddr(memReq[tid]->paddr)) {
+        if (cpu->system->memctrl->badaddr(memReq[tid]->paddr) ||
+            memReq[tid]->flags & UNCACHEABLE) {
             DPRINTF(Fetch, "Fetch: Bad address %#x (hopefully on a "
                     "misspeculating path!",
                     memReq[tid]->paddr);
@@ -625,8 +633,8 @@ DefaultFetch<Impl>::doSquash(const Addr &new_PC, unsigned tid)
 template<class Impl>
 void
 DefaultFetch<Impl>::squashFromDecode(const Addr &new_PC,
-                                    const InstSeqNum &seq_num,
-                                    unsigned tid)
+                                     const InstSeqNum &seq_num,
+                                     unsigned tid)
 {
     DPRINTF(Fetch, "[tid:%i]: Squashing from decode.\n",tid);
 
@@ -635,6 +643,7 @@ DefaultFetch<Impl>::squashFromDecode(const Addr &new_PC,
     // Tell the CPU to remove any instructions that are in flight between
     // fetch and decode.
     cpu->removeInstsUntil(seq_num, tid);
+    youngestSN = seq_num;
 }
 
 template<class Impl>
@@ -820,6 +829,7 @@ DefaultFetch<Impl>::checkSignalsAndUpdate(unsigned tid)
 
         // In any case, squash.
         squash(fromCommit->commitInfo[tid].nextPC,tid);
+        youngestSN = fromCommit->commitInfo[tid].doneSeqNum;
 
         // Also check if there's a mispredict that happened.
         if (fromCommit->commitInfo[tid].branchMispredict) {
@@ -999,6 +1009,8 @@ DefaultFetch<Impl>::fetch(bool &status_change)
             // Get a sequence number.
             inst_seq = cpu->getAndIncrementInstSeq();
 
+            youngestSN = inst_seq;
+
             // Make sure this is a valid index.
             assert(offset <= cacheBlkSize - instSize);
 
diff --git a/cpu/o3/iew.hh b/cpu/o3/iew.hh
index ae0ba6a21..72be25668 100644
--- a/cpu/o3/iew.hh
+++ b/cpu/o3/iew.hh
@@ -159,6 +159,8 @@ class DefaultIEW
 
     void switchOut();
 
+    void doSwitchOut();
+
     void takeOverFrom();
 
     bool isSwitchedOut() { return switchedOut; }
diff --git a/cpu/o3/iew_impl.hh b/cpu/o3/iew_impl.hh
index 42d83ee72..cbd7396f7 100644
--- a/cpu/o3/iew_impl.hh
+++ b/cpu/o3/iew_impl.hh
@@ -55,7 +55,11 @@ DefaultIEW<Impl>::LdWritebackEvent::process()
 
     //iewStage->ldstQueue.removeMSHR(inst->threadNumber,inst->seqNum);
 
-    if (inst->isSquashed() || iewStage->isSwitchedOut()) {
+    if (iewStage->isSwitchedOut()) {
+        inst = NULL;
+        return;
+    } else if (inst->isSquashed()) {
+        iewStage->wakeCPU();
         inst = NULL;
         return;
     }
@@ -440,8 +444,16 @@ DefaultIEW<Impl>::setPageTable(PageTable *pt_ptr)
 template <class Impl>
 void
 DefaultIEW<Impl>::switchOut()
+{
+    cpu->signalSwitched();
+}
+
+template <class Impl>
+void
+DefaultIEW<Impl>::doSwitchOut()
 {
     switchedOut = true;
+
     instQueue.switchOut();
     ldstQueue.switchOut();
     fuPool->switchOut();
diff --git a/cpu/o3/lsq_unit_impl.hh b/cpu/o3/lsq_unit_impl.hh
index 3bb9a81f8..dca808ac9 100644
--- a/cpu/o3/lsq_unit_impl.hh
+++ b/cpu/o3/lsq_unit_impl.hh
@@ -26,6 +26,7 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+#include "cpu/checker/cpu.hh"
 #include "cpu/o3/lsq_unit.hh"
 #include "base/str.hh"
 
@@ -690,6 +691,9 @@ LSQUnit<Impl>::writebackStores()
         }
         if (!(req->flags & LOCKED)) {
             storeQueue[storeWBIdx].inst->setCompleted();
+            if (cpu->checker) {
+                cpu->checker->tick(storeQueue[storeWBIdx].inst);
+            }
         }
 
         if (dcacheInterface) {
@@ -937,6 +941,11 @@ LSQUnit<Impl>::completeStore(int store_idx)
         stallingStoreIsn = 0;
         iewStage->replayMemInst(loadQueue[stallingLoadIdx]);
     }
+
+    storeQueue[store_idx].inst->setCompleted();
+    if (cpu->checker) {
+        cpu->checker->tick(storeQueue[store_idx].inst);
+    }
 }
 
 template <class Impl>
diff --git a/cpu/o3/regfile.hh b/cpu/o3/regfile.hh
index 78674c32c..ed1238d36 100644
--- a/cpu/o3/regfile.hh
+++ b/cpu/o3/regfile.hh
@@ -200,7 +200,7 @@ class PhysRegFile
                                   unsigned thread_id)
     {
         return miscRegs[thread_id].readRegWithEffect(misc_reg, fault,
-                                                     cpu->xcProxies[thread_id]);
+                                                     cpu->xcBase(thread_id));
     }
 
     Fault setMiscReg(int misc_reg, const MiscReg &val, unsigned thread_id)
@@ -212,7 +212,7 @@ class PhysRegFile
                                unsigned thread_id)
     {
         return miscRegs[thread_id].setRegWithEffect(misc_reg, val,
-                                                    cpu->xcProxies[thread_id]);
+                                                    cpu->xcBase(thread_id));
     }
 
 #if FULL_SYSTEM
diff --git a/cpu/o3/rename.hh b/cpu/o3/rename.hh
index 4c5c46356..dd2cb0c18 100644
--- a/cpu/o3/rename.hh
+++ b/cpu/o3/rename.hh
@@ -155,6 +155,8 @@ class DefaultRename
 
     void switchOut();
 
+    void doSwitchOut();
+
     void takeOverFrom();
 
     /** Squashes all instructions in a thread. */
diff --git a/cpu/o3/rename_impl.hh b/cpu/o3/rename_impl.hh
index d41058deb..db4bb2ffe 100644
--- a/cpu/o3/rename_impl.hh
+++ b/cpu/o3/rename_impl.hh
@@ -261,6 +261,13 @@ DefaultRename<Impl>::setScoreboard(Scoreboard *_scoreboard)
 template <class Impl>
 void
 DefaultRename<Impl>::switchOut()
+{
+    cpu->signalSwitched();
+}
+
+template <class Impl>
+void
+DefaultRename<Impl>::doSwitchOut()
 {
     for (int i = 0; i < numThreads; i++) {
         typename list<RenameHistory>::iterator hb_it = historyBuffer[i].begin();
-- 
cgit v1.2.3


From 36581a534240c322e1fc28b8bd6e8f13f2b0fefd Mon Sep 17 00:00:00 2001
From: Kevin Lim <ktlim@umich.edu>
Date: Wed, 17 May 2006 14:25:10 -0400
Subject: Faults generated at fetch are passed to the backend by creating a
 dummy nop instruction and giving it the fault.  This unifies front end faults
 and normal instruction faults.

cpu/checker/cpu.cc:
    Fixups for fetch fault being sent with the instruction.
cpu/o3/fetch_impl.hh:
cpu/ozone/front_end_impl.hh:
    Send any faults generated at fetch along with a fake nop instruction to the back end.  This avoids having to use direct communication to check if the entire front end has drained; it is naturally handled through the nop's fault being handled when it reaches the head of commit.
cpu/ozone/front_end.hh:
    Add extra status TrapPending.
cpu/ozone/lw_back_end_impl.hh:
    Fetch fault handled through a dummy nop carrying the fetch fault.

    Avoid putting Nops on the exeList.

--HG--
extra : convert_revision : 8d9899748b34c204763a49c48a9b5113864f5789
---
 cpu/o3/fetch_impl.hh | 107 ++++++++++++++++++---------------------------------
 1 file changed, 37 insertions(+), 70 deletions(-)

(limited to 'cpu/o3')

diff --git a/cpu/o3/fetch_impl.hh b/cpu/o3/fetch_impl.hh
index b4ff69d89..523719945 100644
--- a/cpu/o3/fetch_impl.hh
+++ b/cpu/o3/fetch_impl.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -27,22 +27,21 @@
  */
 
 #include "arch/isa_traits.hh"
-#include "sim/byteswap.hh"
 #include "cpu/exetrace.hh"
 #include "cpu/o3/fetch.hh"
 #include "mem/base_mem.hh"
 #include "mem/mem_interface.hh"
 #include "mem/mem_req.hh"
-
+#include "sim/byteswap.hh"
 #include "sim/root.hh"
 
 #if FULL_SYSTEM
+#include "arch/tlb.hh"
+#include "arch/vtophys.hh"
 #include "base/remote_gdb.hh"
 #include "mem/functional/memory_control.hh"
 #include "mem/functional/physical.hh"
 #include "sim/system.hh"
-#include "arch/tlb.hh"
-#include "arch/vtophys.hh"
 #else // !FULL_SYSTEM
 #include "mem/functional/functional.hh"
 #endif // FULL_SYSTEM
@@ -136,14 +135,7 @@ DefaultFetch<Impl>::DefaultFetch(Params *params)
 
         // Create a new memory request.
         memReq[tid] = NULL;
-//        memReq[tid] = new MemReq();
-/*
-        // Need a way of setting this correctly for parallel programs
-        // @todo: Figure out how to properly set asid vs thread_num.
-        memReq[tid]->asid = tid;
-        memReq[tid]->thread_num = tid;
-        memReq[tid]->data = new uint8_t[64];
-*/
+
         // Create space to store a cache line.
         cacheData[tid] = new uint8_t[cacheBlkSize];
 
@@ -261,10 +253,6 @@ DefaultFetch<Impl>::setCPU(FullCPU *cpu_ptr)
     DPRINTF(Fetch, "Setting the CPU pointer.\n");
     cpu = cpu_ptr;
 
-    // Set ExecContexts for Memory Requests
-//    for (int tid=0; tid < numThreads; tid++)
-//        memReq[tid]->xc = cpu->xcBase(tid);
-
     // Fetch needs to start fetching instructions at the very beginning,
     // so it must start up in active state.
     switchToActive();
@@ -362,9 +350,8 @@ DefaultFetch<Impl>::processCacheCompletion(MemReqPtr &req)
 
 //    memcpy(cacheData[tid], memReq[tid]->data, memReq[tid]->size);
 
-    // Reset the completion event to NULL.
+    // Reset the mem req to NULL.
     memReq[tid] = NULL;
-//    memReq[tid]->completionEvent = NULL;
 }
 
 template <class Impl>
@@ -468,10 +455,6 @@ template <class Impl>
 bool
 DefaultFetch<Impl>::fetchCacheLine(Addr fetch_PC, Fault &ret_fault, unsigned tid)
 {
-    // Check if the instruction exists within the cache.
-    // If it does, then proceed on to read the instruction and the rest
-    // of the instructions in the cache line until either the end of the
-    // cache line or a predicted taken branch is encountered.
     Fault fault = NoFault;
 
 #if FULL_SYSTEM
@@ -509,7 +492,7 @@ DefaultFetch<Impl>::fetchCacheLine(Addr fetch_PC, Fault &ret_fault, unsigned tid
 //#endif
 
     // In the case of faults, the fetch stage may need to stall and wait
-    // on what caused the fetch (ITB or Icache miss).
+    // for the ITB miss to be handled.
 
     // If translation was successful, attempt to read the first
     // instruction.
@@ -518,7 +501,7 @@ DefaultFetch<Impl>::fetchCacheLine(Addr fetch_PC, Fault &ret_fault, unsigned tid
         if (cpu->system->memctrl->badaddr(memReq[tid]->paddr) ||
             memReq[tid]->flags & UNCACHEABLE) {
             DPRINTF(Fetch, "Fetch: Bad address %#x (hopefully on a "
-                    "misspeculating path!",
+                    "misspeculating path)!",
                     memReq[tid]->paddr);
             ret_fault = TheISA::genMachineCheckFault();
             return false;
@@ -587,44 +570,9 @@ DefaultFetch<Impl>::doSquash(const Addr &new_PC, unsigned tid)
     if (fetchStatus[tid] == IcacheMissStall && icacheInterface) {
         DPRINTF(Fetch, "[tid:%i]: Squashing outstanding Icache miss.\n",
                 tid);
-//        icacheInterface->squash(tid);
-/*
-        if (memReq[tid]->completionEvent) {
-            if (memReq[tid]->completionEvent->scheduled()) {
-                memReq[tid]->completionEvent->squash();
-            } else {
-                delete memReq[tid]->completionEvent;
-                memReq[tid]->completionEvent = NULL;
-            }
-        }
-*/
         memReq[tid] = NULL;
     }
 
-    if (fetchStatus[tid] == TrapPending) {
-        // @todo: Hardcoded number here
-
-        // This is only effective if communication to and from commit
-        // is identical.  If it's faster to commit than it is from
-        // commit to here, then it causes problems.
-
-        bool found_fault = false;
-        for (int i = 0; i > -5; --i) {
-            if (fetchQueue->access(i)->fetchFault) {
-                DPRINTF(Fetch, "[tid:%i]: Fetch used to be in a trap, "
-                        "clearing it.\n",
-                        tid);
-                fetchQueue->access(i)->fetchFault = NoFault;
-                found_fault = true;
-            }
-        }
-        if (!found_fault) {
-            warn("%lli Fault from fetch not found in time buffer!",
-                 curTick);
-        }
-        toDecode->clearFetchFault = true;
-    }
-
     fetchStatus[tid] = Squashing;
 
     ++fetchSquashCycles;
@@ -643,7 +591,6 @@ DefaultFetch<Impl>::squashFromDecode(const Addr &new_PC,
     // Tell the CPU to remove any instructions that are in flight between
     // fetch and decode.
     cpu->removeInstsUntil(seq_num, tid);
-    youngestSN = seq_num;
 }
 
 template<class Impl>
@@ -829,7 +776,6 @@ DefaultFetch<Impl>::checkSignalsAndUpdate(unsigned tid)
 
         // In any case, squash.
         squash(fromCommit->commitInfo[tid].nextPC,tid);
-        youngestSN = fromCommit->commitInfo[tid].doneSeqNum;
 
         // Also check if there's a mispredict that happened.
         if (fromCommit->commitInfo[tid].branchMispredict) {
@@ -1009,8 +955,6 @@ DefaultFetch<Impl>::fetch(bool &status_change)
             // Get a sequence number.
             inst_seq = cpu->getAndIncrementInstSeq();
 
-            youngestSN = inst_seq;
-
             // Make sure this is a valid index.
             assert(offset <= cacheBlkSize - instSize);
 
@@ -1095,14 +1039,37 @@ DefaultFetch<Impl>::fetch(bool &status_change)
         // This stage will not be able to continue until all the ROB
         // slots are empty, at which point the fault can be handled.
         // The only other way it can wake up is if a squash comes along
-        // and changes the PC.  Not sure how to handle that case...perhaps
-        // have it handled by the upper level CPU class which peeks into the
-        // time buffer and sees if a squash comes along, in which case it
-        // changes the status.
+        // and changes the PC.
 #if FULL_SYSTEM
+        assert(numInst != fetchWidth);
+        // Get a sequence number.
+        inst_seq = cpu->getAndIncrementInstSeq();
+        // We will use a nop in order to carry the fault.
+        ext_inst = TheISA::NoopMachInst;
+
+        // Create a new DynInst from the dummy nop.
+        DynInstPtr instruction = new DynInst(ext_inst, fetch_PC,
+                                             next_PC,
+                                             inst_seq, cpu);
+        instruction->setPredTarg(next_PC + instSize);
+        instruction->setThread(tid);
+
+        instruction->setASID(tid);
+
+        instruction->setState(cpu->thread[tid]);
+
+        instruction->traceData = NULL;
+
+        instruction->setInstListIt(cpu->addInst(instruction));
+
+        instruction->fault = fault;
+
+        toDecode->insts[numInst] = instruction;
+        toDecode->size++;
+
         // Tell the commit stage the fault we had.
-        toDecode->fetchFault = fault;
-        toDecode->fetchFaultSN = cpu->globalSeqNum;
+//        toDecode->fetchFault = fault;
+//        toDecode->fetchFaultSN = cpu->globalSeqNum;
 
         DPRINTF(Fetch, "[tid:%i]: Blocked, need to handle the trap.\n",tid);
 
-- 
cgit v1.2.3


From c7e7d07ec395156015e3baf52048c403d28a6442 Mon Sep 17 00:00:00 2001
From: Kevin Lim <ktlim@umich.edu>
Date: Fri, 19 May 2006 14:27:46 -0400
Subject: Fixes for regression build errors.

--HG--
extra : convert_revision : 1f59c853cb0e327d7cf586021b5139f1242e4f28
---
 cpu/o3/alpha_cpu_impl.hh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'cpu/o3')

diff --git a/cpu/o3/alpha_cpu_impl.hh b/cpu/o3/alpha_cpu_impl.hh
index 856fcb1c8..58b2b3548 100644
--- a/cpu/o3/alpha_cpu_impl.hh
+++ b/cpu/o3/alpha_cpu_impl.hh
@@ -171,8 +171,7 @@ AlphaFullCPU<Impl>::AlphaXC::takeOverFrom(ExecContext *old_context)
     setCpuId(old_context->readCpuId());
 #if !FULL_SYSTEM
     thread->funcExeInst = old_context->readFuncExeInst();
-#endif
-
+#else
     EndQuiesceEvent *other_quiesce = old_context->getQuiesceEvent();
     if (other_quiesce) {
         // Point the quiesce event's XC at this XC so that it wakes up
@@ -184,6 +183,7 @@ AlphaFullCPU<Impl>::AlphaXC::takeOverFrom(ExecContext *old_context)
     }
 //    storeCondFailures = 0;
     cpu->lockFlag = false;
+#endif
 
     old_context->setStatus(ExecContext::Unallocated);
 
-- 
cgit v1.2.3


From c4a87f874a69535f70c0f6f2733ea716e32c70cf Mon Sep 17 00:00:00 2001
From: Kevin Lim <ktlim@umich.edu>
Date: Fri, 19 May 2006 15:37:52 -0400
Subject: Move activity tracking code into its own class.  Now the CPU no
 longer has to keep track of the activity tracking internals; it just calls
 advance() on the class and uses it to tell if it should deschedule itself.

SConscript:
    Split off activity/idling code into its own class to do the processing separately.
cpu/o3/alpha_cpu_builder.cc:
cpu/o3/alpha_params.hh:
    Activity stuff.  This is mostly for debugging and may be removed later on (or changed to enable/disable activity idling).
cpu/o3/cpu.cc:
    Move activity idling stuff mostly into its own class, so it no longer clutters this file.
cpu/o3/cpu.hh:
    Move activity idling stuff into its own class.
python/m5/objects/AlphaFullCPU.py:
    Add parameter for initial activity value.

--HG--
extra : convert_revision : f32f7cc03895dc07ab57ddba78c5402a1a8b0f1a
---
 cpu/o3/alpha_cpu_builder.cc |   3 +
 cpu/o3/alpha_params.hh      |   2 +
 cpu/o3/cpu.cc               | 167 ++++++++++++++------------------------------
 cpu/o3/cpu.hh               |  80 +++++++--------------
 4 files changed, 80 insertions(+), 172 deletions(-)

(limited to 'cpu/o3')

diff --git a/cpu/o3/alpha_cpu_builder.cc b/cpu/o3/alpha_cpu_builder.cc
index 0f9116d71..b0d812edc 100644
--- a/cpu/o3/alpha_cpu_builder.cc
+++ b/cpu/o3/alpha_cpu_builder.cc
@@ -48,6 +48,7 @@ BEGIN_DECLARE_SIM_OBJECT_PARAMS(DerivAlphaFullCPU)
 
     Param<int> clock;
     Param<int> numThreads;
+Param<int> activity;
 
 #if FULL_SYSTEM
 SimObjectParam<System *> system;
@@ -156,6 +157,7 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivAlphaFullCPU)
 
     INIT_PARAM(clock, "clock speed"),
     INIT_PARAM(numThreads, "number of HW thread contexts"),
+    INIT_PARAM_DFLT(activity, "Initial activity count", 0),
 
 #if FULL_SYSTEM
     INIT_PARAM(system, "System object"),
@@ -301,6 +303,7 @@ CREATE_SIM_OBJECT(DerivAlphaFullCPU)
 
     params->name = getInstanceName();
     params->numberOfThreads = actual_num_threads;
+    params->activity = activity;
 
 #if FULL_SYSTEM
     params->system = system;
diff --git a/cpu/o3/alpha_params.hh b/cpu/o3/alpha_params.hh
index b8ebae21e..e3acf2c05 100644
--- a/cpu/o3/alpha_params.hh
+++ b/cpu/o3/alpha_params.hh
@@ -64,6 +64,8 @@ class AlphaSimpleParams : public BaseFullCPU::Params
 
     BaseCPU *checker;
 
+    unsigned activity;
+
     //
     // Caches
     //
diff --git a/cpu/o3/cpu.cc b/cpu/o3/cpu.cc
index 9a46f2e7c..8d72bdc41 100644
--- a/cpu/o3/cpu.cc
+++ b/cpu/o3/cpu.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -33,8 +33,8 @@
 #else
 #include "sim/process.hh"
 #endif
-#include "sim/root.hh"
 
+#include "cpu/activity.hh"
 #include "cpu/checker/cpu.hh"
 #include "cpu/cpu_exec_context.hh"
 #include "cpu/exec_context.hh"
@@ -42,6 +42,7 @@
 #include "cpu/o3/alpha_impl.hh"
 #include "cpu/o3/cpu.hh"
 
+#include "sim/root.hh"
 #include "sim/stat_control.hh"
 
 using namespace std;
@@ -104,16 +105,15 @@ FullO3CPU<Impl>::FullO3CPU(Params *params)
                  TheISA::NumMiscRegs * number_of_threads,
                  TheISA::ZeroReg),
 
-      // What to pass to these time buffers?
       // For now just have these time buffers be pretty big.
-      // @todo: Make these time buffer sizes parameters.
+      // @todo: Make these time buffer sizes parameters or derived
+      // from latencies
       timeBuffer(5, 5),
       fetchQueue(5, 5),
       decodeQueue(5, 5),
       renameQueue(5, 5),
       iewQueue(5, 5),
-      activityBuffer(5, 0),
-      activityCount(0),
+      activityRec(NumStages, 10, params->activity),
 
       globalSeqNum(1),
 
@@ -150,9 +150,9 @@ FullO3CPU<Impl>::FullO3CPU(Params *params)
     tids.resize(number_of_threads);
 #endif
 
-    // The stages also need their CPU pointer setup.  However this must be
-    // done at the upper level CPU because they have pointers to the upper
-    // level CPU, and not this FullO3CPU.
+    // The stages also need their CPU pointer setup.  However this
+    // must be done at the upper level CPU because they have pointers
+    // to the upper level CPU, and not this FullO3CPU.
 
     // Set up Pointers to the activeThreads list for each stage
     fetch.setActiveThreads(&activeThreads);
@@ -207,11 +207,11 @@ FullO3CPU<Impl>::FullO3CPU(Params *params)
 
         commitRenameMap[tid].init(TheISA::NumIntRegs,
                                   params->numPhysIntRegs,
-                                  lreg_idx,                   //Index for Logical. Regs
+                                  lreg_idx,            //Index for Logical. Regs
 
                                   TheISA::NumFloatRegs,
                                   params->numPhysFloatRegs,
-                                  freg_idx,                   //Index for Float Regs
+                                  freg_idx,            //Index for Float Regs
 
                                   TheISA::NumMiscRegs,
 
@@ -223,11 +223,11 @@ FullO3CPU<Impl>::FullO3CPU(Params *params)
 
         renameMap[tid].init(TheISA::NumIntRegs,
                             params->numPhysIntRegs,
-                            lreg_idx,                   //Index for Logical. Regs
+                            lreg_idx,                  //Index for Logical. Regs
 
                             TheISA::NumFloatRegs,
                             params->numPhysFloatRegs,
-                            freg_idx,                   //Index for Float Regs
+                            freg_idx,                  //Index for Float Regs
 
                             TheISA::NumMiscRegs,
 
@@ -258,10 +258,6 @@ FullO3CPU<Impl>::FullO3CPU(Params *params)
 
     lastRunningCycle = curTick;
 
-    for (int i = 0; i < NumStages; ++i) {
-        stageActive[i] = false;
-    }
-
     contextSwitch = false;
 }
 
@@ -336,7 +332,7 @@ FullO3CPU<Impl>::tick()
 
     ++numCycles;
 
-    activity = false;
+//    activity = false;
 
     //Tick each of the stages
     fetch.tick();
@@ -361,14 +357,22 @@ FullO3CPU<Impl>::tick()
     renameQueue.advance();
     iewQueue.advance();
 
-    advanceActivityBuffer();
+    activityRec.advance();
 
     if (removeInstsThisCycle) {
         cleanUpRemovedInsts();
     }
 
-    if (_status != SwitchedOut && activityCount && !tickEvent.scheduled()) {
-        tickEvent.schedule(curTick + cycles(1));
+    if (!tickEvent.scheduled()) {
+        if (_status == SwitchedOut) {
+            // increment stat
+            lastRunningCycle = curTick;
+        } else if (!activityRec.active()) {
+            lastRunningCycle = curTick;
+            timesIdled++;
+        } else {
+            tickEvent.schedule(curTick + cycles(1));
+        }
     }
 
 #if !FULL_SYSTEM
@@ -592,7 +596,7 @@ FullO3CPU<Impl>::activateContext(int tid, int delay)
 
     // Be sure to signal that there's some activity so the CPU doesn't
     // deschedule itself.
-    activityThisCycle();
+    activityRec.activity();
     fetch.wakeFromQuiesce();
 
     _status = Running;
@@ -669,13 +673,18 @@ FullO3CPU<Impl>::switchOut(Sampler *_sampler)
     rename.switchOut();
     iew.switchOut();
     commit.switchOut();
+
+    // Wake the CPU and record activity so everything can drain out if
+    // the CPU is currently idle.
+    wakeCPU();
+    activityRec.activity();
 }
 
 template <class Impl>
 void
 FullO3CPU<Impl>::signalSwitched()
 {
-    if (++switchCount == 5) {
+    if (++switchCount == NumStages) {
         fetch.doSwitchOut();
         rename.doSwitchOut();
         commit.doSwitchOut();
@@ -699,18 +708,16 @@ template <class Impl>
 void
 FullO3CPU<Impl>::takeOverFrom(BaseCPU *oldCPU)
 {
-    // Flush out any old data from the activity buffers.
-    for (int i = 0; i < 6; ++i) {
+    // Flush out any old data from the time buffers.
+    for (int i = 0; i < 10; ++i) {
         timeBuffer.advance();
         fetchQueue.advance();
         decodeQueue.advance();
         renameQueue.advance();
         iewQueue.advance();
-        activityBuffer.advance();
     }
 
-    activityCount = 0;
-    bzero(&stageActive, sizeof(stageActive));
+    activityRec.reset();
 
     BaseCPU::takeOverFrom(oldCPU);
 
@@ -722,23 +729,23 @@ FullO3CPU<Impl>::takeOverFrom(BaseCPU *oldCPU)
 
     assert(!tickEvent.scheduled());
 
-    // @todo: Figure out how to properly select the tid to put onto the active threads list.
+    // @todo: Figure out how to properly select the tid to put onto
+    // the active threads list.
     int tid = 0;
 
     list<unsigned>::iterator isActive = find(
         activeThreads.begin(), activeThreads.end(), tid);
 
     if (isActive == activeThreads.end()) {
-        //May Need to Re-code this if the delay variable is the
-        //delay needed for thread to activate
+        //May Need to Re-code this if the delay variable is the delay
+        //needed for thread to activate
         DPRINTF(FullCPU, "Adding Thread %i to active threads list\n",
                 tid);
 
         activeThreads.push_back(tid);
     }
 
-    // Set all status's to active, schedule the
-    // CPU's tick event.
+    // Set all statuses to active, schedule the CPU's tick event.
     // @todo: Fix up statuses so this is handled properly
     for (int i = 0; i < execContexts.size(); ++i) {
         ExecContext *xc = execContexts[i];
@@ -850,10 +857,6 @@ template <class Impl>
 void
 FullO3CPU<Impl>::setArchIntReg(int reg_idx, uint64_t val, unsigned tid)
 {
-    if (reg_idx == TheISA::ZeroReg) {
-        warn("Setting r31 through ArchIntReg in CPU, cycle %i\n", curTick);
-    }
-
     PhysRegIndex phys_reg = commitRenameMap[tid].lookup(reg_idx);
 
     regFile.setIntReg(phys_reg, val);
@@ -1049,8 +1052,8 @@ FullO3CPU<Impl>::squashInstIt(const ListIt &instIt, const unsigned &tid)
         // Mark it as squashed.
         (*instIt)->setSquashed();
 
-        //@todo: Formulate a consistent method for deleting
-        //instructions from the instruction list
+        // @todo: Formulate a consistent method for deleting
+        // instructions from the instruction list
         // Remove the instruction from the list.
         removeList.push(instIt);
     }
@@ -1074,14 +1077,14 @@ FullO3CPU<Impl>::cleanUpRemovedInsts()
 
     removeInstsThisCycle = false;
 }
-
+/*
 template <class Impl>
 void
 FullO3CPU<Impl>::removeAllInsts()
 {
     instList.clear();
 }
-
+*/
 template <class Impl>
 void
 FullO3CPU<Impl>::dumpInsts()
@@ -1102,96 +1105,28 @@ FullO3CPU<Impl>::dumpInsts()
         ++num;
     }
 }
-
+/*
 template <class Impl>
 void
 FullO3CPU<Impl>::wakeDependents(DynInstPtr &inst)
 {
     iew.wakeDependents(inst);
 }
-
+*/
 template <class Impl>
 void
 FullO3CPU<Impl>::wakeCPU()
 {
-    if (activityCount || tickEvent.scheduled()) {
-        return;
-    }
-
-    idleCycles += curTick - lastRunningCycle;
-
-    tickEvent.schedule(curTick);
-}
-
-template <class Impl>
-void
-FullO3CPU<Impl>::activityThisCycle()
-{
-    if (activityBuffer[0]) {
+    if (activityRec.active() || tickEvent.scheduled()) {
+        DPRINTF(Activity, "CPU already running.\n");
         return;
     }
 
-    activityBuffer[0] = true;
-    activity = true;
-    ++activityCount;
-
-    DPRINTF(Activity, "Activity: %i\n", activityCount);
-}
-
-template <class Impl>
-void
-FullO3CPU<Impl>::advanceActivityBuffer()
-{
-    if (activityBuffer[-5]) {
-        --activityCount;
-
-        assert(activityCount >= 0);
-
-        DPRINTF(Activity, "Activity: %i\n", activityCount);
-
-        if (activityCount == 0) {
-            DPRINTF(FullCPU, "No activity left, going to idle!\n");
-            lastRunningCycle = curTick;
-            timesIdled++;
-        }
-    }
-
-    activityBuffer.advance();
-}
-
-template <class Impl>
-void
-FullO3CPU<Impl>::activateStage(const StageIdx idx)
-{
-    if (!stageActive[idx]) {
-        ++activityCount;
-
-        stageActive[idx] = true;
-
-        DPRINTF(Activity, "Activity: %i\n", activityCount);
-    } else {
-        DPRINTF(Activity, "Stage %i already active.\n", idx);
-    }
-
-    // @todo: Number is hardcoded for now.  Replace with parameter.
-    assert(activityCount < 15);
-}
-
-template <class Impl>
-void
-FullO3CPU<Impl>::deactivateStage(const StageIdx idx)
-{
-    if (stageActive[idx]) {
-        --activityCount;
-
-        stageActive[idx] = false;
+    DPRINTF(Activity, "Waking up CPU\n");
 
-        DPRINTF(Activity, "Activity: %i\n", activityCount);
-    } else {
-        DPRINTF(Activity, "Stage %i already inactive.\n", idx);
-    }
+    idleCycles += (curTick - 1) - lastRunningCycle;
 
-    assert(activityCount >= 0);
+    tickEvent.schedule(curTick);
 }
 
 template <class Impl>
diff --git a/cpu/o3/cpu.hh b/cpu/o3/cpu.hh
index 789729e61..8db65d501 100644
--- a/cpu/o3/cpu.hh
+++ b/cpu/o3/cpu.hh
@@ -26,8 +26,8 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __CPU_O3_FULL_CPU_HH__
-#define __CPU_O3_FULL_CPU_HH__
+#ifndef __CPU_O3_CPU_HH__
+#define __CPU_O3_CPU_HH__
 
 #include <iostream>
 #include <list>
@@ -38,6 +38,7 @@
 #include "base/statistics.hh"
 #include "base/timebuf.hh"
 #include "config/full_system.hh"
+#include "cpu/activity.hh"
 #include "cpu/base.hh"
 #include "cpu/cpu_exec_context.hh"
 #include "cpu/o3/comm.hh"
@@ -70,7 +71,7 @@ template <class Impl>
 class FullO3CPU : public BaseFullCPU
 {
   public:
-    //Put typedefs from the Impl here.
+    // Typedefs from the Impl here.
     typedef typename Impl::CPUPol CPUPolicy;
     typedef typename Impl::Params Params;
     typedef typename Impl::DynInstPtr DynInstPtr;
@@ -191,20 +192,18 @@ class FullO3CPU : public BaseFullCPU
      *  Note: this is a virtual function. CPU-Specific
      *  functionality defined in derived classes
      */
-    virtual void syscall(int tid) {}
+    virtual void syscall(int tid) { panic("Unimplemented!"); }
 
     /** Check if there are any system calls pending. */
     void checkSyscalls();
 
     /** Switches out this CPU.
-     *  @todo: Implement this.
      */
     void switchOut(Sampler *sampler);
 
     void signalSwitched();
 
     /** Takes over from another CPU.
-     *  @todo: Implement this.
      */
     void takeOverFrom(BaseCPU *oldCPU);
 
@@ -299,12 +298,8 @@ class FullO3CPU : public BaseFullCPU
     /** Add Instructions to the CPU Remove List*/
     void addToRemoveList(DynInstPtr &inst);
 
-    /** Remove an instruction from the front of the list.  It is expected
-     *  that there are no instructions in front of it (that is, none are older
-     *  than the instruction being removed).  Used when retiring instructions.
-     *  @todo: Remove the argument to this function, and just have it remove
-     *  last instruction once it's verified that commit has the same ordering
-     *  as the instruction list.
+    /** Remove an instruction from the front end of the list.  There's
+     *  no restriction on location of the instruction.
      */
     void removeFrontInst(DynInstPtr &inst);
 
@@ -319,15 +314,15 @@ class FullO3CPU : public BaseFullCPU
     void cleanUpRemovedInsts();
 
     /** Remove all instructions from the list. */
-    void removeAllInsts();
+//    void removeAllInsts();
 
     void dumpInsts();
 
     /** Basically a wrapper function so that instructions executed at
-     *  commit can tell the instruction queue that they have completed.
-     *  Eventually this hack should be removed.
+     *  commit can tell the instruction queue that they have
+     *  completed.  Eventually this hack should be removed.
      */
-    void wakeDependents(DynInstPtr &inst);
+//    void wakeDependents(DynInstPtr &inst);
 
   public:
     /** List of all the instructions in flight. */
@@ -338,12 +333,12 @@ class FullO3CPU : public BaseFullCPU
      */
     std::queue<ListIt> removeList;
 
-//#ifdef DEBUG
+#ifdef DEBUG
     std::set<InstSeqNum> snList;
-//#endif
+#endif
 
-    /** Records if instructions need to be removed this cycle due to being
-     *  retired or squashed.
+    /** Records if instructions need to be removed this cycle due to
+     *  being retired or squashed.
      */
     bool removeInstsThisCycle;
 
@@ -425,46 +420,19 @@ class FullO3CPU : public BaseFullCPU
     /** The IEW stage's instruction queue. */
     TimeBuffer<IEWStruct> iewQueue;
 
-  private:
-    /** Time buffer that tracks if any cycles has active communication in them.
-     *  It should be as long as the longest communication latency in the system.
-     *  Each time any time buffer is written, the activity buffer should also
-     *  be written to. The activityBuffer is advanced along with all the other
-     *  time buffers, so it should always have a 1 somewhere in it only if there
-     *  is active communication in a time buffer.
-     */
-    TimeBuffer<bool> activityBuffer;
-
-    /** Tracks how many stages and cycles of time buffer have activity. Stages
-     *  increment this count when they switch to active, and decrement it when
-     *  they switch to inactive. Whenever a cycle that previously had no
-     *  information is written in the time buffer, this is incremented. When
-     *  a cycle that had information exits the time buffer due to age, this
-     *  count is decremented. When the count is 0, there is no activity in the
-     *  CPU, and it can be descheduled.
-     */
-    int activityCount;
+  public:
+    ActivityRecorder activityRec;
 
-    /** Records if there has been activity this cycle. */
-    bool activity;
+    void activityThisCycle() { activityRec.activity(); }
 
-    /** Records which stages are active/inactive. */
-    bool stageActive[NumStages];
+    void activateStage(const StageIdx idx)
+    { activityRec.activateStage(idx); }
+
+    void deactivateStage(const StageIdx idx)
+    { activityRec.deactivateStage(idx); }
 
-  public:
     /** Wakes the CPU, rescheduling the CPU if it's not already active. */
     void wakeCPU();
-    /** Records that there is activity this cycle. */
-    void activityThisCycle();
-    /** Advances the activity buffer, decrementing the activityCount if active
-     *  communication just left the time buffer, and descheduling the CPU if
-     *  there is no activity.
-     */
-    void advanceActivityBuffer();
-    /** Marks a stage as active. */
-    void activateStage(const StageIdx idx);
-    /** Deactivates a stage. */
-    void deactivateStage(const StageIdx idx);
 
     /** Gets a free thread id. Use if thread ids change across system. */
     int getFreeTid();
@@ -550,4 +518,4 @@ class FullO3CPU : public BaseFullCPU
     Stats::Formula totalIpc;
 };
 
-#endif
+#endif // __CPU_O3_CPU_HH__
-- 
cgit v1.2.3


From 5df3e61f168a5dd7d86ba2f81538539622d77bd2 Mon Sep 17 00:00:00 2001
From: Kevin Lim <ktlim@umich.edu>
Date: Fri, 19 May 2006 15:44:03 -0400
Subject: IEW/IQ code cleanup and reorganization. Dependency graph code moved
 into its own class. This requires the changes to the functional units, which
 is in the next check in.

cpu/o3/iew.hh:
cpu/o3/iew_impl.hh:
    IEW and IQ code cleanup and reorganization.
cpu/o3/inst_queue.cc:
    Dependency graph code moved into its own class now.
cpu/o3/inst_queue.hh:
    IEW/IQ code cleanup and reorganization.
    Dependency graph code moved into its own class.
cpu/o3/inst_queue_impl.hh:
    IEW/IQ code cleanup and reorganization.
    Dependency graph code moved into its own class.
    Issue loop cleaned up, with completion events for functional units now used more correctly (before they weren't used for multi-cycle ops with pipelined FUs).

--HG--
extra : convert_revision : 35e50192df6f71dc81d46a73fdd65f7ec07c10e4
---
 cpu/o3/dep_graph.hh       | 213 ++++++++++++++++++++
 cpu/o3/iew.hh             |  59 +++---
 cpu/o3/iew_impl.hh        | 152 ++++++---------
 cpu/o3/inst_queue.cc      |   4 -
 cpu/o3/inst_queue.hh      | 114 +++--------
 cpu/o3/inst_queue_impl.hh | 480 ++++++++++++++--------------------------------
 6 files changed, 469 insertions(+), 553 deletions(-)
 create mode 100644 cpu/o3/dep_graph.hh

(limited to 'cpu/o3')

diff --git a/cpu/o3/dep_graph.hh b/cpu/o3/dep_graph.hh
new file mode 100644
index 000000000..f8ae38da4
--- /dev/null
+++ b/cpu/o3/dep_graph.hh
@@ -0,0 +1,213 @@
+
+#ifndef __CPU_O3_DEP_GRAPH_HH__
+#define __CPU_O3_DEP_GRAPH_HH__
+
+#include "cpu/o3/comm.hh"
+
+template <class DynInstPtr>
+class DependencyEntry
+{
+  public:
+    DependencyEntry()
+        : inst(NULL), next(NULL)
+    { }
+
+    DynInstPtr inst;
+    //Might want to include data about what arch. register the
+    //dependence is waiting on.
+    DependencyEntry<DynInstPtr> *next;
+};
+
+template <class DynInstPtr>
+class DependencyGraph
+{
+  public:
+    typedef DependencyEntry<DynInstPtr> DepEntry;
+
+    DependencyGraph()
+        : numEntries(0), memAllocCounter(0), nodesTraversed(0), nodesRemoved(0)
+    { }
+
+    void resize(int num_entries);
+
+    void reset();
+
+    void insert(PhysRegIndex idx, DynInstPtr &new_inst);
+
+    void setInst(PhysRegIndex idx, DynInstPtr &new_inst)
+    { dependGraph[idx].inst = new_inst; }
+
+    void clearInst(PhysRegIndex idx)
+    { dependGraph[idx].inst = NULL; }
+
+    void remove(PhysRegIndex idx, DynInstPtr &inst_to_remove);
+
+    DynInstPtr pop(PhysRegIndex idx);
+
+    bool empty(PhysRegIndex idx) { return !dependGraph[idx].next; }
+
+    /** Debugging function to dump out the dependency graph.
+     */
+    void dump();
+
+  private:
+    /** Array of linked lists.  Each linked list is a list of all the
+     *  instructions that depend upon a given register.  The actual
+     *  register's index is used to index into the graph; ie all
+     *  instructions in flight that are dependent upon r34 will be
+     *  in the linked list of dependGraph[34].
+     */
+    DepEntry *dependGraph;
+
+    int numEntries;
+
+    // Debug variable, remove when done testing.
+    unsigned memAllocCounter;
+
+  public:
+    uint64_t nodesTraversed;
+    uint64_t nodesRemoved;
+};
+
+template <class DynInstPtr>
+void
+DependencyGraph<DynInstPtr>::resize(int num_entries)
+{
+    numEntries = num_entries;
+    dependGraph = new DepEntry[numEntries];
+}
+
+template <class DynInstPtr>
+void
+DependencyGraph<DynInstPtr>::reset()
+{
+    // Clear the dependency graph
+    DepEntry *curr;
+    DepEntry *prev;
+
+    for (int i = 0; i < numEntries; ++i) {
+        curr = dependGraph[i].next;
+
+        while (curr) {
+            memAllocCounter--;
+
+            prev = curr;
+            curr = prev->next;
+            prev->inst = NULL;
+
+            delete prev;
+        }
+
+        if (dependGraph[i].inst) {
+            dependGraph[i].inst = NULL;
+        }
+
+        dependGraph[i].next = NULL;
+    }
+}
+
+template <class DynInstPtr>
+void
+DependencyGraph<DynInstPtr>::insert(PhysRegIndex idx, DynInstPtr &new_inst)
+{
+    //Add this new, dependent instruction at the head of the dependency
+    //chain.
+
+    // First create the entry that will be added to the head of the
+    // dependency chain.
+    DepEntry *new_entry = new DepEntry;
+    new_entry->next = dependGraph[idx].next;
+    new_entry->inst = new_inst;
+
+    // Then actually add it to the chain.
+    dependGraph[idx].next = new_entry;
+
+    ++memAllocCounter;
+}
+
+
+template <class DynInstPtr>
+void
+DependencyGraph<DynInstPtr>::remove(PhysRegIndex idx,
+                                    DynInstPtr &inst_to_remove)
+{
+    DepEntry *prev = &dependGraph[idx];
+    DepEntry *curr = dependGraph[idx].next;
+
+    // Make sure curr isn't NULL.  Because this instruction is being
+    // removed from a dependency list, it must have been placed there at
+    // an earlier time.  The dependency chain should not be empty,
+    // unless the instruction dependent upon it is already ready.
+    if (curr == NULL) {
+        return;
+    }
+
+    nodesRemoved++;
+
+    // Find the instruction to remove within the dependency linked list.
+    while (curr->inst != inst_to_remove) {
+        prev = curr;
+        curr = curr->next;
+        nodesTraversed++;
+
+        assert(curr != NULL);
+    }
+
+    // Now remove this instruction from the list.
+    prev->next = curr->next;
+
+    --memAllocCounter;
+
+    // Could push this off to the destructor of DependencyEntry
+    curr->inst = NULL;
+
+    delete curr;
+}
+
+template <class DynInstPtr>
+DynInstPtr
+DependencyGraph<DynInstPtr>::pop(PhysRegIndex idx)
+{
+    DepEntry *node;
+    node = dependGraph[idx].next;
+    DynInstPtr inst = NULL;
+    if (node) {
+        inst = node->inst;
+        dependGraph[idx].next = node->next;
+        node->inst = NULL;
+        memAllocCounter--;
+        delete node;
+    }
+    return inst;
+}
+
+template <class DynInstPtr>
+void
+DependencyGraph<DynInstPtr>::dump()
+{
+    DepEntry *curr;
+
+    for (int i = 0; i < numEntries; ++i)
+    {
+        curr = &dependGraph[i];
+
+        if (curr->inst) {
+            cprintf("dependGraph[%i]: producer: %#x [sn:%lli] consumer: ",
+                    i, curr->inst->readPC(), curr->inst->seqNum);
+        } else {
+            cprintf("dependGraph[%i]: No producer. consumer: ", i);
+        }
+
+        while (curr->next != NULL) {
+            curr = curr->next;
+
+            cprintf("%#x [sn:%lli] ",
+                    curr->inst->readPC(), curr->inst->seqNum);
+        }
+
+        cprintf("\n");
+    }
+    cprintf("memAllocCounter: %i\n", memAllocCounter);
+}
+
+#endif // __CPU_O3_DEP_GRAPH_HH__
diff --git a/cpu/o3/iew.hh b/cpu/o3/iew.hh
index 72be25668..935320628 100644
--- a/cpu/o3/iew.hh
+++ b/cpu/o3/iew.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -41,20 +41,23 @@
 class FUPool;
 
 /**
- * DefaultIEW handles both single threaded and SMT IEW(issue/execute/writeback).
- * It handles the dispatching of instructions to the LSQ/IQ as part of the issue
- * stage, and has the IQ try to issue instructions each cycle. The execute
- * latency is actually tied into the issue latency to allow the IQ to be able to
+ * DefaultIEW handles both single threaded and SMT IEW
+ * (issue/execute/writeback).  It handles the dispatching of
+ * instructions to the LSQ/IQ as part of the issue stage, and has the
+ * IQ try to issue instructions each cycle. The execute latency is
+ * actually tied into the issue latency to allow the IQ to be able to
  * do back-to-back scheduling without having to speculatively schedule
- * instructions. This happens by having the IQ have access to the functional
- * units, and the IQ gets the execution latencies from the FUs when it issues
- * instructions. Instructions reach the execute stage on the last cycle of
- * their execution, which is when the IQ knows to wake up any dependent
- * instructions, allowing back to back scheduling. The execute portion of IEW
- * separates memory instructions from non-memory instructions, either telling
- * the LSQ to execute the instruction, or executing the instruction directly.
- * The writeback portion of IEW completes the instructions by waking up any
- * dependents, and marking the register ready on the scoreboard.
+ * instructions. This happens by having the IQ have access to the
+ * functional units, and the IQ gets the execution latencies from the
+ * FUs when it issues instructions. Instructions reach the execute
+ * stage on the last cycle of their execution, which is when the IQ
+ * knows to wake up any dependent instructions, allowing back to back
+ * scheduling. The execute portion of IEW separates memory
+ * instructions from non-memory instructions, either telling the LSQ
+ * to execute the instruction, or executing the instruction directly.
+ * The writeback portion of IEW completes the instructions by waking
+ * up any dependents, and marking the register ready on the
+ * scoreboard.
  */
 template<class Impl>
 class DefaultIEW
@@ -214,10 +217,8 @@ class DefaultIEW
     /** Tells CPU that the IEW stage is inactive and idle. */
     inline void deactivateStage();
 
-//#if !FULL_SYSTEM
     /** Returns if the LSQ has any stores to writeback. */
     bool hasStoresToWB() { return ldstQueue.hasStoresToWB(); }
-//#endif
 
   private:
     /** Sends commit proper information for a squash due to a branch
@@ -469,10 +470,10 @@ class DefaultIEW
     /** Stat for total number of mispredicted branches detected at execute. */
     Stats::Formula branchMispredicts;
 
-    Stats::Vector<> exe_swp;
-    Stats::Vector<> exe_nop;
-    Stats::Vector<> exe_refs;
-    Stats::Vector<> exe_branches;
+    Stats::Vector<> exeSwp;
+    Stats::Vector<> exeNop;
+    Stats::Vector<> exeRefs;
+    Stats::Vector<> exeBranches;
 
 //    Stats::Vector<> issued_ops;
 /*
@@ -481,20 +482,20 @@ class DefaultIEW
     Stats::Vector<> dist_unissued;
     Stats::Vector2d<> stat_issued_inst_type;
 */
-    Stats::Formula issue_rate;
+    Stats::Formula issueRate;
     Stats::Formula iewExecStoreInsts;
 //    Stats::Formula issue_op_rate;
 //    Stats::Formula fu_busy_rate;
 
     Stats::Vector<> iewInstsToCommit;
-    Stats::Vector<> writeback_count;
-    Stats::Vector<> producer_inst;
-    Stats::Vector<> consumer_inst;
-    Stats::Vector<> wb_penalized;
-
-    Stats::Formula wb_rate;
-    Stats::Formula wb_fanout;
-    Stats::Formula wb_penalized_rate;
+    Stats::Vector<> writebackCount;
+    Stats::Vector<> producerInst;
+    Stats::Vector<> consumerInst;
+    Stats::Vector<> wbPenalized;
+
+    Stats::Formula wbRate;
+    Stats::Formula wbFanout;
+    Stats::Formula wbPenalizedRate;
 };
 
 #endif // __CPU_O3_IEW_HH__
diff --git a/cpu/o3/iew_impl.hh b/cpu/o3/iew_impl.hh
index cbd7396f7..59f4055a6 100644
--- a/cpu/o3/iew_impl.hh
+++ b/cpu/o3/iew_impl.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -69,7 +69,7 @@ DefaultIEW<Impl>::LdWritebackEvent::process()
     if (!inst->isExecuted()) {
         inst->setExecuted();
 
-        // Execute again to copy data to proper place.
+        // Complete access to copy data to proper place.
         if (inst->isStore()) {
             inst->completeAcc();
         }
@@ -78,7 +78,6 @@ DefaultIEW<Impl>::LdWritebackEvent::process()
     // Need to insert instruction into queue to commit
     iewStage->instToCommit(inst);
 
-    //wroteToTimeBuffer = true;
     iewStage->activityThisCycle();
 
     inst = NULL;
@@ -93,8 +92,7 @@ DefaultIEW<Impl>::LdWritebackEvent::description()
 
 template<class Impl>
 DefaultIEW<Impl>::DefaultIEW(Params *params)
-    : // Just make this time buffer really big for now
-    // @todo: Make this into a parameter.
+    : // @todo: Make the time buffer size into a parameter.
       issueToExecQueue(5, 5),
       instQueue(params),
       ldstQueue(params),
@@ -108,7 +106,6 @@ DefaultIEW<Impl>::DefaultIEW(Params *params)
       numThreads(params->numberOfThreads),
       switchedOut(false)
 {
-    DPRINTF(IEW, "executeIntWidth: %i.\n", params->executeIntWidth);
     _status = Active;
     exeStatus = Running;
     wbStatus = Idle;
@@ -130,7 +127,6 @@ DefaultIEW<Impl>::DefaultIEW(Params *params)
 
     updateLSQNextCycle = false;
 
-    // @todo: Make into a parameter
     skidBufferMax = (3 * (renameToIEWDelay * params->renameWidth)) + issueWidth;
 }
 
@@ -149,8 +145,6 @@ DefaultIEW<Impl>::regStats()
 
     instQueue.regStats();
 
-    //ldstQueue.regStats();
-
     iewIdleCycles
         .name(name() + ".iewIdleCycles")
         .desc("Number of cycles IEW is idle");
@@ -167,8 +161,6 @@ DefaultIEW<Impl>::regStats()
         .name(name() + ".iewUnblockCycles")
         .desc("Number of cycles IEW is unblocking");
 
-//    iewWBInsts;
-
     iewDispatchedInsts
         .name(name() + ".iewDispatchedInsts")
         .desc("Number of instructions dispatched to IQ");
@@ -206,11 +198,7 @@ DefaultIEW<Impl>::regStats()
         .name(name() + ".iewExecLoadInsts")
         .desc("Number of load instructions executed")
         .flags(total);
-/*
-    iewExecStoreInsts
-        .name(name() + ".iewExecStoreInsts")
-        .desc("Number of store instructions executed");
-*/
+
     iewExecSquashedInsts
         .name(name() + ".iewExecSquashedInsts")
         .desc("Number of squashed instructions skipped in execute");
@@ -233,47 +221,47 @@ DefaultIEW<Impl>::regStats()
 
     branchMispredicts = predictedTakenIncorrect + predictedNotTakenIncorrect;
 
-    exe_swp
+    exeSwp
         .init(cpu->number_of_threads)
         .name(name() + ".EXEC:swp")
         .desc("number of swp insts executed")
         .flags(total)
         ;
 
-    exe_nop
+    exeNop
         .init(cpu->number_of_threads)
         .name(name() + ".EXEC:nop")
         .desc("number of nop insts executed")
         .flags(total)
         ;
 
-    exe_refs
+    exeRefs
         .init(cpu->number_of_threads)
         .name(name() + ".EXEC:refs")
         .desc("number of memory reference insts executed")
         .flags(total)
         ;
 
-    exe_branches
+    exeBranches
         .init(cpu->number_of_threads)
         .name(name() + ".EXEC:branches")
         .desc("Number of branches executed")
         .flags(total)
         ;
 
-    issue_rate
+    issueRate
         .name(name() + ".EXEC:rate")
         .desc("Inst execution rate")
         .flags(total)
         ;
-    issue_rate = iewExecutedInsts / cpu->numCycles;
+    issueRate = iewExecutedInsts / cpu->numCycles;
 
     iewExecStoreInsts
         .name(name() + ".EXEC:stores")
         .desc("Number of stores executed")
         .flags(total)
         ;
-    iewExecStoreInsts = exe_refs - iewExecLoadInsts;
+    iewExecStoreInsts = exeRefs - iewExecLoadInsts;
 /*
     for (int i=0; i<Num_OpClasses; ++i) {
         stringstream subname;
@@ -292,56 +280,56 @@ DefaultIEW<Impl>::regStats()
         .flags(total)
         ;
 
-    writeback_count
+    writebackCount
         .init(cpu->number_of_threads)
         .name(name() + ".WB:count")
         .desc("cumulative count of insts written-back")
         .flags(total)
         ;
 
-    producer_inst
+    producerInst
         .init(cpu->number_of_threads)
         .name(name() + ".WB:producers")
         .desc("num instructions producing a value")
         .flags(total)
         ;
 
-    consumer_inst
+    consumerInst
         .init(cpu->number_of_threads)
         .name(name() + ".WB:consumers")
         .desc("num instructions consuming a value")
         .flags(total)
         ;
 
-    wb_penalized
+    wbPenalized
         .init(cpu->number_of_threads)
         .name(name() + ".WB:penalized")
         .desc("number of instrctions required to write to 'other' IQ")
         .flags(total)
         ;
 
-    wb_penalized_rate
+    wbPenalizedRate
         .name(name() + ".WB:penalized_rate")
         .desc ("fraction of instructions written-back that wrote to 'other' IQ")
         .flags(total)
         ;
 
-    wb_penalized_rate = wb_penalized / writeback_count;
+    wbPenalizedRate = wbPenalized / writebackCount;
 
-    wb_fanout
+    wbFanout
         .name(name() + ".WB:fanout")
         .desc("average fanout of values written-back")
         .flags(total)
         ;
 
-    wb_fanout = producer_inst / consumer_inst;
+    wbFanout = producerInst / consumerInst;
 
-    wb_rate
+    wbRate
         .name(name() + ".WB:rate")
         .desc("insts written-back per cycle")
         .flags(total)
         ;
-    wb_rate = writeback_count / cpu->numCycles;
+    wbRate = writebackCount / cpu->numCycles;
 }
 
 template<class Impl>
@@ -507,7 +495,7 @@ DefaultIEW<Impl>::squash(unsigned tid)
     instQueue.squash(tid);
 
     // Tell the LDSTQ to start squashing.
-    ldstQueue.squash(fromCommit->commitInfo[tid].doneSeqNum,tid);
+    ldstQueue.squash(fromCommit->commitInfo[tid].doneSeqNum, tid);
 
     updatedQueues = true;
 
@@ -543,18 +531,15 @@ DefaultIEW<Impl>::squashDueToBranch(DynInstPtr &inst, unsigned tid)
     DPRINTF(IEW, "[tid:%i]: Squashing from a specific instruction, PC: %#x "
             "[sn:%i].\n", tid, inst->readPC(), inst->seqNum);
 
-    // Tell rename to squash through the time buffer.
     toCommit->squash[tid] = true;
     toCommit->squashedSeqNum[tid] = inst->seqNum;
     toCommit->mispredPC[tid] = inst->readPC();
     toCommit->nextPC[tid] = inst->readNextPC();
     toCommit->branchMispredict[tid] = true;
-    // Prediction was incorrect, so send back inverse.
     toCommit->branchTaken[tid] = inst->readNextPC() !=
         (inst->readPC() + sizeof(TheISA::MachInst));
 
     toCommit->includeSquashInst[tid] = false;
-    //toCommit->iewSquashNum[tid] = inst->seqNum;
 
     wroteToTimeBuffer = true;
 }
@@ -566,13 +551,11 @@ DefaultIEW<Impl>::squashDueToMemOrder(DynInstPtr &inst, unsigned tid)
     DPRINTF(IEW, "[tid:%i]: Squashing from a specific instruction, "
             "PC: %#x [sn:%i].\n", tid, inst->readPC(), inst->seqNum);
 
-    // Tell rename to squash through the time buffer.
     toCommit->squash[tid] = true;
     toCommit->squashedSeqNum[tid] = inst->seqNum;
     toCommit->nextPC[tid] = inst->readNextPC();
 
     toCommit->includeSquashInst[tid] = false;
-    //toCommit->iewSquashNum[tid] = inst->seqNum;
 
     wroteToTimeBuffer = true;
 }
@@ -611,7 +594,6 @@ DefaultIEW<Impl>::block(unsigned tid)
     // reprocessed when this stage unblocks.
     skidInsert(tid);
 
-    // Set the status to Blocked.
     dispatchStatus[tid] = Blocked;
 }
 
@@ -661,10 +643,7 @@ DefaultIEW<Impl>::instToCommit(DynInstPtr &inst)
     // to.  If there are free write ports at the time, then go ahead
     // and write the instruction to that time.  If there are not,
     // keep looking back to see where's the first time there's a
-    // free slot.  What happens if you run out of free spaces?
-    // For now naively assume that all instructions take one cycle.
-    // Otherwise would have to look into the time buffer based on the
-    // latency of the instruction.
+    // free slot.
     while ((*iewQueue)[wbCycle].insts[wbNumInst]) {
         ++wbNumInst;
         if (wbNumInst == issueWidth) {
@@ -918,10 +897,10 @@ void
 DefaultIEW<Impl>::sortInsts()
 {
     int insts_from_rename = fromRename->size;
-
+#ifdef DEBUG
     for (int i = 0; i < numThreads; i++)
         assert(insts[i].empty());
-
+#endif
     for (int i = 0; i < insts_from_rename; ++i) {
         insts[fromRename->insts[i]->threadNumber].push(fromRename->insts[i]);
     }
@@ -1047,9 +1026,6 @@ DefaultIEW<Impl>::dispatchInsts(unsigned tid)
         // Be sure to mark these instructions as ready so that the
         // commit stage can go ahead and execute them, and mark
         // them as issued so the IQ doesn't reprocess them.
-        // -------------
-        // @TODO: What happens if the ldstqueue is full?
-        //        Do we process the other instructions?
 
         // Check for squashed instructions.
         if (inst->isSquashed()) {
@@ -1125,6 +1101,9 @@ DefaultIEW<Impl>::dispatchInsts(unsigned tid)
             ++iewDispStoreInsts;
 
             if (inst->isNonSpeculative()) {
+                // Non-speculative stores (namely store conditionals)
+                // need to be set as "canCommit()" so that commit can
+                // process them when they reach the head of commit.
                 inst->setCanCommit();
                 instQueue.insertNonSpec(inst);
                 add_to_iq = false;
@@ -1137,6 +1116,7 @@ DefaultIEW<Impl>::dispatchInsts(unsigned tid)
             toRename->iewInfo[tid].dispatchedToLSQ++;
 #if FULL_SYSTEM
         } else if (inst->isMemBarrier() || inst->isWriteBarrier()) {
+            // Same as non-speculative stores.
             inst->setCanCommit();
             instQueue.insertBarrier(inst);
             add_to_iq = false;
@@ -1145,7 +1125,7 @@ DefaultIEW<Impl>::dispatchInsts(unsigned tid)
             DPRINTF(IEW, "[tid:%i]: Issue: Nonspeculative instruction "
                     "encountered, skipping.\n", tid);
 
-            // Same hack as with stores.
+            // Same as non-speculative stores.
             inst->setCanCommit();
 
             // Specifically insert it as nonspeculative.
@@ -1162,9 +1142,9 @@ DefaultIEW<Impl>::dispatchInsts(unsigned tid)
             inst->setExecuted();
             inst->setCanCommit();
 
-            instQueue.advanceTail(inst);
+            instQueue.recordProducer(inst);
 
-            exe_nop[tid]++;
+            exeNop[tid]++;
 
             add_to_iq = false;
         } else if (inst->isExecuted()) {
@@ -1175,7 +1155,7 @@ DefaultIEW<Impl>::dispatchInsts(unsigned tid)
             inst->setIssued();
             inst->setCanCommit();
 
-            instQueue.advanceTail(inst);
+            instQueue.recordProducer(inst);
 
             add_to_iq = false;
         } else {
@@ -1237,7 +1217,6 @@ template <class Impl>
 void
 DefaultIEW<Impl>::executeInsts()
 {
-    //bool fetch_redirect[(*activeThreads).size()];
     wbNumInst = 0;
     wbCycle = 0;
 
@@ -1254,20 +1233,17 @@ DefaultIEW<Impl>::executeInsts()
 
     // Execute/writeback any instructions that are available.
     int inst_num = 0;
-    for ( ; inst_num < issueWidth &&  /* Haven't exceeded issue bandwidth */
-              fromIssue->insts[inst_num];
-         ++inst_num) {
+    for ( ; inst_num < issueWidth && fromIssue->insts[inst_num];
+          ++inst_num) {
 
         DPRINTF(IEW, "Execute: Executing instructions from IQ.\n");
 
-        // Get instruction from issue's queue.
         DynInstPtr inst = fromIssue->insts[inst_num];
 
         DPRINTF(IEW, "Execute: Processing PC %#x, [tid:%i] [sn:%i].\n",
                 inst->readPC(), inst->threadNumber,inst->seqNum);
 
         // Check if the instruction is squashed; if so then skip it
-        // and don't count it towards the FU usage.
         if (inst->isSquashed()) {
             DPRINTF(IEW, "Execute: Instruction was squashed.\n");
 
@@ -1299,22 +1275,19 @@ DefaultIEW<Impl>::executeInsts()
                 // Loads will mark themselves as executed, and their writeback
                 // event adds the instruction to the queue to commit
                 fault = ldstQueue.executeLoad(inst);
-
-//                ++iewExecLoadInsts;
             } else if (inst->isStore()) {
                 ldstQueue.executeStore(inst);
 
-//                ++iewExecStoreInsts;
-
                 // If the store had a fault then it may not have a mem req
                 if (inst->req && !(inst->req->flags & LOCKED)) {
                     inst->setExecuted();
 
                     instToCommit(inst);
                 }
-                // Store conditionals will mark themselves as executed, and
-                // their writeback event will add the instruction to the queue
-                // to commit.
+
+                // Store conditionals will mark themselves as
+                // executed, and their writeback event will add the
+                // instruction to the queue to commit.
             } else {
                 panic("Unexpected memory type!\n");
             }
@@ -1329,10 +1302,9 @@ DefaultIEW<Impl>::executeInsts()
 
         updateExeInstStats(inst);
 
-        // Check if branch was correct.  This check happens after the
-        // instruction is added to the queue because even if the branch
-        // is mispredicted, the branch instruction itself is still valid.
-        // Only handle this if there hasn't already been something that
+        // Check if branch prediction was correct; if not, then we need
+        // to tell commit to squash in flight instructions.  Only
+        // handle this if there hasn't already been something that
         // redirects fetch in this group of instructions.
 
         // This probably needs to prioritize the redirects if a different
@@ -1360,7 +1332,8 @@ DefaultIEW<Impl>::executeInsts()
             } else if (ldstQueue.violation(tid)) {
                 fetchRedirect[tid] = true;
 
-                // Get the DynInst that caused the violation.  Note that this
+                // If there was an ordering violation, then get the
+                // DynInst that caused the violation.  Note that this
                 // clears the violation signal.
                 DynInstPtr violator;
                 violator = ldstQueue.getMemDepViolator(tid);
@@ -1409,13 +1382,11 @@ template <class Impl>
 void
 DefaultIEW<Impl>::writebackInsts()
 {
-    // Loop through the head of the time buffer and wake any dependents.
-    // These instructions are about to write back.  In the simple model
-    // this loop can really happen within the previous loop, but when
-    // instructions have actual latencies, this loop must be separate.
-    // Also mark scoreboard that this instruction is finally complete.
-    // Either have IEW have direct access to rename map, or have this as
-    // part of backwards communication.
+    // Loop through the head of the time buffer and wake any
+    // dependents.  These instructions are about to write back.  Also
+    // mark scoreboard that this instruction is finally complete.
+    // Either have IEW have direct access to scoreboard, or have this
+    // as part of backwards communication.
     for (int inst_num = 0; inst_num < issueWidth &&
              toCommit->insts[inst_num]; inst_num++) {
         DynInstPtr inst = toCommit->insts[inst_num];
@@ -1441,9 +1412,9 @@ DefaultIEW<Impl>::writebackInsts()
                 scoreboard->setReg(inst->renamedDestRegIdx(i));
             }
 
-            producer_inst[tid]++;
-            consumer_inst[tid]+= dependents;
-            writeback_count[tid]++;
+            producerInst[tid]++;
+            consumerInst[tid]+= dependents;
+            writebackCount[tid]++;
         }
     }
 }
@@ -1452,8 +1423,6 @@ template<class Impl>
 void
 DefaultIEW<Impl>::tick()
 {
-    // Try to fill up issue queue with as many instructions as bandwidth
-    // allows.
     wbNumInst = 0;
     wbCycle = 0;
 
@@ -1462,9 +1431,12 @@ DefaultIEW<Impl>::tick()
 
     sortInsts();
 
+    // Free function units marked as being freed this cycle.
+    fuPool->processFreeUnits();
+
     list<unsigned>::iterator threads = (*activeThreads).begin();
 
-    // Check stall and squash signals.
+    // Check stall and squash signals, dispatch any instructions.
     while (threads != (*activeThreads).end()) {
            unsigned tid = *threads++;
 
@@ -1472,7 +1444,6 @@ DefaultIEW<Impl>::tick()
 
         checkSignalsAndUpdate(tid);
         dispatch(tid);
-
     }
 
     if (exeStatus != Squashing) {
@@ -1502,9 +1473,6 @@ DefaultIEW<Impl>::tick()
     // Writeback any stores using any leftover bandwidth.
     ldstQueue.writebackStores();
 
-    // Free function units marked as being freed this cycle.
-    fuPool->processFreeUnits();
-
     // Check the committed load/store signals to see if there's a load
     // or store to commit.  Also check if it's being told to execute a
     // nonspeculative instruction.
@@ -1557,8 +1525,6 @@ DefaultIEW<Impl>::tick()
 
         DPRINTF(IEW, "[tid:%i], Dispatch dispatched %i instructions.\n",
                 tid, toRename->iewInfo[tid].dispatched);
-
-        //thread_queue.pop();
     }
 
     DPRINTF(IEW, "IQ has %i free entries (Can schedule: %i).  "
@@ -1585,7 +1551,7 @@ DefaultIEW<Impl>::updateExeInstStats(DynInstPtr &inst)
     //
 #ifdef TARGET_ALPHA
     if (inst->isDataPrefetch())
-        exe_swp[thread_number]++;
+        exeSwp[thread_number]++;
     else
         iewExecutedInsts++;
 #else
@@ -1596,13 +1562,13 @@ DefaultIEW<Impl>::updateExeInstStats(DynInstPtr &inst)
     //  Control operations
     //
     if (inst->isControl())
-        exe_branches[thread_number]++;
+        exeBranches[thread_number]++;
 
     //
     //  Memory operations
     //
     if (inst->isMemRef()) {
-        exe_refs[thread_number]++;
+        exeRefs[thread_number]++;
 
         if (inst->isLoad()) {
             iewExecLoadInsts[thread_number]++;
diff --git a/cpu/o3/inst_queue.cc b/cpu/o3/inst_queue.cc
index 2ff2282b4..95ae2b699 100644
--- a/cpu/o3/inst_queue.cc
+++ b/cpu/o3/inst_queue.cc
@@ -32,7 +32,3 @@
 
 // Force instantiation of InstructionQueue.
 template class InstructionQueue<AlphaSimpleImpl>;
-
-template<>
-unsigned
-InstructionQueue<AlphaSimpleImpl>::DependencyEntry::mem_alloc_counter = 0;
diff --git a/cpu/o3/inst_queue.hh b/cpu/o3/inst_queue.hh
index 982294b4f..6bdf4ddc2 100644
--- a/cpu/o3/inst_queue.hh
+++ b/cpu/o3/inst_queue.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -37,6 +37,7 @@
 #include "base/statistics.hh"
 #include "base/timebuf.hh"
 #include "cpu/inst_seq.hh"
+#include "cpu/o3/dep_graph.hh"
 #include "encumbered/cpu/full/op_class.hh"
 #include "sim/host.hh"
 
@@ -91,6 +92,8 @@ class InstructionQueue
         /** Pointer back to the instruction queue. */
         InstructionQueue<Impl> *iqPtr;
 
+        bool freeFU;
+
       public:
         /** Construct a FU completion event. */
         FUCompletion(DynInstPtr &_inst, int fu_idx,
@@ -98,6 +101,7 @@ class InstructionQueue
 
         virtual void process();
         virtual const char *description();
+        void setFreeFU() { freeFU = true; }
     };
 
     /** Constructs an IQ. */
@@ -114,8 +118,6 @@ class InstructionQueue
 
     void resetState();
 
-    void resetDependencyGraph();
-
     /** Sets CPU pointer. */
     void setCPU(FullCPU *_cpu) { cpu = _cpu; }
 
@@ -170,11 +172,11 @@ class InstructionQueue
     void insertBarrier(DynInstPtr &barr_inst);
 
     /**
-     * Advances the tail of the IQ, used if an instruction is not added to the
-     * IQ for scheduling.
-     * @todo: Rename this function.
+     * Records the instruction as the producer of a register without
+     * adding it to the rest of the IQ.
      */
-    void advanceTail(DynInstPtr &inst);
+    void recordProducer(DynInstPtr &inst)
+    { addToProducers(inst); }
 
     /** Process FU completion event. */
     void processFUCompletion(DynInstPtr &inst, int fu_idx);
@@ -224,9 +226,6 @@ class InstructionQueue
     /** Returns the number of used entries for a thread. */
     unsigned getCount(unsigned tid) { return count[tid]; };
 
-    /** Updates the number of free entries. */
-    void updateFreeEntries(int num) { freeEntries += num; }
-
     /** Debug function to print all instructions. */
     void printInsts();
 
@@ -286,15 +285,6 @@ class InstructionQueue
         }
     };
 
-    /**
-     * Struct for an IQ entry. It includes the instruction and an iterator
-     * to the instruction's spot in the IQ.
-     */
-    struct IQEntry {
-        DynInstPtr inst;
-        ListIt iqIt;
-    };
-
     typedef std::priority_queue<DynInstPtr, std::vector<DynInstPtr>, pqCompare>
     ReadyInstQueue;
 
@@ -309,7 +299,6 @@ class InstructionQueue
      *  inside of DynInst), when these instructions are woken up only
      *  the sequence number will be available.  Thus it is most efficient to be
      *  able to search by the sequence number alone.
-     *  @todo: Maybe change this to a priority queue per thread.
      */
     std::map<InstSeqNum, DynInstPtr> nonSpecInsts;
 
@@ -324,6 +313,9 @@ class InstructionQueue
     /** List that contains the age order of the oldest instruction of each
      *  ready queue.  Used to select the oldest instruction available
      *  among op classes.
+     *  @todo: Might be better to just move these entries around instead
+     *  of creating new ones every time the position changes due to an
+     *  instruction issuing.  std::list::splice() can relink entries.
      */
     std::list<ListOrderEntry> listOrder;
 
@@ -346,6 +338,8 @@ class InstructionQueue
      */
     void moveToYoungerInst(ListOrderIt age_order_it);
 
+    DependencyGraph<DynInstPtr> dependGraph;
+
     //////////////////////////////////////
     // Various parameters
     //////////////////////////////////////
@@ -397,57 +391,9 @@ class InstructionQueue
 
     bool switchedOut;
 
-    //////////////////////////////////
-    // Variables needed for squashing
-    //////////////////////////////////
-
     /** The sequence number of the squashed instruction. */
     InstSeqNum squashedSeqNum[Impl::MaxThreads];
 
-    /** Iterator that points to the last instruction that has been squashed.
-     *  This will not be valid unless the IQ is in the process of squashing.
-     */
-    ListIt squashIt[Impl::MaxThreads];
-
-    ///////////////////////////////////
-    // Dependency graph stuff
-    ///////////////////////////////////
-
-    class DependencyEntry
-    {
-      public:
-        DependencyEntry()
-            : inst(NULL), next(NULL)
-        { }
-
-        DynInstPtr inst;
-        //Might want to include data about what arch. register the
-        //dependence is waiting on.
-        DependencyEntry *next;
-
-        //This function, and perhaps this whole class, stand out a little
-        //bit as they don't fit a classification well.  I want access
-        //to the underlying structure of the linked list, yet at
-        //the same time it feels like this should be something abstracted
-        //away.  So for now it will sit here, within the IQ, until
-        //a better implementation is decided upon.
-        // This function probably shouldn't be within the entry...
-        void insert(DynInstPtr &new_inst);
-
-        void remove(DynInstPtr &inst_to_remove);
-
-        // Debug variable, remove when done testing.
-        static unsigned mem_alloc_counter;
-    };
-
-    /** Array of linked lists.  Each linked list is a list of all the
-     *  instructions that depend upon a given register.  The actual
-     *  register's index is used to index into the graph; ie all
-     *  instructions in flight that are dependent upon r34 will be
-     *  in the linked list of dependGraph[34].
-     */
-    DependencyEntry *dependGraph;
-
     /** A cache of the recently woken registers.  It is 1 if the register
      *  has been woken up recently, and 0 if the register has been added
      *  to the dependency graph and has not yet received its value.  It
@@ -456,11 +402,11 @@ class InstructionQueue
      */
     std::vector<bool> regScoreboard;
 
-    /** Adds an instruction to the dependency graph, as a producer. */
+    /** Adds an instruction to the dependency graph, as a consumer. */
     bool addToDependents(DynInstPtr &new_inst);
 
-    /** Adds an instruction to the dependency graph, as a consumer. */
-    void createDependency(DynInstPtr &new_inst);
+    /** Adds an instruction to the dependency graph, as a producer. */
+    void addToProducers(DynInstPtr &new_inst);
 
     /** Moves an instruction to the ready queue if it is ready. */
     void addIfReady(DynInstPtr &inst);
@@ -471,10 +417,6 @@ class InstructionQueue
      */
     int countInsts();
 
-    /** Debugging function to dump out the dependency graph.
-     */
-    void dumpDependGraph();
-
     /** Debugging function to dump all the list sizes, as well as print
      *  out the list of nonspeculative instructions.  Should not be used
      *  in any other capacity, but it has no harmful sideaffects.
@@ -490,20 +432,16 @@ class InstructionQueue
     Stats::Scalar<> iqInstsAdded;
     /** Stat for number of non-speculative instructions added. */
     Stats::Scalar<> iqNonSpecInstsAdded;
-//    Stats::Scalar<> iqIntInstsAdded;
+
     Stats::Scalar<> iqInstsIssued;
     /** Stat for number of integer instructions issued. */
     Stats::Scalar<> iqIntInstsIssued;
-//    Stats::Scalar<> iqFloatInstsAdded;
     /** Stat for number of floating point instructions issued. */
     Stats::Scalar<> iqFloatInstsIssued;
-//    Stats::Scalar<> iqBranchInstsAdded;
     /** Stat for number of branch instructions issued. */
     Stats::Scalar<> iqBranchInstsIssued;
-//    Stats::Scalar<> iqMemInstsAdded;
     /** Stat for number of memory instructions issued. */
     Stats::Scalar<> iqMemInstsIssued;
-//    Stats::Scalar<> iqMiscInstsAdded;
     /** Stat for number of miscellaneous instructions issued. */
     Stats::Scalar<> iqMiscInstsIssued;
     /** Stat for number of squashed instructions that were ready to issue. */
@@ -518,20 +456,20 @@ class InstructionQueue
      */
     Stats::Scalar<> iqSquashedNonSpecRemoved;
 
-    Stats::VectorDistribution<> queue_res_dist;
-    Stats::Distribution<> n_issued_dist;
-    Stats::VectorDistribution<> issue_delay_dist;
+    Stats::VectorDistribution<> queueResDist;
+    Stats::Distribution<> numIssuedDist;
+    Stats::VectorDistribution<> issueDelayDist;
 
-    Stats::Vector<> stat_fu_busy;
+    Stats::Vector<> statFuBusy;
 //    Stats::Vector<> dist_unissued;
-    Stats::Vector2d<> stat_issued_inst_type;
+    Stats::Vector2d<> statIssuedInstType;
 
-    Stats::Formula issue_rate;
+    Stats::Formula issueRate;
 //    Stats::Formula issue_stores;
 //    Stats::Formula issue_op_rate;
-    Stats::Vector<> fu_busy;  //cumulative fu busy
+    Stats::Vector<> fuBusy;  //cumulative fu busy
 
-    Stats::Formula fu_busy_rate;
+    Stats::Formula fuBusyRate;
 };
 
 #endif //__CPU_O3_INST_QUEUE_HH__
diff --git a/cpu/o3/inst_queue_impl.hh b/cpu/o3/inst_queue_impl.hh
index 0d9cc09f3..ed57ac257 100644
--- a/cpu/o3/inst_queue_impl.hh
+++ b/cpu/o3/inst_queue_impl.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -26,14 +26,6 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-// Todo:
-// Current ordering allows for 0 cycle added-to-scheduled.  Could maybe fake
-// it; either do in reverse order, or have added instructions put into a
-// different ready queue that, in scheduleRreadyInsts(), gets put onto the
-// normal ready queue.  This would however give only a one cycle delay,
-// but probably is more flexible to actually add in a delay parameter than
-// just running it backwards.
-
 #include <limits>
 #include <vector>
 
@@ -49,7 +41,7 @@ InstructionQueue<Impl>::FUCompletion::FUCompletion(DynInstPtr &_inst,
                                                    int fu_idx,
                                                    InstructionQueue<Impl> *iq_ptr)
     : Event(&mainEventQueue, Stat_Event_Pri),
-      inst(_inst), fuIdx(fu_idx), iqPtr(iq_ptr)
+      inst(_inst), fuIdx(fu_idx), iqPtr(iq_ptr), freeFU(false)
 {
     this->setFlags(Event::AutoDelete);
 }
@@ -58,7 +50,7 @@ template <class Impl>
 void
 InstructionQueue<Impl>::FUCompletion::process()
 {
-    iqPtr->processFUCompletion(inst, fuIdx);
+    iqPtr->processFUCompletion(inst, freeFU ? fuIdx : -1);
     inst = NULL;
 }
 
@@ -93,14 +85,7 @@ InstructionQueue<Impl>::InstructionQueue(Params *params)
 
     //Create an entry for each physical register within the
     //dependency graph.
-    dependGraph = new DependencyEntry[numPhysRegs];
-
-    // Initialize all the head pointers to point to NULL, and all the
-    // entries as unready.
-    for (int i = 0; i < numPhysRegs; ++i) {
-        dependGraph[i].next = NULL;
-        dependGraph[i].inst = NULL;
-    }
+    dependGraph.resize(numPhysRegs);
 
     // Resize the register scoreboard.
     regScoreboard.resize(numPhysRegs);
@@ -165,10 +150,9 @@ InstructionQueue<Impl>::InstructionQueue(Params *params)
 template <class Impl>
 InstructionQueue<Impl>::~InstructionQueue()
 {
-    resetDependencyGraph();
-    assert(DependencyEntry::mem_alloc_counter == 0);
-
-    delete [] dependGraph;
+    dependGraph.reset();
+    cprintf("Nodes traversed: %i, removed: %i\n",
+            dependGraph.nodesTraversed, dependGraph.nodesRemoved);
 }
 
 template <class Impl>
@@ -193,8 +177,6 @@ InstructionQueue<Impl>::regStats()
         .desc("Number of non-speculative instructions added to the IQ")
         .prereq(iqNonSpecInstsAdded);
 
-//    iqIntInstsAdded;
-
     iqInstsIssued
         .name(name() + ".iqInstsIssued")
         .desc("Number of instructions issued")
@@ -205,29 +187,21 @@ InstructionQueue<Impl>::regStats()
         .desc("Number of integer instructions issued")
         .prereq(iqIntInstsIssued);
 
-//    iqFloatInstsAdded;
-
     iqFloatInstsIssued
         .name(name() + ".iqFloatInstsIssued")
         .desc("Number of float instructions issued")
         .prereq(iqFloatInstsIssued);
 
-//    iqBranchInstsAdded;
-
     iqBranchInstsIssued
         .name(name() + ".iqBranchInstsIssued")
         .desc("Number of branch instructions issued")
         .prereq(iqBranchInstsIssued);
 
-//    iqMemInstsAdded;
-
     iqMemInstsIssued
         .name(name() + ".iqMemInstsIssued")
         .desc("Number of memory instructions issued")
         .prereq(iqMemInstsIssued);
 
-//    iqMiscInstsAdded;
-
     iqMiscInstsIssued
         .name(name() + ".iqMiscInstsIssued")
         .desc("Number of miscellaneous instructions issued")
@@ -255,16 +229,16 @@ InstructionQueue<Impl>::regStats()
         .desc("Number of squashed non-spec instructions that were removed")
         .prereq(iqSquashedNonSpecRemoved);
 
-    queue_res_dist
+    queueResDist
         .init(Num_OpClasses, 0, 99, 2)
         .name(name() + ".IQ:residence:")
         .desc("cycles from dispatch to issue")
         .flags(total | pdf | cdf )
         ;
     for (int i = 0; i < Num_OpClasses; ++i) {
-        queue_res_dist.subname(i, opClassStrings[i]);
+        queueResDist.subname(i, opClassStrings[i]);
     }
-    n_issued_dist
+    numIssuedDist
         .init(0,totalWidth,1)
         .name(name() + ".ISSUE:issued_per_cycle")
         .desc("Number of insts issued each cycle")
@@ -281,19 +255,19 @@ InstructionQueue<Impl>::regStats()
         dist_unissued.subname(i, unissued_names[i]);
     }
 */
-    stat_issued_inst_type
+    statIssuedInstType
         .init(numThreads,Num_OpClasses)
         .name(name() + ".ISSUE:FU_type")
         .desc("Type of FU issued")
         .flags(total | pdf | dist)
         ;
-    stat_issued_inst_type.ysubnames(opClassStrings);
+    statIssuedInstType.ysubnames(opClassStrings);
 
     //
     //  How long did instructions for a particular FU type wait prior to issue
     //
 
-    issue_delay_dist
+    issueDelayDist
         .init(Num_OpClasses,0,99,2)
         .name(name() + ".ISSUE:")
         .desc("cycles from operands ready to issue")
@@ -303,15 +277,15 @@ InstructionQueue<Impl>::regStats()
     for (int i=0; i<Num_OpClasses; ++i) {
         stringstream subname;
         subname << opClassStrings[i] << "_delay";
-        issue_delay_dist.subname(i, subname.str());
+        issueDelayDist.subname(i, subname.str());
     }
 
-    issue_rate
+    issueRate
         .name(name() + ".ISSUE:rate")
         .desc("Inst issue rate")
         .flags(total)
         ;
-    issue_rate = iqInstsIssued / cpu->numCycles;
+    issueRate = iqInstsIssued / cpu->numCycles;
 /*
     issue_stores
         .name(name() + ".ISSUE:stores")
@@ -328,29 +302,29 @@ InstructionQueue<Impl>::regStats()
         ;
     issue_op_rate = issued_ops / numCycles;
 */
-    stat_fu_busy
+    statFuBusy
         .init(Num_OpClasses)
         .name(name() + ".ISSUE:fu_full")
         .desc("attempts to use FU when none available")
         .flags(pdf | dist)
         ;
     for (int i=0; i < Num_OpClasses; ++i) {
-        stat_fu_busy.subname(i, opClassStrings[i]);
+        statFuBusy.subname(i, opClassStrings[i]);
     }
 
-    fu_busy
+    fuBusy
         .init(numThreads)
         .name(name() + ".ISSUE:fu_busy_cnt")
         .desc("FU busy when requested")
         .flags(total)
         ;
 
-    fu_busy_rate
+    fuBusyRate
         .name(name() + ".ISSUE:fu_busy_rate")
         .desc("FU busy rate (busy events/executed inst)")
         .flags(total)
         ;
-    fu_busy_rate = fu_busy / iqInstsIssued;
+    fuBusyRate = fuBusy / iqInstsIssued;
 
     for ( int i=0; i < numThreads; i++) {
         // Tell mem dependence unit to reg stats as well.
@@ -394,35 +368,6 @@ InstructionQueue<Impl>::resetState()
     listOrder.clear();
 }
 
-template <class Impl>
-void
-InstructionQueue<Impl>::resetDependencyGraph()
-{
-    // Clear the dependency graph
-    DependencyEntry *curr;
-    DependencyEntry *prev;
-
-    for (int i = 0; i < numPhysRegs; ++i) {
-        curr = dependGraph[i].next;
-
-        while (curr) {
-            DependencyEntry::mem_alloc_counter--;
-
-            prev = curr;
-            curr = prev->next;
-            prev->inst = NULL;
-
-            delete prev;
-        }
-
-        if (dependGraph[i].inst) {
-            dependGraph[i].inst = NULL;
-        }
-
-        dependGraph[i].next = NULL;
-    }
-}
-
 template <class Impl>
 void
 InstructionQueue<Impl>::setActiveThreads(list<unsigned> *at_ptr)
@@ -454,7 +399,7 @@ void
 InstructionQueue<Impl>::switchOut()
 {
     resetState();
-    resetDependencyGraph();
+    dependGraph.reset();
     switchedOut = true;
     for (int i = 0; i < numThreads; ++i) {
         memDepUnit[i].switchOut();
@@ -562,20 +507,15 @@ InstructionQueue<Impl>::insert(DynInstPtr &new_inst)
     // Make sure the instruction is valid
     assert(new_inst);
 
-    DPRINTF(IQ, "Adding instruction PC %#x to the IQ.\n",
-            new_inst->readPC());
+    DPRINTF(IQ, "Adding instruction [sn:%lli] PC %#x to the IQ.\n",
+            new_inst->seqNum, new_inst->readPC());
 
-    // Check if there are any free entries.  Panic if there are none.
-    // Might want to have this return a fault in the future instead of
-    // panicing.
     assert(freeEntries != 0);
 
     instList[new_inst->threadNumber].push_back(new_inst);
 
-    // Decrease the number of free entries.
     --freeEntries;
 
-    //Mark Instruction as in IQ
     new_inst->setInIQ();
 
     // Look through its source registers (physical regs), and mark any
@@ -584,21 +524,16 @@ InstructionQueue<Impl>::insert(DynInstPtr &new_inst)
 
     // Have this instruction set itself as the producer of its destination
     // register(s).
-    createDependency(new_inst);
+    addToProducers(new_inst);
 
-    // If it's a memory instruction, add it to the memory dependency
-    // unit.
     if (new_inst->isMemRef()) {
         memDepUnit[new_inst->threadNumber].insert(new_inst);
     } else {
-        // If the instruction is ready then add it to the ready list.
         addIfReady(new_inst);
     }
 
     ++iqInstsAdded;
 
-
-    //Update Thread IQ Count
     count[new_inst->threadNumber]++;
 
     assert(freeEntries == (numEntries - countInsts()));
@@ -611,30 +546,25 @@ InstructionQueue<Impl>::insertNonSpec(DynInstPtr &new_inst)
     // @todo: Clean up this code; can do it by setting inst as unable
     // to issue, then calling normal insert on the inst.
 
-    // Make sure the instruction is valid
     assert(new_inst);
 
     nonSpecInsts[new_inst->seqNum] = new_inst;
 
-    DPRINTF(IQ, "Adding instruction PC %#x to the IQ.\n",
-            new_inst->readPC());
+    DPRINTF(IQ, "Adding non-speculative instruction [sn:%lli] PC %#x "
+            "to the IQ.\n",
+            new_inst->seqNum, new_inst->readPC());
 
-    // Check if there are any free entries.  Panic if there are none.
-    // Might want to have this return a fault in the future instead of
-    // panicing.
     assert(freeEntries != 0);
 
     instList[new_inst->threadNumber].push_back(new_inst);
 
-    // Decrease the number of free entries.
     --freeEntries;
 
-    //Mark Instruction as in IQ
     new_inst->setInIQ();
 
     // Have this instruction set itself as the producer of its destination
     // register(s).
-    createDependency(new_inst);
+    addToProducers(new_inst);
 
     // If it's a memory instruction, add it to the memory dependency
     // unit.
@@ -644,7 +574,6 @@ InstructionQueue<Impl>::insertNonSpec(DynInstPtr &new_inst)
 
     ++iqNonSpecInstsAdded;
 
-    //Update Thread IQ Count
     count[new_inst->threadNumber]++;
 
     assert(freeEntries == (numEntries - countInsts()));
@@ -659,15 +588,6 @@ InstructionQueue<Impl>::insertBarrier(DynInstPtr &barr_inst)
     insertNonSpec(barr_inst);
 }
 
-template <class Impl>
-void
-InstructionQueue<Impl>::advanceTail(DynInstPtr &inst)
-{
-    // Have this instruction set itself as the producer of its destination
-    // register(s).
-    createDependency(inst);
-}
-
 template <class Impl>
 void
 InstructionQueue<Impl>::addToOrderList(OpClass op_class)
@@ -733,8 +653,15 @@ InstructionQueue<Impl>::processFUCompletion(DynInstPtr &inst, int fu_idx)
 
     iewStage->wakeCPU();
 
-    fuPool->freeUnit(fu_idx);
+    if (fu_idx > -1)
+        fuPool->freeUnitNextCycle(fu_idx);
 
+    // @todo: Ensure that these FU Completions happen at the beginning
+    // of a cycle, otherwise they could add too many instructions to
+    // the queue.
+    // @todo: This could break if there are multiple multi-cycle ops
+    // finishing on this cycle.  Maybe implement something like
+    // instToCommit in iew_impl.hh.
     int &size = issueToExecuteQueue->access(0)->size;
 
     issueToExecuteQueue->access(0)->insts[size++] = inst;
@@ -752,20 +679,6 @@ InstructionQueue<Impl>::scheduleReadyInsts()
 
     IssueStruct *i2e_info = issueToExecuteQueue->access(0);
 
-    // Will need to reorder the list if either a queue is not on the list,
-    // or it has an older instruction than last time.
-    for (int i = 0; i < Num_OpClasses; ++i) {
-        if (!readyInsts[i].empty()) {
-            if (!queueOnList[i]) {
-                addToOrderList(OpClass(i));
-            } else if (readyInsts[i].top()->seqNum  <
-                       (*readyIt[i]).oldestInst) {
-                listOrder.erase(readyIt[i]);
-                addToOrderList(OpClass(i));
-            }
-        }
-    }
-
     // Have iterator to head of the list
     // While I haven't exceeded bandwidth or reached the end of the list,
     // Try to get a FU that can do what this op needs.
@@ -779,7 +692,8 @@ InstructionQueue<Impl>::scheduleReadyInsts()
     int total_issued = 0;
     int exec_queue_slot = i2e_info->size;
 
-    while (exec_queue_slot < totalWidth && order_it != order_end_it) {
+    while (exec_queue_slot < totalWidth && total_issued < totalWidth &&
+           order_it != order_end_it) {
         OpClass op_class = (*order_it).queueType;
 
         assert(!readyInsts[op_class].empty());
@@ -805,70 +719,47 @@ InstructionQueue<Impl>::scheduleReadyInsts()
             continue;
         }
 
-        int idx = fuPool->getUnit(op_class);
-
+        int idx = -2;
+        int op_latency = 1;
         int tid = issuing_inst->threadNumber;
 
-        if (idx == -2) {
-            assert(op_class == No_OpClass);
-
-            i2e_info->insts[exec_queue_slot++] = issuing_inst;
-            i2e_info->size++;
-
-            DPRINTF(IQ, "Thread %i: Issuing instruction PC that needs no FU"
-                    " %#x [sn:%lli]\n",
-                    tid, issuing_inst->readPC(),
-                    issuing_inst->seqNum);
-
-            readyInsts[op_class].pop();
-
-            if (!readyInsts[op_class].empty()) {
-                moveToYoungerInst(order_it);
-            } else {
-                readyIt[op_class] = listOrder.end();
-                queueOnList[op_class] = false;
-            }
-
-            issuing_inst->setIssued();
-            ++total_issued;
+        if (op_class != No_OpClass) {
+            idx = fuPool->getUnit(op_class);
 
-            if (!issuing_inst->isMemRef()) {
-                // Memory instructions can not be freed from the IQ until they
-                // complete.
-                ++freeEntries;
-                count[tid]--;
-                issuing_inst->removeInIQ();
-            } else {
-                memDepUnit[tid].issue(issuing_inst);
+            if (idx > -1) {
+                op_latency = fuPool->getOpLatency(op_class);
             }
+        }
 
-            listOrder.erase(order_it++);
-
-            stat_issued_inst_type[tid][op_class]++;
-        } else if (idx != -1) {
-            int op_latency = fuPool->getOpLatency(op_class);
-
+        if (idx == -2 || idx != -1) {
             if (op_latency == 1) {
                 i2e_info->insts[exec_queue_slot++] = issuing_inst;
                 i2e_info->size++;
 
-                // Add the FU onto the list of FU's to be freed next cycle.
-                fuPool->freeUnit(idx);
+                // Add the FU onto the list of FU's to be freed next
+                // cycle if we used one.
+                if (idx >= 0)
+                    fuPool->freeUnitNextCycle(idx);
             } else {
                 int issue_latency = fuPool->getIssueLatency(op_class);
+                // Generate completion event for the FU
+                FUCompletion *execution = new FUCompletion(issuing_inst,
+                                                           idx, this);
 
-                if (issue_latency > 1) {
-                    // Generate completion event for the FU
-                    FUCompletion *execution = new FUCompletion(issuing_inst,
-                                                               idx, this);
+                execution->schedule(curTick + cpu->cycles(issue_latency - 1));
 
-                    execution->schedule(curTick + cpu->cycles(issue_latency - 1));
+                // @todo: Enforce that issue_latency == 1 or op_latency
+                if (issue_latency > 1) {
+                    execution->setFreeFU();
                 } else {
-                    i2e_info->insts[exec_queue_slot++] = issuing_inst;
-                    i2e_info->size++;
+                    // @todo: Not sure I'm accounting for the
+                    // multi-cycle op in a pipelined FU properly, or
+                    // the number of instructions issued in one cycle.
+//                    i2e_info->insts[exec_queue_slot++] = issuing_inst;
+//                    i2e_info->size++;
 
                     // Add the FU onto the list of FU's to be freed next cycle.
-                    fuPool->freeUnit(idx);
+                    fuPool->freeUnitNextCycle(idx);
                 }
             }
 
@@ -900,15 +791,16 @@ InstructionQueue<Impl>::scheduleReadyInsts()
             }
 
             listOrder.erase(order_it++);
-            stat_issued_inst_type[tid][op_class]++;
+            statIssuedInstType[tid][op_class]++;
         } else {
-            stat_fu_busy[op_class]++;
-            fu_busy[tid]++;
+            statFuBusy[op_class]++;
+            fuBusy[tid]++;
             ++order_it;
         }
     }
 
-    n_issued_dist.sample(total_issued);
+    numIssuedDist.sample(total_issued);
+    iqInstsIssued+= total_issued;
 
     if (total_issued) {
         cpu->activityThisCycle();
@@ -930,10 +822,8 @@ InstructionQueue<Impl>::scheduleNonSpec(const InstSeqNum &inst)
 
     unsigned tid = (*inst_it).second->threadNumber;
 
-    // Mark this instruction as ready to issue.
     (*inst_it).second->setCanIssue();
 
-    // Now schedule the instruction.
     if (!(*inst_it).second->isMemRef()) {
         addIfReady((*inst_it).second);
     } else {
@@ -949,7 +839,6 @@ template <class Impl>
 void
 InstructionQueue<Impl>::commit(const InstSeqNum &inst, unsigned tid)
 {
-    /*Need to go through each thread??*/
     DPRINTF(IQ, "[tid:%i]: Committing instructions older than [sn:%i]\n",
             tid,inst);
 
@@ -973,18 +862,13 @@ InstructionQueue<Impl>::wakeDependents(DynInstPtr &completed_inst)
     DPRINTF(IQ, "Waking dependents of completed instruction.\n");
 
     assert(!completed_inst->isSquashed());
-    // Look at the physical destination register of the DynInst
-    // and look it up on the dependency graph.  Then mark as ready
-    // any instructions within the instruction queue.
-    DependencyEntry *curr;
-    DependencyEntry *prev;
 
     // Tell the memory dependence unit to wake any dependents on this
     // instruction if it is a memory instruction.  Also complete the memory
-    // instruction at this point since we know it executed fine.
-    // @todo: Might want to rename "completeMemInst" to
-    // something that indicates that it won't need to be replayed, and call
-    // this earlier.  Might not be a big deal.
+    // instruction at this point since we know it executed without issues.
+    // @todo: Might want to rename "completeMemInst" to something that
+    // indicates that it won't need to be replayed, and call this
+    // earlier.  Might not be a big deal.
     if (completed_inst->isMemRef()) {
         memDepUnit[completed_inst->threadNumber].wakeDependents(completed_inst);
         completeMemInst(completed_inst);
@@ -1010,39 +894,31 @@ InstructionQueue<Impl>::wakeDependents(DynInstPtr &completed_inst)
         DPRINTF(IQ, "Waking any dependents on register %i.\n",
                 (int) dest_reg);
 
-        //Maybe abstract this part into a function.
-        //Go through the dependency chain, marking the registers as ready
-        //within the waiting instructions.
-
-        curr = dependGraph[dest_reg].next;
+        //Go through the dependency chain, marking the registers as
+        //ready within the waiting instructions.
+        DynInstPtr dep_inst = dependGraph.pop(dest_reg);
 
-        while (curr) {
+        while (dep_inst) {
             DPRINTF(IQ, "Waking up a dependent instruction, PC%#x.\n",
-                    curr->inst->readPC());
+                    dep_inst->readPC());
 
             // Might want to give more information to the instruction
-            // so that it knows which of its source registers is ready.
-            // However that would mean that the dependency graph entries
-            // would need to hold the src_reg_idx.
-            curr->inst->markSrcRegReady();
+            // so that it knows which of its source registers is
+            // ready.  However that would mean that the dependency
+            // graph entries would need to hold the src_reg_idx.
+            dep_inst->markSrcRegReady();
 
-            addIfReady(curr->inst);
+            addIfReady(dep_inst);
 
-            DependencyEntry::mem_alloc_counter--;
-
-            prev = curr;
-            curr = prev->next;
-            prev->inst = NULL;
+            dep_inst = dependGraph.pop(dest_reg);
 
             ++dependents;
-
-            delete prev;
         }
 
-        // Reset the head node now that all of its dependents have been woken
-        // up.
-        dependGraph[dest_reg].next = NULL;
-        dependGraph[dest_reg].inst = NULL;
+        // Reset the head node now that all of its dependents have
+        // been woken up.
+        assert(dependGraph.empty(dest_reg));
+        dependGraph.clearInst(dest_reg);
 
         // Mark the scoreboard as having that register ready.
         regScoreboard[dest_reg] = true;
@@ -1058,6 +934,16 @@ InstructionQueue<Impl>::addReadyMemInst(DynInstPtr &ready_inst)
 
     readyInsts[op_class].push(ready_inst);
 
+    // Will need to reorder the list if either a queue is not on the list,
+    // or it has an older instruction than last time.
+    if (!queueOnList[op_class]) {
+        addToOrderList(op_class);
+    } else if (readyInsts[op_class].top()->seqNum  <
+               (*readyIt[op_class]).oldestInst) {
+        listOrder.erase(readyIt[op_class]);
+        addToOrderList(op_class);
+    }
+
     DPRINTF(IQ, "Instruction is ready to issue, putting it onto "
             "the ready list, PC %#x opclass:%i [sn:%lli].\n",
             ready_inst->readPC(), op_class, ready_inst->seqNum);
@@ -1114,10 +1000,6 @@ InstructionQueue<Impl>::squash(unsigned tid)
     // time buffer.
     squashedSeqNum[tid] = fromCommit->commitInfo[tid].doneSeqNum;
 
-    // Setup the squash iterator to point to the tail.
-    squashIt[tid] = instList[tid].end();
-    --squashIt[tid];
-
     // Call doSquash if there are insts in the IQ
     if (count[tid] > 0) {
         doSquash(tid);
@@ -1131,24 +1013,25 @@ template <class Impl>
 void
 InstructionQueue<Impl>::doSquash(unsigned tid)
 {
-    // Make sure the squashed sequence number is valid.
-//    assert(squashedSeqNum[tid] != 0);
+    // Start at the tail.
+    ListIt squash_it = instList[tid].end();
+    --squash_it;
 
     DPRINTF(IQ, "[tid:%i]: Squashing until sequence number %i!\n",
             tid, squashedSeqNum[tid]);
 
     // Squash any instructions younger than the squashed sequence number
     // given.
-    while (squashIt[tid] != instList[tid].end() &&
-           (*squashIt[tid])->seqNum > squashedSeqNum[tid]) {
+    while (squash_it != instList[tid].end() &&
+           (*squash_it)->seqNum > squashedSeqNum[tid]) {
 
-        DynInstPtr squashed_inst = (*squashIt[tid]);
+        DynInstPtr squashed_inst = (*squash_it);
 
         // Only handle the instruction if it actually is in the IQ and
         // hasn't already been squashed in the IQ.
         if (squashed_inst->threadNumber != tid ||
             squashed_inst->isSquashedInIQ()) {
-            --squashIt[tid];
+            --squash_it;
             continue;
         }
 
@@ -1168,27 +1051,23 @@ InstructionQueue<Impl>::doSquash(unsigned tid)
                     PhysRegIndex src_reg =
                         squashed_inst->renamedSrcRegIdx(src_reg_idx);
 
-                    // Only remove it from the dependency graph if it was
-                    // placed there in the first place.
-                    // HACK: This assumes that instructions woken up from the
-                    // dependency chain aren't informed that a specific src
-                    // register has become ready.  This may not always be true
-                    // in the future.
-                    // Instead of doing a linked list traversal, we can just
-                    // remove these squashed instructions either at issue time,
-                    // or when the register is overwritten.  The only downside
-                    // to this is it leaves more room for error.
+                    // Only remove it from the dependency graph if it
+                    // was placed there in the first place.
+
+                    // Instead of doing a linked list traversal, we
+                    // can just remove these squashed instructions
+                    // either at issue time, or when the register is
+                    // overwritten.  The only downside to this is it
+                    // leaves more room for error.
 
                     if (!squashed_inst->isReadySrcRegIdx(src_reg_idx) &&
                         src_reg < numPhysRegs) {
-                        dependGraph[src_reg].remove(squashed_inst);
+                        dependGraph.remove(src_reg, squashed_inst);
                     }
 
 
                     ++iqSquashedOperandsExamined;
                 }
-
-                // Might want to remove producers as well.
             } else {
                 NonSpecMapIt ns_inst_it =
                     nonSpecInsts.find(squashed_inst->seqNum);
@@ -1217,74 +1096,16 @@ InstructionQueue<Impl>::doSquash(unsigned tid)
 
             ++freeEntries;
 
-            if (numThreads > 1) {
-                DPRINTF(IQ, "[tid:%i]: Instruction [sn:%lli] PC %#x "
-                        "squashed.\n",
-                        tid, squashed_inst->seqNum, squashed_inst->readPC());
-            } else {
-                DPRINTF(IQ, "Instruction [sn:%lli] PC %#x squashed.\n",
-                        squashed_inst->seqNum, squashed_inst->readPC());
-            }
+            DPRINTF(IQ, "[tid:%i]: Instruction [sn:%lli] PC %#x "
+                    "squashed.\n",
+                    tid, squashed_inst->seqNum, squashed_inst->readPC());
         }
 
-        instList[tid].erase(squashIt[tid]--);
+        instList[tid].erase(squash_it--);
         ++iqSquashedInstsExamined;
     }
 }
 
-template <class Impl>
-void
-InstructionQueue<Impl>::DependencyEntry::insert(DynInstPtr &new_inst)
-{
-    //Add this new, dependent instruction at the head of the dependency
-    //chain.
-
-    // First create the entry that will be added to the head of the
-    // dependency chain.
-    DependencyEntry *new_entry = new DependencyEntry;
-    new_entry->next = this->next;
-    new_entry->inst = new_inst;
-
-    // Then actually add it to the chain.
-    this->next = new_entry;
-
-    ++mem_alloc_counter;
-}
-
-template <class Impl>
-void
-InstructionQueue<Impl>::DependencyEntry::remove(DynInstPtr &inst_to_remove)
-{
-    DependencyEntry *prev = this;
-    DependencyEntry *curr = this->next;
-
-    // Make sure curr isn't NULL.  Because this instruction is being
-    // removed from a dependency list, it must have been placed there at
-    // an earlier time.  The dependency chain should not be empty,
-    // unless the instruction dependent upon it is already ready.
-    if (curr == NULL) {
-        return;
-    }
-
-    // Find the instruction to remove within the dependency linked list.
-    while (curr->inst != inst_to_remove) {
-        prev = curr;
-        curr = curr->next;
-
-        assert(curr != NULL);
-    }
-
-    // Now remove this instruction from the list.
-    prev->next = curr->next;
-
-    --mem_alloc_counter;
-
-    // Could push this off to the destructor of DependencyEntry
-    curr->inst = NULL;
-
-    delete curr;
-}
-
 template <class Impl>
 bool
 InstructionQueue<Impl>::addToDependents(DynInstPtr &new_inst)
@@ -1313,7 +1134,7 @@ InstructionQueue<Impl>::addToDependents(DynInstPtr &new_inst)
                         "is being added to the dependency chain.\n",
                         new_inst->readPC(), src_reg);
 
-                dependGraph[src_reg].insert(new_inst);
+                dependGraph.insert(src_reg, new_inst);
 
                 // Change the return value to indicate that something
                 // was added to the dependency graph.
@@ -1323,7 +1144,7 @@ InstructionQueue<Impl>::addToDependents(DynInstPtr &new_inst)
                         "became ready before it reached the IQ.\n",
                         new_inst->readPC(), src_reg);
                 // Mark a register ready within the instruction.
-                new_inst->markSrcRegReady();
+                new_inst->markSrcRegReady(src_reg_idx);
             }
         }
     }
@@ -1333,12 +1154,12 @@ InstructionQueue<Impl>::addToDependents(DynInstPtr &new_inst)
 
 template <class Impl>
 void
-InstructionQueue<Impl>::createDependency(DynInstPtr &new_inst)
+InstructionQueue<Impl>::addToProducers(DynInstPtr &new_inst)
 {
-    //Actually nothing really needs to be marked when an
-    //instruction becomes the producer of a register's value,
-    //but for convenience a ptr to the producing instruction will
-    //be placed in the head node of the dependency links.
+    // Nothing really needs to be marked when an instruction becomes
+    // the producer of a register's value, but for convenience a ptr
+    // to the producing instruction will be placed in the head node of
+    // the dependency links.
     int8_t total_dest_regs = new_inst->numDestRegs();
 
     for (int dest_reg_idx = 0;
@@ -1355,12 +1176,12 @@ InstructionQueue<Impl>::createDependency(DynInstPtr &new_inst)
             continue;
         }
 
-        if (dependGraph[dest_reg].next) {
-            dumpDependGraph();
+        if (!dependGraph.empty(dest_reg)) {
+            dependGraph.dump();
             panic("Dependency graph %i not empty!", dest_reg);
         }
 
-        dependGraph[dest_reg].inst = new_inst;
+        dependGraph.setInst(dest_reg, new_inst);
 
         // Mark the scoreboard to say it's not yet ready.
         regScoreboard[dest_reg] = false;
@@ -1371,7 +1192,7 @@ template <class Impl>
 void
 InstructionQueue<Impl>::addIfReady(DynInstPtr &inst)
 {
-    //If the instruction now has all of its source registers
+    // If the instruction now has all of its source registers
     // available, then add it to the list of ready instructions.
     if (inst->readyToIssue()) {
 
@@ -1382,7 +1203,6 @@ InstructionQueue<Impl>::addIfReady(DynInstPtr &inst)
 
             // Message to the mem dependence unit that this instruction has
             // its registers ready.
-
             memDepUnit[inst->threadNumber].regsReady(inst);
 
             return;
@@ -1395,6 +1215,16 @@ InstructionQueue<Impl>::addIfReady(DynInstPtr &inst)
                 inst->readPC(), op_class, inst->seqNum);
 
         readyInsts[op_class].push(inst);
+
+        // Will need to reorder the list if either a queue is not on the list,
+        // or it has an older instruction than last time.
+        if (!queueOnList[op_class]) {
+            addToOrderList(op_class);
+        } else if (readyInsts[op_class].top()->seqNum  <
+                   (*readyIt[op_class]).oldestInst) {
+            listOrder.erase(readyIt[op_class]);
+            addToOrderList(op_class);
+        }
     }
 }
 
@@ -1434,34 +1264,6 @@ InstructionQueue<Impl>::countInsts()
 #endif
 }
 
-template <class Impl>
-void
-InstructionQueue<Impl>::dumpDependGraph()
-{
-    DependencyEntry *curr;
-
-    for (int i = 0; i < numPhysRegs; ++i)
-    {
-        curr = &dependGraph[i];
-
-        if (curr->inst) {
-            cprintf("dependGraph[%i]: producer: %#x [sn:%lli] consumer: ",
-                    i, curr->inst->readPC(), curr->inst->seqNum);
-        } else {
-            cprintf("dependGraph[%i]: No producer. consumer: ", i);
-        }
-
-        while (curr->next != NULL) {
-            curr = curr->next;
-
-            cprintf("%#x [sn:%lli] ",
-                    curr->inst->readPC(), curr->inst->seqNum);
-        }
-
-        cprintf("\n");
-    }
-}
-
 template <class Impl>
 void
 InstructionQueue<Impl>::dumpLists()
@@ -1524,8 +1326,8 @@ InstructionQueue<Impl>::dumpInsts()
                     cprintf("Count:%i\n", valid_num);
                 } else if ((*inst_list_it)->isMemRef() &&
                            !(*inst_list_it)->memOpDone) {
-                    // Loads that have not been marked as executed still count
-                    // towards the total instructions.
+                    // Loads that have not been marked as executed
+                    // still count towards the total instructions.
                     ++valid_num;
                     cprintf("Count:%i\n", valid_num);
                 }
-- 
cgit v1.2.3


From fda6ddbffdfb2dfecf233750c080191141450276 Mon Sep 17 00:00:00 2001
From: Kevin Lim <ktlim@umich.edu>
Date: Fri, 19 May 2006 15:45:06 -0400
Subject: Rename function to be more expressive.

--HG--
extra : convert_revision : 0c01b6d5309e2d09f03631740c9b0c8619ea26c4
---
 cpu/o3/fu_pool.cc | 2 +-
 cpu/o3/fu_pool.hh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'cpu/o3')

diff --git a/cpu/o3/fu_pool.cc b/cpu/o3/fu_pool.cc
index cb7a15061..fb2b5c00d 100644
--- a/cpu/o3/fu_pool.cc
+++ b/cpu/o3/fu_pool.cc
@@ -189,7 +189,7 @@ FUPool::getUnit(OpClass capability)
 }
 
 void
-FUPool::freeUnit(int fu_idx)
+FUPool::freeUnitNextCycle(int fu_idx)
 {
     assert(unitBusy[fu_idx]);
     unitsToBeFreed.push_back(fu_idx);
diff --git a/cpu/o3/fu_pool.hh b/cpu/o3/fu_pool.hh
index 7df5ad5f3..da6fdc802 100644
--- a/cpu/o3/fu_pool.hh
+++ b/cpu/o3/fu_pool.hh
@@ -134,7 +134,7 @@ class FUPool : public SimObject
     int getUnit(OpClass capability);
 
     /** Frees a FU at the end of this cycle. */
-    void freeUnit(int fu_idx);
+    void freeUnitNextCycle(int fu_idx);
 
     /** Frees all FUs on the list. */
     void processFreeUnits();
-- 
cgit v1.2.3


From 1a6f21b8d23494752cdc9d3a8d1c1a2adfd85ccf Mon Sep 17 00:00:00 2001
From: Kevin Lim <ktlim@umich.edu>
Date: Fri, 19 May 2006 15:47:55 -0400
Subject: Remove sat_counter.cc and put its code into sat_counter.hh.

cpu/SConscript:
    Remove sat_counter.cc and push its functions into the .hh file (all functions were three lines or fewer).
cpu/o3/sat_counter.hh:
    Incorporate .cc code into this file.

--HG--
extra : convert_revision : d75b1319292b00b00af1ce377cc0215fd06e6916
---
 cpu/o3/sat_counter.cc | 55 ---------------------------------------------------
 cpu/o3/sat_counter.hh | 22 ++++++++++++++++-----
 2 files changed, 17 insertions(+), 60 deletions(-)
 delete mode 100644 cpu/o3/sat_counter.cc

(limited to 'cpu/o3')

diff --git a/cpu/o3/sat_counter.cc b/cpu/o3/sat_counter.cc
deleted file mode 100644
index b481b4ad2..000000000
--- a/cpu/o3/sat_counter.cc
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright (c) 2005 The Regents of The University of Michigan
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are
- * met: redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer;
- * redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution;
- * neither the name of the copyright holders nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "base/misc.hh"
-#include "cpu/o3/sat_counter.hh"
-
-SatCounter::SatCounter()
-    : initialVal(0), counter(0)
-{
-}
-
-SatCounter::SatCounter(unsigned bits)
-    : initialVal(0), maxVal((1 << bits) - 1), counter(0)
-{
-}
-
-SatCounter::SatCounter(unsigned bits, uint8_t initial_val)
-    : initialVal(initialVal), maxVal((1 << bits) - 1), counter(initial_val)
-{
-    // Check to make sure initial value doesn't exceed the max counter value.
-    if (initial_val > maxVal) {
-        fatal("BP: Initial counter value exceeds max size.");
-    }
-}
-
-void
-SatCounter::setBits(unsigned bits)
-{
-    maxVal = (1 << bits) - 1;
-}
diff --git a/cpu/o3/sat_counter.hh b/cpu/o3/sat_counter.hh
index 1d20a8a8f..d01fd93ce 100644
--- a/cpu/o3/sat_counter.hh
+++ b/cpu/o3/sat_counter.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2005 The Regents of The University of Michigan
+ * Copyright (c) 2005-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -44,25 +44,37 @@ class SatCounter
     /**
      * Constructor for the counter.
      */
-    SatCounter();
+    SatCounter()
+        : initialVal(0), counter(0)
+    { }
 
     /**
      * Constructor for the counter.
      * @param bits How many bits the counter will have.
      */
-    SatCounter(unsigned bits);
+    SatCounter(unsigned bits)
+        : initialVal(0), maxVal((1 << bits) - 1), counter(0)
+    { }
 
     /**
      * Constructor for the counter.
      * @param bits How many bits the counter will have.
      * @param initial_val Starting value for each counter.
      */
-    SatCounter(unsigned bits, uint8_t initial_val);
+    SatCounter(unsigned bits, uint8_t initial_val)
+        : initialVal(initial_val), maxVal((1 << bits) - 1), counter(initial_val)
+    {
+        // Check to make sure initial value doesn't exceed the max
+        // counter value.
+        if (initial_val > maxVal) {
+            fatal("BP: Initial counter value exceeds max size.");
+        }
+    }
 
     /**
      * Sets the number of bits.
      */
-    void setBits(unsigned bits);
+    void setBits(unsigned bits) { maxVal = (1 << bits) - 1; }
 
     void reset() { counter = initialVal; }
 
-- 
cgit v1.2.3


From e3d5588ca70c88318c1e41e438102034c92c561e Mon Sep 17 00:00:00 2001
From: Kevin Lim <ktlim@umich.edu>
Date: Fri, 19 May 2006 15:53:17 -0400
Subject: O3 code update/cleanup.

cpu/o3/commit_impl.hh:
    O3 code update/cleanup.  Fetch fault code no longer needed (see previous checkin).

--HG--
extra : convert_revision : f602e7f978e19b8900dce482f38f9c7a195e94da
---
 cpu/o3/2bit_local_pred.cc   |   2 +-
 cpu/o3/2bit_local_pred.hh   |   2 +-
 cpu/o3/alpha_cpu.hh         |  18 +--
 cpu/o3/bpred_unit.cc        |   2 +-
 cpu/o3/bpred_unit.hh        |   7 +-
 cpu/o3/bpred_unit_impl.hh   |   6 +-
 cpu/o3/comm.hh              |   6 +-
 cpu/o3/commit.hh            |   5 +-
 cpu/o3/commit_impl.hh       |  59 +--------
 cpu/o3/decode.hh            |  12 +-
 cpu/o3/decode_impl.hh       |  14 +-
 cpu/o3/fetch.hh             |  31 ++---
 cpu/o3/fetch_impl.hh        |  29 +---
 cpu/o3/lsq.hh               |  65 ++++++---
 cpu/o3/lsq_impl.hh          | 138 +------------------
 cpu/o3/lsq_unit.hh          | 218 ++++++++----------------------
 cpu/o3/lsq_unit_impl.hh     | 317 +++++++++++++++-----------------------------
 cpu/o3/mem_dep_unit.hh      |   9 +-
 cpu/o3/mem_dep_unit_impl.hh |  20 +--
 cpu/o3/rename.hh            |  32 ++---
 cpu/o3/rename_impl.hh       |  35 ++---
 cpu/o3/rename_map.cc        |  81 +++--------
 cpu/o3/rename_map.hh        |   5 +-
 cpu/o3/rob.hh               |  34 +++--
 cpu/o3/rob_impl.hh          |  38 ++----
 cpu/o3/scoreboard.cc        |   1 +
 cpu/o3/store_set.cc         |   7 +-
 cpu/o3/thread_state.hh      |  95 +++++--------
 28 files changed, 381 insertions(+), 907 deletions(-)

(limited to 'cpu/o3')

diff --git a/cpu/o3/2bit_local_pred.cc b/cpu/o3/2bit_local_pred.cc
index eab98531d..c3fb2fdb8 100644
--- a/cpu/o3/2bit_local_pred.cc
+++ b/cpu/o3/2bit_local_pred.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/cpu/o3/2bit_local_pred.hh b/cpu/o3/2bit_local_pred.hh
index 0dfe53819..cd65978ca 100644
--- a/cpu/o3/2bit_local_pred.hh
+++ b/cpu/o3/2bit_local_pred.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/cpu/o3/alpha_cpu.hh b/cpu/o3/alpha_cpu.hh
index f70793aaa..78ad5f7d8 100644
--- a/cpu/o3/alpha_cpu.hh
+++ b/cpu/o3/alpha_cpu.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -87,7 +87,8 @@ class AlphaFullCPU : public FullO3CPU<Impl>
 
         virtual Status status() const { return thread->status(); }
 
-        virtual void setStatus(Status new_status) { thread->setStatus(new_status); }
+        virtual void setStatus(Status new_status)
+        { thread->setStatus(new_status); }
 
         /// Set the status to Active.  Optional delay indicates number of
         /// cycles to wait before beginning execution.
@@ -168,12 +169,15 @@ class AlphaFullCPU : public FullO3CPU<Impl>
         virtual Fault setMiscRegWithEffect(int misc_reg, const MiscReg &val);
 
         // @todo: Figure out where these store cond failures should go.
-        virtual unsigned readStCondFailures() { return thread->storeCondFailures; }
+        virtual unsigned readStCondFailures()
+        { return thread->storeCondFailures; }
 
-        virtual void setStCondFailures(unsigned sc_failures) { thread->storeCondFailures = sc_failures; }
+        virtual void setStCondFailures(unsigned sc_failures)
+        { thread->storeCondFailures = sc_failures; }
 
 #if FULL_SYSTEM
-        virtual bool inPalMode() { return TheISA::PcPAL(cpu->readPC(thread->tid)); }
+        virtual bool inPalMode()
+        { return TheISA::PcPAL(cpu->readPC(thread->tid)); }
 #endif
 
         // Only really makes sense for old CPU model.  Lots of code
@@ -194,10 +198,6 @@ class AlphaFullCPU : public FullO3CPU<Impl>
 #endif
     };
 
-//    friend class AlphaXC;
-
-//    std::vector<ExecContext *> xcProxies;
-
 #if FULL_SYSTEM
     /** ITB pointer. */
     AlphaITB *itb;
diff --git a/cpu/o3/bpred_unit.cc b/cpu/o3/bpred_unit.cc
index a78dcf463..92344111f 100644
--- a/cpu/o3/bpred_unit.cc
+++ b/cpu/o3/bpred_unit.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/cpu/o3/bpred_unit.hh b/cpu/o3/bpred_unit.hh
index ee7ffc183..b7814b2e9 100644
--- a/cpu/o3/bpred_unit.hh
+++ b/cpu/o3/bpred_unit.hh
@@ -43,12 +43,7 @@
 
 /**
  * Basically a wrapper class to hold both the branch predictor
- * and the BTB.  Right now I'm unsure of the implementation; it would
- * be nicer to have something closer to the CPUPolicy or the Impl where
- * this is just typedefs, but it forces the upper level stages to be
- * aware of the constructors of the BP and the BTB.  The nicer thing
- * to do is have this templated on the Impl, accept the usual Params
- * object, and be able to call the constructors on the BP and BTB.
+ * and the BTB.
  */
 template<class Impl>
 class TwobitBPredUnit
diff --git a/cpu/o3/bpred_unit_impl.hh b/cpu/o3/bpred_unit_impl.hh
index d20b31e55..c37df606b 100644
--- a/cpu/o3/bpred_unit_impl.hh
+++ b/cpu/o3/bpred_unit_impl.hh
@@ -26,13 +26,13 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+#include <list>
+#include <vector>
+
 #include "base/trace.hh"
 #include "base/traceflags.hh"
 #include "cpu/o3/bpred_unit.hh"
 
-#include <vector>
-#include <list>
-
 using namespace std;
 
 template<class Impl>
diff --git a/cpu/o3/comm.hh b/cpu/o3/comm.hh
index 1a8f394ca..c36c58d3d 100644
--- a/cpu/o3/comm.hh
+++ b/cpu/o3/comm.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -169,10 +169,6 @@ struct TimeBufStruct {
         bool commitInsts;
         InstSeqNum squashSeqNum;
 
-        // Extra bit of information so that the LDSTQ only updates when it
-        // needs to.
-        bool commitIsLoad;
-
         // Communication specifically to the IQ to tell the IQ that it can
         // schedule a non-speculative instruction.
         InstSeqNum nonSpecSeqNum;
diff --git a/cpu/o3/commit.hh b/cpu/o3/commit.hh
index 73eccd2b0..66abf8dc6 100644
--- a/cpu/o3/commit.hh
+++ b/cpu/o3/commit.hh
@@ -30,10 +30,10 @@
 #define __CPU_O3_COMMIT_HH__
 
 #include "arch/faults.hh"
-#include "cpu/inst_seq.hh"
 #include "base/statistics.hh"
 #include "base/timebuf.hh"
 #include "cpu/exetrace.hh"
+#include "cpu/inst_seq.hh"
 #include "mem/memory_interface.hh"
 
 template <class>
@@ -59,8 +59,7 @@ class O3ThreadState;
  * squashing instruction's sequence number, and only broadcasting a
  * redirect if it corresponds to an older instruction. Commit also
  * supports multiple cycle squashing, to model a ROB that can only
- * remove a certain number of instructions per cycle. Eventually traps
- * and interrupts will most likely be handled here as well.
+ * remove a certain number of instructions per cycle.
  */
 template<class Impl>
 class DefaultCommit
diff --git a/cpu/o3/commit_impl.hh b/cpu/o3/commit_impl.hh
index 170f5b01f..346a8bc1c 100644
--- a/cpu/o3/commit_impl.hh
+++ b/cpu/o3/commit_impl.hh
@@ -27,12 +27,7 @@
  */
 
 #include <algorithm>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <iomanip>
-#include <stdio.h>
-#include <string.h>
+#include <string>
 
 #include "base/loader/symtab.hh"
 #include "base/timebuf.hh"
@@ -835,58 +830,6 @@ DefaultCommit<Impl>::commitInsts()
     unsigned num_committed = 0;
 
     DynInstPtr head_inst;
-#if FULL_SYSTEM
-    // Not the best way to check if the front end is empty, but it should
-    // work.
-    // @todo: Try to avoid directly accessing fetch.
-    if (commitStatus[0] == FetchTrapPending && rob->isEmpty()) {
-        DPRINTF(Commit, "Fault from fetch is pending.\n");
-
-        fetchTrapWait++;
-        if (fetchTrapWait > 10000000) {
-            panic("Fetch trap has been pending for a long time!");
-        }
-        if (fetchFaultTick > curTick) {
-            DPRINTF(Commit, "Not enough cycles since fault, fault will "
-                    "happen on %lli\n",
-                    fetchFaultTick);
-            cpu->activityThisCycle();
-            return;
-        } else if (iewStage->hasStoresToWB()) {
-            DPRINTF(Commit, "IEW still has stores to WB.  Waiting until "
-                    "they are completed. fetchTrapWait:%i\n",
-                    fetchTrapWait);
-            cpu->activityThisCycle();
-            return;
-        } else if (cpu->inPalMode(readPC())) {
-            DPRINTF(Commit, "In pal mode right now. fetchTrapWait:%i\n",
-                    fetchTrapWait);
-            return;
-        } else if (fetchStage->getYoungestSN() > youngestSeqNum[0]) {
-            DPRINTF(Commit, "Waiting for front end to drain. fetchTrapWait:%i\n",
-                    fetchTrapWait);
-            return;
-        }
-        fetchTrapWait = 0;
-        DPRINTF(Commit, "ROB is empty, handling fetch trap.\n");
-
-        assert(!thread[0]->inSyscall);
-
-        thread[0]->inSyscall = true;
-
-        // Consider holding onto the trap and waiting until the trap event
-        // happens for this to be executed.
-        cpu->trap(fetchFault, 0);
-
-        // Exit state update mode to avoid accidental updating.
-        thread[0]->inSyscall = false;
-
-        commitStatus[0] = TrapPending;
-        // Set it up so that we squash next cycle
-        trapSquash[0] = true;
-        return;
-    }
-#endif
 
     // Commit as many instructions as possible until the commit bandwidth
     // limit is reached, or it becomes impossible to commit any more.
diff --git a/cpu/o3/decode.hh b/cpu/o3/decode.hh
index 3f3f68247..3035b3387 100644
--- a/cpu/o3/decode.hh
+++ b/cpu/o3/decode.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -35,11 +35,11 @@
 #include "base/timebuf.hh"
 
 /**
- * DefaultDecode class handles both single threaded and SMT decode. Its width is
- * specified by the parameters; each cycles it tries to decode that many
- * instructions. Because instructions are actually decoded when the StaticInst
- * is created, this stage does not do much other than check any PC-relative
- * branches.
+ * DefaultDecode class handles both single threaded and SMT
+ * decode. Its width is specified by the parameters; each cycle it
+ * tries to decode that many instructions. Because instructions are
+ * actually decoded when the StaticInst is created, this stage does
+ * not do much other than check any PC-relative branches.
  */
 template<class Impl>
 class DefaultDecode
diff --git a/cpu/o3/decode_impl.hh b/cpu/o3/decode_impl.hh
index a419a8932..2ed7ec6fc 100644
--- a/cpu/o3/decode_impl.hh
+++ b/cpu/o3/decode_impl.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -39,7 +39,6 @@ DefaultDecode<Impl>::DefaultDecode(Params *params)
       decodeWidth(params->decodeWidth),
       numThreads(params->numberOfThreads)
 {
-    DPRINTF(Decode, "decodeWidth=%i.\n", decodeWidth);
     _status = Inactive;
 
     for (int i = 0; i < numThreads; ++i) {
@@ -249,8 +248,6 @@ template<class Impl>
 bool
 DefaultDecode<Impl>::unblock(unsigned tid)
 {
-    DPRINTF(Decode, "[tid:%u]: Trying to unblock.\n", tid);
-
     // Decode is done unblocking only if the skid buffer is empty.
     if (skidBuffer[tid].empty()) {
         DPRINTF(Decode, "[tid:%u]: Done unblocking.\n", tid);
@@ -261,6 +258,8 @@ DefaultDecode<Impl>::unblock(unsigned tid)
         return true;
     }
 
+    DPRINTF(Decode, "[tid:%u]: Currently unblocking.\n", tid);
+
     return false;
 }
 
@@ -318,6 +317,7 @@ DefaultDecode<Impl>::squash(unsigned tid)
         // In syscall emulation, we can have both a block and a squash due
         // to a syscall in the same cycle.  This would cause both signals to
         // be high.  This shouldn't happen in full system.
+        // @todo: Determine if this still happens.
         if (toFetch->decodeBlock[tid]) {
             toFetch->decodeBlock[tid] = 0;
         } else {
@@ -372,7 +372,7 @@ DefaultDecode<Impl>::skidInsert(unsigned tid)
         skidBuffer[tid].push(inst);
     }
 
-    // Eventually need to enforce this by not letting a thread
+    // @todo: Eventually need to enforce this by not letting a thread
     // fetch past its skidbuffer
     assert(skidBuffer[tid].size() <= skidBufferMax);
 }
@@ -436,10 +436,10 @@ void
 DefaultDecode<Impl>::sortInsts()
 {
     int insts_from_fetch = fromFetch->size;
-
+#ifdef DEBUG
     for (int i=0; i < numThreads; i++)
         assert(insts[i].empty());
-
+#endif
     for (int i = 0; i < insts_from_fetch; ++i) {
         insts[fromFetch->insts[i]->threadNumber].push(fromFetch->insts[i]);
     }
diff --git a/cpu/o3/fetch.hh b/cpu/o3/fetch.hh
index b03d4afe3..3fcfdc3a1 100644
--- a/cpu/o3/fetch.hh
+++ b/cpu/o3/fetch.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -38,12 +38,12 @@
 class Sampler;
 
 /**
- * DefaultFetch class handles both single threaded and SMT fetch. Its width is
- * specified by the parameters; each cycle it tries to fetch that many
- * instructions. It supports using a branch predictor to predict direction and
- * targets.
- * It supports the idling functionalitiy of the CPU by indicating to the CPU
- * when it is active and inactive.
+ * DefaultFetch class handles both single threaded and SMT fetch. Its
+ * width is specified by the parameters; each cycle it tries to fetch
+ * that many instructions. It supports using a branch predictor to
+ * predict direction and targets.
+ * It supports the idling functionality of the CPU by indicating to
+ * the CPU when it is active and inactive.
  */
 template <class Impl>
 class DefaultFetch
@@ -66,8 +66,8 @@ class DefaultFetch
     typedef TheISA::ExtMachInst ExtMachInst;
 
   public:
-    /** Overall fetch status. Used to determine if the CPU can deschedule itsef
-     * due to a lack of activity.
+    /** Overall fetch status. Used to determine if the CPU can
+     * deschedule itself due to a lack of activity.
      */
     enum FetchStatus {
         Active,
@@ -174,13 +174,13 @@ class DefaultFetch
     void wakeFromQuiesce();
 
   private:
-    /** Changes the status of this stage to active, and indicates this to the
-     * CPU.
+    /** Changes the status of this stage to active, and indicates this
+     * to the CPU.
      */
     inline void switchToActive();
 
-    /** Changes the status of this stage to inactive, and indicates this to the
-     * CPU.
+    /** Changes the status of this stage to inactive, and indicates
+     * this to the CPU.
      */
     inline void switchToInactive();
 
@@ -373,11 +373,6 @@ class DefaultFetch
 
     bool switchedOut;
 
-  public:
-    InstSeqNum &getYoungestSN() { return youngestSN; }
-  private:
-    InstSeqNum youngestSN;
-
 #if !FULL_SYSTEM
     /** Page table pointer. */
 //    PageTable *pTable;
diff --git a/cpu/o3/fetch_impl.hh b/cpu/o3/fetch_impl.hh
index 523719945..1c5e508f6 100644
--- a/cpu/o3/fetch_impl.hh
+++ b/cpu/o3/fetch_impl.hh
@@ -938,10 +938,6 @@ DefaultFetch<Impl>::fetch(bool &status_change)
         DPRINTF(Fetch, "[tid:%i]: Adding instructions to queue to "
                 "decode.\n",tid);
 
-        //////////////////////////
-        // Fetch first instruction
-        //////////////////////////
-
         // Need to keep track of whether or not a predicted branch
         // ended this fetch block.
         bool predicted_branch = false;
@@ -1004,7 +1000,8 @@ DefaultFetch<Impl>::fetch(bool &status_change)
             fetch_PC = next_PC;
 
             if (instruction->isQuiesce()) {
-                warn("%lli: Quiesce instruction encountered, halting fetch!", curTick);
+                warn("%lli: Quiesce instruction encountered, halting fetch!",
+                     curTick);
                 fetchStatus[tid] = QuiescePending;
                 ++numInst;
                 status_change = true;
@@ -1022,24 +1019,20 @@ DefaultFetch<Impl>::fetch(bool &status_change)
     // Now that fetching is completed, update the PC to signify what the next
     // cycle will be.
     if (fault == NoFault) {
-
         DPRINTF(Fetch, "[tid:%i]: Setting PC to %08p.\n",tid, next_PC);
 
-
         PC[tid] = next_PC;
         nextPC[tid] = next_PC + instSize;
     } else {
-        // If the issue was an icache miss, then we can just return and
-        // wait until it is handled.
+        // We shouldn't be in an icache miss and also have a fault (an ITB
+        // miss)
         if (fetchStatus[tid] == IcacheMissStall) {
             panic("Fetch should have exited prior to this!");
         }
 
-        // Handle the fault.
-        // This stage will not be able to continue until all the ROB
-        // slots are empty, at which point the fault can be handled.
-        // The only other way it can wake up is if a squash comes along
-        // and changes the PC.
+        // Send the fault to commit.  This thread will not do anything
+        // until commit handles the fault.  The only other way it can
+        // wake up is if a squash comes along and changes the PC.
 #if FULL_SYSTEM
         assert(numInst != fetchWidth);
         // Get a sequence number.
@@ -1067,20 +1060,12 @@ DefaultFetch<Impl>::fetch(bool &status_change)
         toDecode->insts[numInst] = instruction;
         toDecode->size++;
 
-        // Tell the commit stage the fault we had.
-//        toDecode->fetchFault = fault;
-//        toDecode->fetchFaultSN = cpu->globalSeqNum;
-
         DPRINTF(Fetch, "[tid:%i]: Blocked, need to handle the trap.\n",tid);
 
         fetchStatus[tid] = TrapPending;
         status_change = true;
 
         warn("%lli fault (%d) detected @ PC %08p", curTick, fault, PC[tid]);
-//        cpu->trap(fault);
-        // Send a signal to the ROB indicating that there's a trap from the
-        // fetch stage that needs to be handled.  Need to indicate that
-        // there's a fault, and the fault type.
 #else // !FULL_SYSTEM
         fatal("fault (%d) detected @ PC %08p", fault, PC[tid]);
 #endif // FULL_SYSTEM
diff --git a/cpu/o3/lsq.hh b/cpu/o3/lsq.hh
index d5f893e57..a1eeccbe7 100644
--- a/cpu/o3/lsq.hh
+++ b/cpu/o3/lsq.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -32,10 +32,9 @@
 #include <map>
 #include <queue>
 
-#include "base/hashmap.hh"
 #include "config/full_system.hh"
 #include "cpu/inst_seq.hh"
-#include "cpu/o3/cpu_policy.hh"
+//#include "cpu/o3/cpu_policy.hh"
 #include "cpu/o3/lsq_unit.hh"
 #include "mem/mem_interface.hh"
 //#include "mem/page_table.hh"
@@ -85,7 +84,8 @@ class LSQ {
     /** Ticks the LSQ. */
     void tick();
     /** Ticks a specific LSQ Unit. */
-    void tick(unsigned tid);
+    void tick(unsigned tid)
+    { thread[tid].tick(); }
 
     /** Inserts a load into the LSQ. */
     void insertLoad(DynInstPtr &load_inst);
@@ -95,18 +95,23 @@ class LSQ {
     /** Executes a load. */
     Fault executeLoad(DynInstPtr &inst);
 
-    Fault executeLoad(int lq_idx, unsigned tid);
+    Fault executeLoad(int lq_idx, unsigned tid)
+    { return thread[tid].executeLoad(lq_idx); }
+
     /** Executes a store. */
     Fault executeStore(DynInstPtr &inst);
 
     /**
      * Commits loads up until the given sequence number for a specific thread.
      */
-    void commitLoads(InstSeqNum &youngest_inst, unsigned tid);
+    void commitLoads(InstSeqNum &youngest_inst, unsigned tid)
+    { thread[tid].commitLoads(youngest_inst); }
+
     /**
      * Commits stores up until the given sequence number for a specific thread.
      */
-    void commitStores(InstSeqNum &youngest_inst, unsigned tid);
+    void commitStores(InstSeqNum &youngest_inst, unsigned tid)
+    { thread[tid].commitStores(youngest_inst); }
 
     /**
      * Attempts to write back stores until all cache ports are used or the
@@ -119,7 +124,8 @@ class LSQ {
     /**
      * Squash instructions from a thread until the specified sequence number.
      */
-    void squash(const InstSeqNum &squashed_num, unsigned tid);
+    void squash(const InstSeqNum &squashed_num, unsigned tid)
+    { thread[tid].squash(squashed_num); }
 
     /** Returns whether or not there was a memory ordering violation. */
     bool violation();
@@ -127,12 +133,14 @@ class LSQ {
      * Returns whether or not there was a memory ordering violation for a
      * specific thread.
      */
-    bool violation(unsigned tid);
+    bool violation(unsigned tid)
+    { return thread[tid].violation(); }
 
     /** Returns if a load is blocked due to the memory system for a specific
      *  thread.
      */
-    bool loadBlocked(unsigned tid);
+    bool loadBlocked(unsigned tid)
+    { return thread[tid].loadBlocked(); }
 
     bool isLoadBlockedHandled(unsigned tid)
     { return thread[tid].isLoadBlockedHandled(); }
@@ -141,10 +149,13 @@ class LSQ {
     { thread[tid].setLoadBlockedHandled(); }
 
     /** Gets the instruction that caused the memory ordering violation. */
-    DynInstPtr getMemDepViolator(unsigned tid);
+    DynInstPtr getMemDepViolator(unsigned tid)
+    { return thread[tid].getMemDepViolator(); }
 
     /** Returns the head index of the load queue for a specific thread. */
-    int getLoadHead(unsigned tid);
+    int getLoadHead(unsigned tid)
+    { return thread[tid].getLoadHead(); }
+
     /** Returns the sequence number of the head of the load queue. */
     InstSeqNum getLoadHeadSeqNum(unsigned tid)
     {
@@ -152,7 +163,9 @@ class LSQ {
     }
 
     /** Returns the head index of the store queue. */
-    int getStoreHead(unsigned tid);
+    int getStoreHead(unsigned tid)
+    { return thread[tid].getStoreHead(); }
+
     /** Returns the sequence number of the head of the store queue. */
     InstSeqNum getStoreHeadSeqNum(unsigned tid)
     {
@@ -162,22 +175,26 @@ class LSQ {
     /** Returns the number of instructions in all of the queues. */
     int getCount();
     /** Returns the number of instructions in the queues of one thread. */
-    int getCount(unsigned tid);
+    int getCount(unsigned tid)
+    { return thread[tid].getCount(); }
 
     /** Returns the total number of loads in the load queue. */
     int numLoads();
     /** Returns the total number of loads for a single thread. */
-    int numLoads(unsigned tid);
+    int numLoads(unsigned tid)
+    { return thread[tid].numLoads(); }
 
     /** Returns the total number of stores in the store queue. */
     int numStores();
     /** Returns the total number of stores for a single thread. */
-    int numStores(unsigned tid);
+    int numStores(unsigned tid)
+    { return thread[tid].numStores(); }
 
     /** Returns the total number of loads that are ready. */
     int numLoadsReady();
     /** Returns the number of loads that are ready for a single thread. */
-    int numLoadsReady(unsigned tid);
+    int numLoadsReady(unsigned tid)
+    { return thread[tid].numLoadsReady(); }
 
     /** Returns the number of free entries. */
     unsigned numFreeEntries();
@@ -215,24 +232,30 @@ class LSQ {
 
     /** Returns whether or not there are any stores to write back to memory. */
     bool hasStoresToWB();
+
     /** Returns whether or not a specific thread has any stores to write back
      * to memory.
      */
-    bool hasStoresToWB(unsigned tid);
+    bool hasStoresToWB(unsigned tid)
+    { return thread[tid].hasStoresToWB(); }
+
     /** Returns the number of stores a specific thread has to write back. */
-    int  numStoresToWB(unsigned tid);
+    int  numStoresToWB(unsigned tid)
+    { return thread[tid].numStoresToWB(); }
 
     /** Returns if the LSQ will write back to memory this cycle. */
     bool willWB();
     /** Returns if the LSQ of a specific thread will write back to memory this
      * cycle.
      */
-    bool willWB(unsigned tid);
+    bool willWB(unsigned tid)
+    { return thread[tid].willWB(); }
 
     /** Debugging function to print out all instructions. */
     void dumpInsts();
     /** Debugging function to print out instructions from a specific thread. */
-    void dumpInsts(unsigned tid);
+    void dumpInsts(unsigned tid)
+    { thread[tid].dumpInsts(); }
 
     /** Executes a read operation, using the load specified at the load index. */
     template <class T>
diff --git a/cpu/o3/lsq_impl.hh b/cpu/o3/lsq_impl.hh
index c43c19619..a6ad27522 100644
--- a/cpu/o3/lsq_impl.hh
+++ b/cpu/o3/lsq_impl.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -26,6 +26,9 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+#include <algorithm>
+#include <string>
+
 #include "cpu/o3/lsq.hh"
 
 using namespace std;
@@ -89,7 +92,7 @@ LSQ<Impl>::LSQ(Params *params)
 
     //Initialize LSQs
     for (int tid=0; tid < numThreads; tid++) {
-        thread[tid].init(params, maxLQEntries+1, maxSQEntries+1, tid);
+        thread[tid].init(params, maxLQEntries, maxSQEntries, tid);
     }
 }
 
@@ -226,13 +229,6 @@ LSQ<Impl>::tick()
     }
 }
 
-template<class Impl>
-void
-LSQ<Impl>::tick(unsigned tid)
-{
-    thread[tid].tick();
-}
-
 template<class Impl>
 void
 LSQ<Impl>::insertLoad(DynInstPtr &load_inst)
@@ -260,13 +256,6 @@ LSQ<Impl>::executeLoad(DynInstPtr &inst)
     return thread[tid].executeLoad(inst);
 }
 
-template<class Impl>
-Fault
-LSQ<Impl>::executeLoad(int lq_idx, unsigned tid)
-{
-    return thread[tid].executeLoad(lq_idx);
-}
-
 template<class Impl>
 Fault
 LSQ<Impl>::executeStore(DynInstPtr &inst)
@@ -276,20 +265,6 @@ LSQ<Impl>::executeStore(DynInstPtr &inst)
     return thread[tid].executeStore(inst);
 }
 
-template<class Impl>
-void
-LSQ<Impl>::commitLoads(InstSeqNum &youngest_inst,unsigned tid)
-{
-    thread[tid].commitLoads(youngest_inst);
-}
-
-template<class Impl>
-void
-LSQ<Impl>::commitStores(InstSeqNum &youngest_inst,unsigned tid)
-{
-    thread[tid].commitStores(youngest_inst);
-}
-
 template<class Impl>
 void
 LSQ<Impl>::writebackStores()
@@ -300,28 +275,14 @@ LSQ<Impl>::writebackStores()
         unsigned tid = *active_threads++;
 
         if (numStoresToWB(tid) > 0) {
-            DPRINTF(Writeback,"[tid:%i] Writing back stores. %i stores available"
-                " for Writeback.\n", tid, numStoresToWB(tid));
+            DPRINTF(Writeback,"[tid:%i] Writing back stores. %i stores "
+                "available for Writeback.\n", tid, numStoresToWB(tid));
         }
 
         thread[tid].writebackStores();
     }
 }
 
-template<class Impl>
-int
-LSQ<Impl>::numStoresToWB(unsigned tid)
-{
-    return thread[tid].numStoresToWB();
-}
-
-template<class Impl>
-void
-LSQ<Impl>::squash(const InstSeqNum &squashed_num, unsigned tid)
-{
-        thread[tid].squash(squashed_num);
-}
-
 template<class Impl>
 bool
 LSQ<Impl>::violation()
@@ -338,41 +299,6 @@ LSQ<Impl>::violation()
     return false;
 }
 
-template<class Impl>
-bool
-LSQ<Impl>::violation(unsigned tid)
-{
-    return thread[tid].violation();
-}
-
-template<class Impl>
-bool
-LSQ<Impl>::loadBlocked(unsigned tid)
-{
-    return thread[tid].loadBlocked();
-}
-
-template<class Impl>
-typename Impl::DynInstPtr
-LSQ<Impl>::getMemDepViolator(unsigned tid)
-{
-    return thread[tid].getMemDepViolator();
-}
-
-template<class Impl>
-int
-LSQ<Impl>::getLoadHead(unsigned tid)
-{
-    return thread[tid].getLoadHead();
-}
-
-template<class Impl>
-int
-LSQ<Impl>::getStoreHead(unsigned tid)
-{
-    return thread[tid].getStoreHead();
-}
-
 template<class Impl>
 int
 LSQ<Impl>::getCount()
@@ -389,13 +315,6 @@ LSQ<Impl>::getCount()
     return total;
 }
 
-template<class Impl>
-int
-LSQ<Impl>::getCount(unsigned tid)
-{
-    return thread[tid].getCount();
-}
-
 template<class Impl>
 int
 LSQ<Impl>::numLoads()
@@ -412,13 +331,6 @@ LSQ<Impl>::numLoads()
     return total;
 }
 
-template<class Impl>
-int
-LSQ<Impl>::numLoads(unsigned tid)
-{
-    return thread[tid].numLoads();
-}
-
 template<class Impl>
 int
 LSQ<Impl>::numStores()
@@ -435,13 +347,6 @@ LSQ<Impl>::numStores()
     return total;
 }
 
-template<class Impl>
-int
-LSQ<Impl>::numStores(unsigned tid)
-{
-    return thread[tid].numStores();
-}
-
 template<class Impl>
 int
 LSQ<Impl>::numLoadsReady()
@@ -458,13 +363,6 @@ LSQ<Impl>::numLoadsReady()
     return total;
 }
 
-template<class Impl>
-int
-LSQ<Impl>::numLoadsReady(unsigned tid)
-{
-    return thread[tid].numLoadsReady();
-}
-
 template<class Impl>
 unsigned
 LSQ<Impl>::numFreeEntries()
@@ -612,14 +510,6 @@ LSQ<Impl>::hasStoresToWB()
     return true;
 }
 
-
-template<class Impl>
-bool
-LSQ<Impl>::hasStoresToWB(unsigned tid)
-{
-    return thread[tid].hasStoresToWB();
-}
-
 template<class Impl>
 bool
 LSQ<Impl>::willWB()
@@ -635,13 +525,6 @@ LSQ<Impl>::willWB()
     return true;
 }
 
-template<class Impl>
-bool
-LSQ<Impl>::willWB(unsigned tid)
-{
-    return thread[tid].willWB();
-}
-
 template<class Impl>
 void
 LSQ<Impl>::dumpInsts()
@@ -653,10 +536,3 @@ LSQ<Impl>::dumpInsts()
         thread[tid].dumpInsts();
     }
 }
-
-template<class Impl>
-void
-LSQ<Impl>::dumpInsts(unsigned tid)
-{
-    thread[tid].dumpInsts();
-}
diff --git a/cpu/o3/lsq_unit.hh b/cpu/o3/lsq_unit.hh
index 623dbdb4b..942b4583d 100644
--- a/cpu/o3/lsq_unit.hh
+++ b/cpu/o3/lsq_unit.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -29,29 +29,30 @@
 #ifndef __CPU_O3_LSQ_UNIT_HH__
 #define __CPU_O3_LSQ_UNIT_HH__
 
+#include <algorithm>
 #include <map>
 #include <queue>
-#include <algorithm>
 
+#include "arch/faults.hh"
 #include "config/full_system.hh"
 #include "base/hashmap.hh"
 #include "cpu/inst_seq.hh"
 #include "mem/mem_interface.hh"
 //#include "mem/page_table.hh"
-#include "sim/debug.hh"
-#include "sim/sim_object.hh"
-#include "arch/faults.hh"
+//#include "sim/debug.hh"
+//#include "sim/sim_object.hh"
 
 /**
- * Class that implements the actual LQ and SQ for each specific thread.
- * Both are circular queues; load entries are freed upon committing, while
- * store entries are freed once they writeback. The LSQUnit tracks if there
- * are memory ordering violations, and also detects partial load to store
- * forwarding cases (a store only has part of a load's data) that requires
- * the load to wait until the store writes back. In the former case it
- * holds onto the instruction until the dependence unit looks at it, and
- * in the latter it stalls the LSQ until the store writes back. At that
- * point the load is replayed.
+ * Class that implements the actual LQ and SQ for each specific
+ * thread.  Both are circular queues; load entries are freed upon
+ * committing, while store entries are freed once they writeback. The
+ * LSQUnit tracks if there are memory ordering violations, and also
+ * detects partial load to store forwarding cases (a store only has
+ * part of a load's data) that requires the load to wait until the
+ * store writes back. In the former case it holds onto the instruction
+ * until the dependence unit looks at it, and in the latter it stalls
+ * the LSQ until the store writes back. At that point the load is
+ * replayed.
  */
 template <class Impl>
 class LSQUnit {
@@ -76,21 +77,19 @@ class LSQUnit {
         /** Returns the description of this event. */
         const char *description();
 
-      private:
-        /** The store index of the store being written back. */
-        int storeIdx;
         /** The writeback event for the store.  Needed for store
          * conditionals.
          */
-      public:
         Event *wbEvent;
+
+      private:
+        /** The store index of the store being written back. */
+        int storeIdx;
       private:
         /** The pointer to the LSQ unit that issued the store. */
         LSQUnit<Impl> *lsqPtr;
     };
 
-    friend class StoreCompletionEvent;
-
   public:
     /** Constructs an LSQ unit. init() must be called prior to use. */
     LSQUnit();
@@ -136,14 +135,12 @@ class LSQUnit {
     /** Executes a load instruction. */
     Fault executeLoad(DynInstPtr &inst);
 
-    Fault executeLoad(int lq_idx);
+    Fault executeLoad(int lq_idx) { panic("Not implemented"); return NoFault; }
     /** Executes a store instruction. */
     Fault executeStore(DynInstPtr &inst);
 
     /** Commits the head load. */
     void commitLoad();
-    /** Commits a specific load, given by the sequence number. */
-    void commitLoad(InstSeqNum &inst);
     /** Commits loads older than a specific sequence number. */
     void commitLoads(InstSeqNum &youngest_inst);
 
@@ -179,9 +176,7 @@ class LSQUnit {
     /** Returns the memory ordering violator. */
     DynInstPtr getMemDepViolator();
 
-    /** Returns if a load became blocked due to the memory system.  It clears
-     *  the bool's value upon this being called.
-     */
+    /** Returns if a load became blocked due to the memory system. */
     bool loadBlocked()
     { return isLoadBlocked; }
 
@@ -215,9 +210,6 @@ class LSQUnit {
     /** Returns if the SQ is full. */
     bool sqFull() { return stores >= (SQEntries - 1); }
 
-    /** Debugging function to dump instructions in the LSQ. */
-    void dumpInsts();
-
     /** Returns the number of instructions in the LSQ. */
     unsigned getCount() { return loads + stores; }
 
@@ -245,6 +237,10 @@ class LSQUnit {
     /** Decrements the given load index (circular queue). */
     inline void decrLdIdx(int &load_idx);
 
+  public:
+    /** Debugging function to dump instructions in the LSQ. */
+    void dumpInsts();
+
   private:
     /** Pointer to the CPU. */
     FullCPU *cpu;
@@ -287,38 +283,29 @@ class LSQUnit {
         /** Whether or not the store is completed. */
         bool completed;
     };
-/*
-    enum Status {
-        Running,
-        Idle,
-        DcacheMissStall,
-        DcacheMissSwitch
-    };
-*/
+
   private:
     /** The LSQUnit thread id. */
     unsigned lsqID;
 
-    /** The status of the LSQ unit. */
-//    Status _status;
-
     /** The store queue. */
     std::vector<SQEntry> storeQueue;
 
     /** The load queue. */
     std::vector<DynInstPtr> loadQueue;
 
-    // Consider making these 16 bits
-    /** The number of LQ entries. */
+    /** The number of LQ entries, plus a sentinel entry (circular queue).
+     *  @todo: Consider a variable that records the true number of LQ entries.
+     */
     unsigned LQEntries;
-    /** The number of SQ entries. */
+    /** The number of SQ entries, plus a sentinel entry (circular queue).
+     *  @todo: Consider a variable that records the true number of SQ entries.
+     */
     unsigned SQEntries;
 
     /** The number of load instructions in the LQ. */
     int loads;
-    /** The number of store instructions in the SQ (excludes those waiting to
-     * writeback).
-     */
+    /** The number of store instructions in the SQ. */
     int stores;
     /** The number of store instructions in the SQ waiting to writeback. */
     int storesToWB;
@@ -330,8 +317,8 @@ class LSQUnit {
 
     /** The index of the head instruction in the SQ. */
     int storeHead;
-    /** The index of the first instruction that is ready to be written back,
-     * and has not yet been written back.
+    /** The index of the first instruction that may be ready to be
+     * written back, and has not yet been written back.
      */
     int storeWBIdx;
     /** The index of the tail instruction in the SQ. */
@@ -348,13 +335,9 @@ class LSQUnit {
 
     //list<InstSeqNum> mshrSeqNums;
 
-     //Stats::Scalar<> dcacheStallCycles;
-    Counter lastDcacheStall;
-
     /** Wire to read information from the issue stage time queue. */
     typename TimeBuffer<IssueStruct>::wire fromIssue;
 
-    // Make these per thread?
     /** Whether or not the LSQ is stalled. */
     bool stalled;
     /** The store that causes the stall due to partial store to load
@@ -364,20 +347,13 @@ class LSQUnit {
     /** The index of the above store. */
     int stallingLoadIdx;
 
-    /** Whether or not a load is blocked due to the memory system.  It is
-     *  cleared when this value is checked via loadBlocked().
-     */
+    /** Whether or not a load is blocked due to the memory system. */
     bool isLoadBlocked;
 
     bool loadBlockedHandled;
 
     InstSeqNum blockedLoadSeqNum;
 
-    /** The oldest faulting load instruction. */
-    DynInstPtr loadFaultInst;
-    /** The oldest faulting store instruction. */
-    DynInstPtr storeFaultInst;
-
     /** The oldest load that caused a memory ordering violation. */
     DynInstPtr memDepViolator;
 
@@ -447,23 +423,14 @@ template <class T>
 Fault
 LSQUnit<Impl>::read(MemReqPtr &req, T &data, int load_idx)
 {
-    //Depending on issue2execute delay a squashed load could
-    //execute if it is found to be squashed in the same
-    //cycle it is scheduled to execute
     assert(loadQueue[load_idx]);
 
-    if (loadQueue[load_idx]->isExecuted()) {
-        panic("Should not reach this point with split ops!");
-        memcpy(&data,req->data,req->size);
-
-        return NoFault;
-    }
+    assert(!loadQueue[load_idx]->isExecuted());
 
     // Make sure this isn't an uncacheable access
     // A bit of a hackish way to get uncached accesses to work only if they're
     // at the head of the LSQ and are ready to commit (at the head of the ROB
     // too).
-    // @todo: Fix uncached accesses.
     if (req->flags & UNCACHEABLE &&
         (load_idx != loadHead || !loadQueue[load_idx]->reachedCommit)) {
         iewStage->rescheduleMemInst(loadQueue[load_idx]);
@@ -479,12 +446,16 @@ LSQUnit<Impl>::read(MemReqPtr &req, T &data, int load_idx)
             "storeHead: %i addr: %#x\n",
             load_idx, store_idx, storeHead, req->paddr);
 
-#ifdef FULL_SYSTEM
+#if 0
     if (req->flags & LOCKED) {
         cpu->lockAddr = req->paddr;
         cpu->lockFlag = true;
     }
 #endif
+            req->cmd = Read;
+            assert(!req->completionEvent);
+            req->completionEvent = NULL;
+            req->time = curTick;
 
     while (store_idx != -1) {
         // End once we've reached the top of the LSQ
@@ -518,18 +489,14 @@ LSQUnit<Impl>::read(MemReqPtr &req, T &data, int load_idx)
 
         // If the store's data has all of the data needed, we can forward.
         if (store_has_lower_limit && store_has_upper_limit) {
-
+            // Get shift amount for offset into the store's data.
             int shift_amt = req->vaddr & (store_size - 1);
-            // Assumes byte addressing
+            // @todo: Magic number, assumes byte addressing
             shift_amt = shift_amt << 3;
 
             // Cast this to type T?
             data = storeQueue[store_idx].data >> shift_amt;
 
-            req->cmd = Read;
-            assert(!req->completionEvent);
-            req->completionEvent = NULL;
-            req->time = curTick;
             assert(!req->data);
             req->data = new uint8_t[64];
 
@@ -579,7 +546,6 @@ LSQUnit<Impl>::read(MemReqPtr &req, T &data, int load_idx)
 
             // Do not generate a writeback event as this instruction is not
             // complete.
-
             DPRINTF(LSQUnit, "Load-store forwarding mis-match. "
                     "Store idx %i to load addr %#x\n",
                     store_idx, req->vaddr);
@@ -588,16 +554,13 @@ LSQUnit<Impl>::read(MemReqPtr &req, T &data, int load_idx)
         }
     }
 
-
     // If there's no forwarding case, then go access memory
     DynInstPtr inst = loadQueue[load_idx];
 
-    DPRINTF(LSQUnit, "Doing functional access for inst PC %#x\n",
-            loadQueue[load_idx]->readPC());
+    DPRINTF(LSQUnit, "Doing functional access for inst [sn:%lli] PC %#x\n",
+            loadQueue[load_idx]->seqNum, loadQueue[load_idx]->readPC());
+
     assert(!req->data);
-    req->cmd = Read;
-    req->completionEvent = NULL;
-    req->time = curTick;
     req->data = new uint8_t[64];
     Fault fault = cpu->read(req, data);
     memcpy(req->data, &data, sizeof(T));
@@ -611,20 +574,19 @@ LSQUnit<Impl>::read(MemReqPtr &req, T &data, int load_idx)
             if (isLoadBlocked && blockedLoadSeqNum < inst->seqNum)
                 return NoFault;
 
+            // Record that the load was blocked due to memory.  This
+            // load will squash all instructions after it, be
+            // refetched, and re-executed.
             isLoadBlocked = true;
             loadBlockedHandled = false;
             blockedLoadSeqNum = inst->seqNum;
             // No fault occurred, even though the interface is blocked.
             return NoFault;
         }
+
         DPRINTF(LSQUnit, "Doing timing access for inst PC %#x\n",
                 loadQueue[load_idx]->readPC());
-/*
-        Addr debug_addr = ULL(0xfffffc0000be81a8);
-        if (req->vaddr == debug_addr) {
-            debug_break();
-        }
-*/
+
         assert(!req->completionEvent);
         req->completionEvent =
             new typename IEW::LdWritebackEvent(loadQueue[load_idx], iewStage);
@@ -632,75 +594,16 @@ LSQUnit<Impl>::read(MemReqPtr &req, T &data, int load_idx)
 
         assert(dcacheInterface->doEvents());
 
-        // Ugly hack to get an event scheduled *only* if the access is
-        // a miss.  We really should add first-class support for this
-        // at some point.
         if (result != MA_HIT) {
             DPRINTF(LSQUnit, "LSQUnit: D-cache miss!\n");
             DPRINTF(Activity, "Activity: ld accessing mem miss [sn:%lli]\n",
                     inst->seqNum);
-
-            lastDcacheStall = curTick;
-
-//            _status = DcacheMissStall;
-
         } else {
-            DPRINTF(Activity, "Activity: ld accessing mem hit [sn:%lli]\n",
-                    inst->seqNum);
-
             DPRINTF(LSQUnit, "LSQUnit: D-cache hit!\n");
-        }
-    }
-#if 0
-    // if we have a cache, do cache access too
-    if (dcacheInterface) {
-        if (dcacheInterface->isBlocked()) {
-            isLoadBlocked = true;
-            // No fault occurred, even though the interface is blocked.
-            return NoFault;
-        }
-
-        DPRINTF(LSQUnit, "LSQUnit: D-cache: PC:%#x reading from paddr:%#x "
-                "vaddr:%#x flags:%i\n",
-                inst->readPC(), req->paddr, req->vaddr, req->flags);
-
-        // Setup MemReq pointer
-        req->cmd = Read;
-        req->completionEvent = NULL;
-        req->time = curTick;
-        assert(!req->data);
-        req->data = new uint8_t[64];
-
-        assert(!req->completionEvent);
-        req->completionEvent =
-            new typename IEW::LdWritebackEvent(loadQueue[load_idx], iewStage);
-
-        // Do Cache Access
-        MemAccessResult result = dcacheInterface->access(req);
-
-        // Ugly hack to get an event scheduled *only* if the access is
-        // a miss.  We really should add first-class support for this
-        // at some point.
-        // @todo: Probably should support having no events
-        if (result != MA_HIT) {
-            DPRINTF(LSQUnit, "LSQUnit: D-cache miss!\n");
-            DPRINTF(Activity, "Activity: ld accessing mem miss [sn:%lli]\n",
-                    inst->seqNum);
-
-            lastDcacheStall = curTick;
-
-            _status = DcacheMissStall;
-
-        } else {
             DPRINTF(Activity, "Activity: ld accessing mem hit [sn:%lli]\n",
                     inst->seqNum);
-
-            DPRINTF(LSQUnit, "LSQUnit: D-cache hit!\n");
         }
-    } else {
-        fatal("Must use D-cache with new memory system");
     }
-#endif
 
     return fault;
 }
@@ -716,24 +619,11 @@ LSQUnit<Impl>::write(MemReqPtr &req, T &data, int store_idx)
             " | storeHead:%i [sn:%i]\n",
             store_idx, req->paddr, data, storeHead,
             storeQueue[store_idx].inst->seqNum);
-/*
-    if (req->flags & LOCKED) {
-        if (req->flags & UNCACHEABLE) {
-            req->result = 2;
-        } else {
-            req->result = 1;
-        }
-    }
-*/
+
     storeQueue[store_idx].req = req;
     storeQueue[store_idx].size = sizeof(T);
     storeQueue[store_idx].data = data;
-/*
-    Addr debug_addr = ULL(0xfffffc0000be81a8);
-    if (req->vaddr == debug_addr) {
-        debug_break();
-    }
-*/
+
     // This function only writes the data to the store queue, so no fault
     // can happen here.
     return NoFault;
diff --git a/cpu/o3/lsq_unit_impl.hh b/cpu/o3/lsq_unit_impl.hh
index dca808ac9..f0b4405ed 100644
--- a/cpu/o3/lsq_unit_impl.hh
+++ b/cpu/o3/lsq_unit_impl.hh
@@ -35,8 +35,8 @@ LSQUnit<Impl>::StoreCompletionEvent::StoreCompletionEvent(int store_idx,
                                                           Event *wb_event,
                                                           LSQUnit<Impl> *lsq_ptr)
     : Event(&mainEventQueue),
-      storeIdx(store_idx),
       wbEvent(wb_event),
+      storeIdx(store_idx),
       lsqPtr(lsq_ptr)
 {
     this->setFlags(Event::AutoDelete);
@@ -86,15 +86,13 @@ LSQUnit<Impl>::init(Params *params, unsigned maxLQEntries,
 
     lsqID = id;
 
-    LQEntries = maxLQEntries;
-    SQEntries = maxSQEntries;
+    // Add 1 for the sentinel entry (they are circular queues).
+    LQEntries = maxLQEntries + 1;
+    SQEntries = maxSQEntries + 1;
 
     loadQueue.resize(LQEntries);
     storeQueue.resize(SQEntries);
 
-
-    // May want to initialize these entries to NULL
-
     loadHead = loadTail = 0;
 
     storeHead = storeWBIdx = storeTail = 0;
@@ -104,7 +102,7 @@ LSQUnit<Impl>::init(Params *params, unsigned maxLQEntries,
 
     dcacheInterface = params->dcacheInterface;
 
-    loadFaultInst = storeFaultInst = memDepViolator = NULL;
+    memDepViolator = NULL;
 
     blockedLoadSeqNum = 0;
 }
@@ -152,6 +150,8 @@ LSQUnit<Impl>::switchOut()
     for (int i = 0; i < loadQueue.size(); ++i)
         loadQueue[i] = NULL;
 
+    assert(storesToWB == 0);
+
     while (storesToWB > 0 &&
            storeWBIdx != storeTail &&
            storeQueue[storeWBIdx].inst &&
@@ -218,7 +218,7 @@ LSQUnit<Impl>::takeOverFrom()
 
     usedPorts = 0;
 
-    loadFaultInst = storeFaultInst = memDepViolator = NULL;
+    memDepViolator = NULL;
 
     blockedLoadSeqNum = 0;
 
@@ -231,16 +231,17 @@ template<class Impl>
 void
 LSQUnit<Impl>::resizeLQ(unsigned size)
 {
-    assert( size >= LQEntries);
+    unsigned size_plus_sentinel = size + 1;
+    assert(size_plus_sentinel >= LQEntries);
 
-    if (size > LQEntries) {
-        while (size > loadQueue.size()) {
+    if (size_plus_sentinel > LQEntries) {
+        while (size_plus_sentinel > loadQueue.size()) {
             DynInstPtr dummy;
             loadQueue.push_back(dummy);
             LQEntries++;
         }
     } else {
-        LQEntries = size;
+        LQEntries = size_plus_sentinel;
     }
 
 }
@@ -249,14 +250,15 @@ template<class Impl>
 void
 LSQUnit<Impl>::resizeSQ(unsigned size)
 {
-    if (size > SQEntries) {
-        while (size > storeQueue.size()) {
+    unsigned size_plus_sentinel = size + 1;
+    if (size_plus_sentinel > SQEntries) {
+        while (size_plus_sentinel > storeQueue.size()) {
             SQEntry dummy;
             storeQueue.push_back(dummy);
             SQEntries++;
         }
     } else {
-        SQEntries = size;
+        SQEntries = size_plus_sentinel;
     }
 }
 
@@ -264,10 +266,8 @@ template <class Impl>
 void
 LSQUnit<Impl>::insert(DynInstPtr &inst)
 {
-    // Make sure we really have a memory reference.
     assert(inst->isMemRef());
 
-    // Make sure it's one of the two classes of memory references.
     assert(inst->isLoad() || inst->isStore());
 
     if (inst->isLoad()) {
@@ -283,7 +283,8 @@ template <class Impl>
 void
 LSQUnit<Impl>::insertLoad(DynInstPtr &load_inst)
 {
-    assert((loadTail + 1) % LQEntries != loadHead && loads < LQEntries);
+    assert((loadTail + 1) % LQEntries != loadHead);
+    assert(loads < LQEntries);
 
     DPRINTF(LSQUnit, "Inserting load PC %#x, idx:%i [sn:%lli]\n",
             load_inst->readPC(), loadTail, load_inst->seqNum);
@@ -322,7 +323,6 @@ LSQUnit<Impl>::insertStore(DynInstPtr &store_inst)
     incrStIdx(storeTail);
 
     ++stores;
-
 }
 
 template <class Impl>
@@ -370,39 +370,6 @@ LSQUnit<Impl>::numLoadsReady()
     return retval;
 }
 
-#if 0
-template <class Impl>
-Fault
-LSQUnit<Impl>::executeLoad()
-{
-    Fault load_fault = NoFault;
-    DynInstPtr load_inst;
-
-    assert(readyLoads.size() != 0);
-
-    // Execute a ready load.
-    LdMapIt ready_it = readyLoads.begin();
-
-    load_inst = (*ready_it).second;
-
-    // Execute the instruction, which is held in the data portion of the
-    // iterator.
-    load_fault = load_inst->execute();
-
-    // If it executed successfully, then switch it over to the executed
-    // loads list.
-    if (load_fault == NoFault) {
-        executedLoads[load_inst->seqNum] = load_inst;
-
-        readyLoads.erase(ready_it);
-    } else {
-        loadFaultInst = load_inst;
-    }
-
-    return load_fault;
-}
-#endif
-
 template <class Impl>
 Fault
 LSQUnit<Impl>::executeLoad(DynInstPtr &inst)
@@ -413,33 +380,14 @@ LSQUnit<Impl>::executeLoad(DynInstPtr &inst)
     DPRINTF(LSQUnit, "Executing load PC %#x, [sn:%lli]\n",
             inst->readPC(),inst->seqNum);
 
-    // Make sure it's really in the list.
-    // Normally it should always be in the list.  However,
-    /* due to a syscall it may not be the list.
-#ifdef DEBUG
-    int i = loadHead;
-    while (1) {
-        if (i == loadTail && !find(inst)) {
-            assert(0 && "Load not in the queue!");
-        } else if (loadQueue[i] == inst) {
-            break;
-        }
-
-        i = i + 1;
-        if (i >= LQEntries) {
-            i = 0;
-        }
-    }
-#endif // DEBUG*/
-
 //    load_fault = inst->initiateAcc();
     load_fault = inst->execute();
 
     // If the instruction faulted, then we need to send it along to commit
     // without the instruction completing.
     if (load_fault != NoFault) {
-        // Maybe just set it as can commit here, although that might cause
-        // some other problems with sending traps to the ROB too quickly.
+        // Send this instruction to commit, and make sure the IEW stage
+        // realizes there is activity.
         iewStage->instToCommit(inst);
         iewStage->activityThisCycle();
     }
@@ -447,20 +395,6 @@ LSQUnit<Impl>::executeLoad(DynInstPtr &inst)
     return load_fault;
 }
 
-template <class Impl>
-Fault
-LSQUnit<Impl>::executeLoad(int lq_idx)
-{
-    // Very hackish.  Not sure the best way to check that this
-    // instruction is at the head of the ROB.  I should have some sort
-    // of extra information here so that I'm not overloading the
-    // canCommit signal for 15 different things.
-    loadQueue[lq_idx]->setCanCommit();
-    Fault ret_fault = executeLoad(loadQueue[lq_idx]);
-    loadQueue[lq_idx]->clearCanCommit();
-    return ret_fault;
-}
-
 template <class Impl>
 Fault
 LSQUnit<Impl>::executeStore(DynInstPtr &store_inst)
@@ -481,11 +415,7 @@ LSQUnit<Impl>::executeStore(DynInstPtr &store_inst)
     Fault store_fault = store_inst->initiateAcc();
 //    Fault store_fault = store_inst->execute();
 
-    // Store size should now be available.  Use it to get proper offset for
-    // addr comparisons.
-    int size = storeQueue[store_idx].size;
-
-    if (size == 0) {
+    if (storeQueue[store_idx].size == 0) {
         DPRINTF(LSQUnit,"Fault on Store PC %#x, [sn:%lli],Size = 0\n",
                 store_inst->readPC(),store_inst->seqNum);
 
@@ -494,30 +424,25 @@ LSQUnit<Impl>::executeStore(DynInstPtr &store_inst)
 
     assert(store_fault == NoFault);
 
-    if (!storeFaultInst) {
-        if (store_fault != NoFault) {
-            panic("Fault in a store instruction!");
-            storeFaultInst = store_inst;
-        } else if (store_inst->isNonSpeculative()) {
-            // Nonspeculative accesses (namely store conditionals)
-            // need to set themselves as able to writeback if we
-            // haven't had a fault by here.
-            storeQueue[store_idx].canWB = true;
+    if (store_inst->isNonSpeculative()) {
+        // Nonspeculative accesses (namely store conditionals)
+        // need to set themselves as able to writeback if we
+        // haven't had a fault by here.
+        storeQueue[store_idx].canWB = true;
 
-            ++storesToWB;
-        }
+        ++storesToWB;
     }
 
     if (!memDepViolator) {
         while (load_idx != loadTail) {
-            // Actually should only check loads that have actually executed
-            // Might be safe because effAddr is set to InvalAddr when the
-            // dyn inst is created.
-
-            // Must actually check all addrs in the proper size range
-            // Which is more correct than needs to be.  What if for now we just
-            // assume all loads are quad-word loads, and do the addr based
-            // on that.
+            // Really only need to check loads that have actually executed.
+            // It's safe to check all loads because effAddr is set to
+            // InvalAddr when the dyn inst is created.
+
+            // @todo: For now this is extra conservative, detecting a
+            // violation if the addresses match once the low 8 bits are
+            // shifted off (coarser than assuming quad word accesses).
+
             // @todo: Fix this, magic number being used here
             if ((loadQueue[load_idx]->effAddr >> 8) ==
                 (store_inst->effAddr >> 8)) {
@@ -555,32 +480,6 @@ LSQUnit<Impl>::commitLoad()
     --loads;
 }
 
-template <class Impl>
-void
-LSQUnit<Impl>::commitLoad(InstSeqNum &inst)
-{
-    // Hopefully I don't use this function too much
-    panic("Don't use this function!");
-
-    int i = loadHead;
-    while (1) {
-        if (i == loadTail) {
-            assert(0 && "Load not in the queue!");
-        } else if (loadQueue[i]->seqNum == inst) {
-            break;
-        }
-
-        ++i;
-        if (i >= LQEntries) {
-            i = 0;
-        }
-    }
-
-    loadQueue[i]->removeInLSQ();
-    loadQueue[i] = NULL;
-    --loads;
-}
-
 template <class Impl>
 void
 LSQUnit<Impl>::commitLoads(InstSeqNum &youngest_inst)
@@ -602,6 +501,8 @@ LSQUnit<Impl>::commitStores(InstSeqNum &youngest_inst)
 
     while (store_idx != storeTail) {
         assert(storeQueue[store_idx].inst);
+        // Mark any stores that are now committed and have not yet
+        // been marked as able to write back.
         if (!storeQueue[store_idx].canWB) {
             if (storeQueue[store_idx].inst->seqNum > youngest_inst) {
                 break;
@@ -613,7 +514,6 @@ LSQUnit<Impl>::commitStores(InstSeqNum &youngest_inst)
 
             storeQueue[store_idx].canWB = true;
 
-//            --stores;
             ++storesToWB;
         }
 
@@ -631,6 +531,8 @@ LSQUnit<Impl>::writebackStores()
            storeQueue[storeWBIdx].canWB &&
            usedPorts < cachePorts) {
 
+        // Store didn't write any data so no need to write it back to
+        // memory.
         if (storeQueue[storeWBIdx].size == 0) {
             completeStore(storeWBIdx);
 
@@ -659,7 +561,6 @@ LSQUnit<Impl>::writebackStores()
         MemReqPtr req = storeQueue[storeWBIdx].req;
         storeQueue[storeWBIdx].committed = true;
 
-//	Fault fault = cpu->translateDataWriteReq(req);
         req->cmd = Write;
         req->completionEvent = NULL;
         req->time = curTick;
@@ -689,6 +590,12 @@ LSQUnit<Impl>::writebackStores()
           default:
             panic("Unexpected store size!\n");
         }
+
+        // Stores other than store conditionals are completed at this
+        // time.  Mark them as completed and, if we have a checker,
+        // tell it that the instruction is completed.
+        // @todo: Figure out what time I can say stores are complete in
+        // the timing memory.
         if (!(req->flags & LOCKED)) {
             storeQueue[storeWBIdx].inst->setCompleted();
             if (cpu->checker) {
@@ -714,57 +621,35 @@ LSQUnit<Impl>::writebackStores()
                 iewStage->replayMemInst(loadQueue[stallingLoadIdx]);
             }
 
-            if (result != MA_HIT && dcacheInterface->doEvents()) {
-                typename IEW::LdWritebackEvent *wb = NULL;
-                if (req->flags & LOCKED) {
-                    // Stx_C should not generate a system port transaction,
-                    // but that might be hard to accomplish.
-                    wb = new typename
-                        IEW::LdWritebackEvent(storeQueue[storeWBIdx].inst,
+            typename IEW::LdWritebackEvent *wb = NULL;
+            if (req->flags & LOCKED) {
+                // Stx_C should not generate a system port transaction
+                // if it misses in the cache, but that might be hard
+                // to accomplish without explicit cache support.
+                wb = new typename
+                    IEW::LdWritebackEvent(storeQueue[storeWBIdx].inst,
                                               iewStage);
-                    store_event->wbEvent = wb;
-                }
+                store_event->wbEvent = wb;
+            }
 
-                DPRINTF(LSQUnit,"D-Cache Write Miss!\n");
+            if (result != MA_HIT && dcacheInterface->doEvents()) {
+                DPRINTF(LSQUnit,"D-Cache Write Miss on idx:%i!\n",
+                        storeWBIdx);
 
                 DPRINTF(Activity, "Active st accessing mem miss [sn:%lli]\n",
                         storeQueue[storeWBIdx].inst->seqNum);
 
-                lastDcacheStall = curTick;
-
-//                _status = DcacheMissStall;
-
                 //mshrSeqNums.push_back(storeQueue[storeWBIdx].inst->seqNum);
 
                 //DPRINTF(LSQUnit, "Added MSHR. count = %i\n",mshrSeqNums.size());
 
-                // Increment stat here or something
+                // @todo: Increment stat here.
             } else {
                 DPRINTF(LSQUnit,"D-Cache: Write Hit on idx:%i !\n",
                         storeWBIdx);
 
                 DPRINTF(Activity, "Active st accessing mem hit [sn:%lli]\n",
                         storeQueue[storeWBIdx].inst->seqNum);
-
-
-                if (req->flags & LOCKED) {
-                    // Stx_C does not generate a system port transaction.
-/*
-                    if (req->flags & UNCACHEABLE) {
-                        req->result = 2;
-                    } else {
-                        if (cpu->lockFlag && cpu->lockAddr == req->paddr) {
-                            req->result=1;
-                        } else {
-                            req->result = 0;
-                        }
-                    }
-*/
-                    typename IEW::LdWritebackEvent *wb =
-                        new typename IEW::LdWritebackEvent(storeQueue[storeWBIdx].inst,
-                                                           iewStage);
-                    store_event->wbEvent = wb;
-                }
             }
 
             incrStIdx(storeWBIdx);
@@ -798,14 +683,12 @@ void
 LSQUnit<Impl>::squash(const InstSeqNum &squashed_num)
 {
     DPRINTF(LSQUnit, "Squashing until [sn:%lli]!"
-            "(Loads:%i Stores:%i)\n",squashed_num,loads,stores);
+            "(Loads:%i Stores:%i)\n", squashed_num, loads, stores);
 
     int load_idx = loadTail;
     decrLdIdx(load_idx);
 
     while (loads != 0 && loadQueue[load_idx]->seqNum > squashed_num) {
-
-        // Clear the smart pointer to make sure it is decremented.
         DPRINTF(LSQUnit,"Load Instruction PC %#x squashed, "
                 "[sn:%lli]\n",
                 loadQueue[load_idx]->readPC(),
@@ -817,6 +700,7 @@ LSQUnit<Impl>::squash(const InstSeqNum &squashed_num)
             stallingLoadIdx = 0;
         }
 
+        // Clear the smart pointer to make sure it is decremented.
         loadQueue[load_idx]->squashed = true;
         loadQueue[load_idx] = NULL;
         --loads;
@@ -840,19 +724,18 @@ LSQUnit<Impl>::squash(const InstSeqNum &squashed_num)
 
     while (stores != 0 &&
            storeQueue[store_idx].inst->seqNum > squashed_num) {
-
+        // Stores marked canWB have already committed, so stop squashing here.
         if (storeQueue[store_idx].canWB) {
             break;
         }
 
-        // Clear the smart pointer to make sure it is decremented.
         DPRINTF(LSQUnit,"Store Instruction PC %#x squashed, "
                 "idx:%i [sn:%lli]\n",
                 storeQueue[store_idx].inst->readPC(),
                 store_idx, storeQueue[store_idx].inst->seqNum);
 
-        // I don't think this can happen.  It should have been cleared by the
-        // stalling load.
+        // I don't think this can happen.  It should have been cleared
+        // by the stalling load.
         if (isStalled() &&
             storeQueue[store_idx].inst->seqNum == stallingStoreIsn) {
             panic("Is stalled should have been cleared by stalling load!\n");
@@ -860,13 +743,17 @@ LSQUnit<Impl>::squash(const InstSeqNum &squashed_num)
             stallingStoreIsn = 0;
         }
 
+        // Clear the smart pointer to make sure it is decremented.
         storeQueue[store_idx].inst->squashed = true;
         storeQueue[store_idx].inst = NULL;
         storeQueue[store_idx].canWB = 0;
 
         if (storeQueue[store_idx].req) {
+            // There should not be a completion event if the store has
+            // not yet committed.
             assert(!storeQueue[store_idx].req->completionEvent);
         }
+
         storeQueue[store_idx].req = NULL;
         --stores;
 
@@ -877,36 +764,6 @@ LSQUnit<Impl>::squash(const InstSeqNum &squashed_num)
     }
 }
 
-template <class Impl>
-void
-LSQUnit<Impl>::dumpInsts()
-{
-    cprintf("Load store queue: Dumping instructions.\n");
-    cprintf("Load queue size: %i\n", loads);
-    cprintf("Load queue: ");
-
-    int load_idx = loadHead;
-
-    while (load_idx != loadTail && loadQueue[load_idx]) {
-        cprintf("%#x ", loadQueue[load_idx]->readPC());
-
-        incrLdIdx(load_idx);
-    }
-
-    cprintf("Store queue size: %i\n", stores);
-    cprintf("Store queue: ");
-
-    int store_idx = storeHead;
-
-    while (store_idx != storeTail && storeQueue[store_idx].inst) {
-        cprintf("%#x ", storeQueue[store_idx].inst->readPC());
-
-        incrStIdx(store_idx);
-    }
-
-    cprintf("\n");
-}
-
 template <class Impl>
 void
 LSQUnit<Impl>::completeStore(int store_idx)
@@ -930,7 +787,9 @@ LSQUnit<Impl>::completeStore(int store_idx)
         iewStage->updateLSQNextCycle = true;
     }
 
-    DPRINTF(LSQUnit, "Store head idx:%i\n", storeHead);
+    DPRINTF(LSQUnit, "Completing store [sn:%lli], idx:%i, store head "
+            "idx:%i\n",
+            storeQueue[store_idx].inst->seqNum, store_idx, storeHead);
 
     if (isStalled() &&
         storeQueue[store_idx].inst->seqNum == stallingStoreIsn) {
@@ -943,6 +802,10 @@ LSQUnit<Impl>::completeStore(int store_idx)
     }
 
     storeQueue[store_idx].inst->setCompleted();
+
+    // Tell the checker we've completed this instruction.  Some stores
+    // may get reported twice to the checker, but the checker can
+    // handle that case.
     if (cpu->checker) {
         cpu->checker->tick(storeQueue[store_idx].inst);
     }
@@ -979,3 +842,33 @@ LSQUnit<Impl>::decrLdIdx(int &load_idx)
     if (--load_idx < 0)
         load_idx += LQEntries;
 }
+
+template <class Impl>
+void
+LSQUnit<Impl>::dumpInsts()
+{
+    // Debugging aid: print the PC of each queued load, then each queued
+    // store, walking from the head until the tail or an empty entry.
+    cprintf("Load store queue: Dumping instructions.\n");
+    cprintf("Load queue size: %i\n", loads);
+    cprintf("Load queue: ");
+
+    int load_idx = loadHead;
+    while (load_idx != loadTail && loadQueue[load_idx]) {
+        cprintf("%#x ", loadQueue[load_idx]->readPC());
+        incrLdIdx(load_idx);
+    }
+
+    // End the load list so the store summary starts on its own line.
+    cprintf("\n");
+    cprintf("Store queue size: %i\n", stores);
+    cprintf("Store queue: ");
+
+    int store_idx = storeHead;
+    while (store_idx != storeTail && storeQueue[store_idx].inst) {
+        cprintf("%#x ", storeQueue[store_idx].inst->readPC());
+        incrStIdx(store_idx);
+    }
+
+    cprintf("\n");
+}
diff --git a/cpu/o3/mem_dep_unit.hh b/cpu/o3/mem_dep_unit.hh
index 141e0fdc4..acbe08ec2 100644
--- a/cpu/o3/mem_dep_unit.hh
+++ b/cpu/o3/mem_dep_unit.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -201,13 +201,6 @@ class MemDepUnit {
         static int memdep_erase;
     };
 
-    struct ltMemDepEntry {
-        bool operator() (const MemDepEntryPtr &lhs, const MemDepEntryPtr &rhs)
-        {
-            return lhs->inst->seqNum < rhs->inst->seqNum;
-        }
-    };
-
     /** Finds the memory dependence entry in the hash map. */
     inline MemDepEntryPtr &findInHash(const DynInstPtr &inst);
 
diff --git a/cpu/o3/mem_dep_unit_impl.hh b/cpu/o3/mem_dep_unit_impl.hh
index 05a33685d..8b195baab 100644
--- a/cpu/o3/mem_dep_unit_impl.hh
+++ b/cpu/o3/mem_dep_unit_impl.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -141,12 +141,12 @@ MemDepUnit<MemDepPred, Impl>::insert(DynInstPtr &inst)
         std::pair<InstSeqNum, MemDepEntryPtr>(inst->seqNum, inst_entry));
     MemDepEntry::memdep_insert++;
 
-    // Add the instruction to the instruction list.
     instList[tid].push_back(inst);
 
     inst_entry->listIt = --(instList[tid].end());
 
-    // Check the dependence predictor for any producing stores.
+    // Check any barriers and the dependence predictor for any
+    // producing stores.
     InstSeqNum producing_store;
     if (inst->isLoad() && loadBarrier) {
         producing_store = loadBarrierSN;
@@ -181,7 +181,7 @@ MemDepUnit<MemDepPred, Impl>::insert(DynInstPtr &inst)
             moveToReady(inst_entry);
         }
     } else {
-        // Otherwise make the instruction dependent on the store.
+        // Otherwise make the instruction dependent on the store/barrier.
         DPRINTF(MemDepUnit, "Adding to dependency list; "
                 "inst PC %#x is dependent on [sn:%lli].\n",
                 inst->readPC(), producing_store);
@@ -193,8 +193,6 @@ MemDepUnit<MemDepPred, Impl>::insert(DynInstPtr &inst)
         // Add this instruction to the list of dependents.
         store_entry->dependInsts.push_back(inst_entry);
 
-//        inst_entry->producingStore = store_entry;
-
         if (inst->isLoad()) {
             ++conflictingLoads;
         } else {
@@ -370,8 +368,6 @@ MemDepUnit<MemDepPred, Impl>::completed(DynInstPtr &inst)
 
     instList[tid].erase((*hash_it).second->listIt);
 
-//    (*hash_it).second->inst = NULL;
-
     (*hash_it).second = NULL;
 
     memDepHash.erase(hash_it);
@@ -416,7 +412,6 @@ MemDepUnit<MemDepPred, Impl>::wakeDependents(DynInstPtr &inst)
 
         if (!woken_inst->inst) {
             // Potentially removed mem dep entries could be on this list
-//            inst_entry->dependInsts[i] = NULL;
             continue;
         }
 
@@ -429,7 +424,6 @@ MemDepUnit<MemDepPred, Impl>::wakeDependents(DynInstPtr &inst)
         } else {
             woken_inst->memDepReady = true;
         }
-//        inst_entry->dependInsts[i] = NULL;
     }
 
     inst_entry->dependInsts.clear();
@@ -468,13 +462,7 @@ MemDepUnit<MemDepPred, Impl>::squash(const InstSeqNum &squashed_num,
         assert(hash_it != memDepHash.end());
 
         (*hash_it).second->squashed = true;
-/*
-        for (int i = 0; i < (*hash_it).second->dependInsts.size(); ++i) {
-            (*hash_it).second->dependInsts[i] = NULL;
-        }
 
-        (*hash_it).second->inst = NULL;
-*/
         (*hash_it).second = NULL;
 
         memDepHash.erase(hash_it);
diff --git a/cpu/o3/rename.hh b/cpu/o3/rename.hh
index dd2cb0c18..3f1a27bb5 100644
--- a/cpu/o3/rename.hh
+++ b/cpu/o3/rename.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -35,15 +35,16 @@
 #include "base/timebuf.hh"
 
 /**
- * DefaultRename handles both single threaded and SMT rename. Its width is
- * specified by the parameters; each cycle it tries to rename that many
- * instructions. It holds onto the rename history of all instructions with
- * destination registers, storing the arch. register, the new physical
- * register, and the old physical register, to allow for undoing of mappings
- * if squashing happens, or freeing up registers upon commit. Rename handles
- * blocking if the ROB, IQ, or LSQ is going to be full. Rename also handles
- * barriers, and does so by stalling on the instruction until the ROB is
- * empty and there are no instructions in flight to the ROB.
+ * DefaultRename handles both single threaded and SMT rename. Its
+ * width is specified by the parameters; each cycle it tries to rename
+ * that many instructions. It holds onto the rename history of all
+ * instructions with destination registers, storing the
+ * arch. register, the new physical register, and the old physical
+ * register, to allow for undoing of mappings if squashing happens, or
+ * freeing up registers upon commit. Rename handles blocking if the
+ * ROB, IQ, or LSQ is going to be full. Rename also handles barriers,
+ * and does so by stalling on the instruction until the ROB is empty
+ * and there are no instructions in flight to the ROB.
  */
 template<class Impl>
 class DefaultRename
@@ -68,14 +69,15 @@ class DefaultRename
     // Typedefs from the ISA.
     typedef TheISA::RegIndex RegIndex;
 
-    // A deque is used to queue the instructions.  Barrier insts must be
-    // added to the front of the deque, which is the only reason for using
-    // a deque instead of a queue. (Most other stages use a queue)
+    // A list is used to queue the instructions.  Barrier insts must
+    // be added to the front of the list, which is the only reason for
+    // using a list instead of a queue. (Most other stages use a
+    // queue)
     typedef std::list<DynInstPtr> InstQueue;
 
   public:
-    /** Overall rename status. Used to determine if the CPU can deschedule
-     * itself due to a lack of activity.
+    /** Overall rename status. Used to determine if the CPU can
+     * deschedule itself due to a lack of activity.
      */
     enum RenameStatus {
         Active,
diff --git a/cpu/o3/rename_impl.hh b/cpu/o3/rename_impl.hh
index db4bb2ffe..081581c92 100644
--- a/cpu/o3/rename_impl.hh
+++ b/cpu/o3/rename_impl.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -209,17 +209,13 @@ template <class Impl>
 void
 DefaultRename<Impl>::initStage()
 {
+    // Grab the number of free entries directly from the stages.
     for (int tid=0; tid < numThreads; tid++) {
         freeEntries[tid].iqEntries = iew_ptr->instQueue.numFreeEntries(tid);
         freeEntries[tid].lsqEntries = iew_ptr->ldstQueue.numFreeEntries(tid);
         freeEntries[tid].robEntries = commit_ptr->numROBFreeEntries(tid);
         emptyROB[tid] = true;
     }
-
-    // Clear these pointers so they are not accidentally used in
-    // non-initialization code.
-//    iew_ptr = NULL;
-//    commit_ptr = NULL;
 }
 
 template<class Impl>
@@ -299,6 +295,7 @@ DefaultRename<Impl>::takeOverFrom()
     _status = Inactive;
     initStage();
 
+    // Reset all state prior to taking over from the other CPU.
     for (int i=0; i< numThreads; i++) {
         renameStatus[i] = Idle;
 
@@ -326,7 +323,7 @@ DefaultRename<Impl>::squash(unsigned tid)
     if (renameStatus[tid] == Blocked ||
         renameStatus[tid] == Unblocking ||
         renameStatus[tid] == SerializeStall) {
-#if !FULL_SYSTEM
+#if 0
         // In syscall emulation, we can have both a block and a squash due
         // to a syscall in the same cycle.  This would cause both signals to
         // be high.  This shouldn't happen in full system.
@@ -344,7 +341,7 @@ DefaultRename<Impl>::squash(unsigned tid)
     // Set the status to Squashing.
     renameStatus[tid] = Squashing;
 
-    // Clear the skid buffer in case it has any data in it.
+    // Squash any instructions from decode.
     unsigned squashCount = 0;
 
     for (int i=0; i<fromDecode->size; i++) {
@@ -367,9 +364,6 @@ template <class Impl>
 void
 DefaultRename<Impl>::tick()
 {
-    // Rename will need to try to rename as many instructions as it
-    // has bandwidth, unless it is blocked.
-
     wroteToTimeBuffer = false;
 
     blockThisCycle = false;
@@ -454,8 +448,6 @@ DefaultRename<Impl>::rename(bool &status_change, unsigned tid)
     } else if (renameStatus[tid] == Unblocking) {
         renameInsts(tid);
 
-//        ++renameUnblockCycles;
-
         if (validInsts()) {
             // Add the current inputs to the skid buffer so they can be
             // reprocessed when this stage unblocks.
@@ -575,7 +567,6 @@ DefaultRename<Impl>::renameInsts(unsigned tid)
 
         insts_to_rename.pop_front();
 
-        //Use skidBuffer with oldest instructions
         if (renameStatus[tid] == Unblocking) {
             DPRINTF(Rename,"[tid:%u]: Removing [sn:%lli] PC:%#x from rename "
                     "skidBuffer\n",
@@ -711,10 +702,10 @@ void
 DefaultRename<Impl>::sortInsts()
 {
     int insts_from_decode = fromDecode->size;
-
+#ifdef DEBUG
     for (int i=0; i < numThreads; i++)
         assert(insts[i].empty());
-
+#endif
     for (int i = 0; i < insts_from_decode; ++i) {
         DynInstPtr inst = fromDecode->insts[i];
         insts[inst->threadNumber].push_back(inst);
@@ -794,8 +785,8 @@ DefaultRename<Impl>::block(unsigned tid)
             wroteToTimeBuffer = true;
         }
 
-        // Rename can not go from SerializeStall to Blocked, otherwise it would
-        // not know to complete the serialize stall.
+        // Rename can not go from SerializeStall to Blocked, otherwise
+        // it would not know to complete the serialize stall.
         if (renameStatus[tid] != SerializeStall) {
             // Set status to Blocked.
             renameStatus[tid] = Blocked;
@@ -835,15 +826,11 @@ DefaultRename<Impl>::doSquash(unsigned tid)
 
     InstSeqNum squashed_seq_num = fromCommit->commitInfo[tid].doneSeqNum;
 
-//#if FULL_SYSTEM
-//    assert(!historyBuffer[tid].empty());
-//#else
     // After a syscall squashes everything, the history buffer may be empty
     // but the ROB may still be squashing instructions.
     if (historyBuffer[tid].empty()) {
         return;
     }
-//#endif // FULL_SYSTEM
 
     // Go through the most recent instructions, undoing the mappings
     // they did and freeing up the registers.
@@ -896,8 +883,8 @@ DefaultRename<Impl>::removeFromHistory(InstSeqNum inst_seq_num, unsigned tid)
            hb_it != historyBuffer[tid].end() &&
            (*hb_it).instSeqNum <= inst_seq_num) {
 
-        DPRINTF(Rename, "[tid:%u]: Freeing up older rename of reg %i, sequence"
-                " number %i.\n",
+        DPRINTF(Rename, "[tid:%u]: Freeing up older rename of reg %i, "
+                "[sn:%lli].\n",
                 tid, (*hb_it).prevPhysReg, (*hb_it).instSeqNum);
 
         freeList->addReg((*hb_it).prevPhysReg);
diff --git a/cpu/o3/rename_map.cc b/cpu/o3/rename_map.cc
index 8ba632e65..fc59058a1 100644
--- a/cpu/o3/rename_map.cc
+++ b/cpu/o3/rename_map.cc
@@ -32,18 +32,12 @@
 
 using namespace std;
 
-// Todo: Consider making functions inline.  Avoid having things that are
-// using the zero register or misc registers from adding on the registers
-// to the free list.  Possibly remove the direct communication between
-// this and the freelist.  Considering making inline bool functions that
-// determine if the register is a logical int, logical fp, physical int,
-// physical fp, etc.
+// @todo: Consider making inline bool functions that determine if the
+// register is a logical int, logical fp, physical int, physical fp,
+// etc.
 
 SimpleRenameMap::~SimpleRenameMap()
 {
-    // Delete the rename maps as they were allocated with new.
-    //delete [] intRenameMap;
-    //delete [] floatRenameMap;
 }
 
 void
@@ -105,7 +99,8 @@ SimpleRenameMap::init(unsigned _numLogicalIntRegs,
         // Although the index refers purely to architected registers, because
         // the floating reg indices come after the integer reg indices, they
         // may exceed the size of a normal RegIndex (short).
-        for (PhysRegIndex index = numLogicalIntRegs; index < numLogicalRegs; ++index)
+        for (PhysRegIndex index = numLogicalIntRegs;
+             index < numLogicalRegs; ++index)
         {
             floatRenameMap[index].physical_reg = freg_idx++;
         }
@@ -132,14 +127,10 @@ SimpleRenameMap::init(unsigned _numLogicalIntRegs,
 void
 SimpleRenameMap::setFreeList(SimpleFreeList *fl_ptr)
 {
-    //Setup the interface to the freelist.
     freeList = fl_ptr;
 }
 
 
-// Don't allow this stage to fault; force that check to the rename stage.
-// Simply ask to rename a logical register and get back a new physical
-// register index.
 SimpleRenameMap::RenameInfo
 SimpleRenameMap::rename(RegIndex arch_reg)
 {
@@ -152,13 +143,11 @@ SimpleRenameMap::rename(RegIndex arch_reg)
         // requested architected register.
         prev_reg = intRenameMap[arch_reg].physical_reg;
 
-        // If it's not referencing the zero register, then mark the register
-        // as not ready.
+        // If it's not referencing the zero register, then rename the
+        // register.
         if (arch_reg != intZeroReg) {
-            // Get a free physical register to rename to.
             renamed_reg = freeList->getIntReg();
 
-            // Update the integer rename map.
             intRenameMap[arch_reg].physical_reg = renamed_reg;
 
             assert(renamed_reg >= 0 && renamed_reg < numPhysicalIntRegs);
@@ -168,20 +157,15 @@ SimpleRenameMap::rename(RegIndex arch_reg)
             renamed_reg = intZeroReg;
         }
     } else if (arch_reg < numLogicalRegs) {
-        // Subtract off the base offset for floating point registers.
-//        arch_reg = arch_reg - numLogicalIntRegs;
-
         // Record the current physical register that is renamed to the
         // requested architected register.
         prev_reg = floatRenameMap[arch_reg].physical_reg;
 
-        // If it's not referencing the zero register, then mark the register
-        // as not ready.
+        // If it's not referencing the zero register, then rename the
+        // register.
         if (arch_reg != floatZeroReg) {
-            // Get a free floating point register to rename to.
             renamed_reg = freeList->getFloatReg();
 
-            // Update the floating point rename map.
             floatRenameMap[arch_reg].physical_reg = renamed_reg;
 
             assert(renamed_reg < numPhysicalRegs &&
@@ -194,10 +178,10 @@ SimpleRenameMap::rename(RegIndex arch_reg)
         // Subtract off the base offset for miscellaneous registers.
         arch_reg = arch_reg - numLogicalRegs;
 
-        // No renaming happens to the misc. registers.  They are simply the
-        // registers that come after all the  physical registers; thus
-        // take the base architected register and add the physical registers
-        // to it.
+        // No renaming happens to the misc. registers.  They are
+        // simply the registers that come after all the physical
+        // registers; thus take the base architected register and add
+        // the physical registers to it.
         renamed_reg = arch_reg + numPhysicalRegs;
 
         // Set the previous register to the same register; mainly it must be
@@ -211,17 +195,12 @@ SimpleRenameMap::rename(RegIndex arch_reg)
     return RenameInfo(renamed_reg, prev_reg);
 }
 
-//Perhaps give this a pair as a return value, of the physical register
-//and whether or not it's ready.
 PhysRegIndex
 SimpleRenameMap::lookup(RegIndex arch_reg)
 {
     if (arch_reg < numLogicalIntRegs) {
         return intRenameMap[arch_reg].physical_reg;
     } else if (arch_reg < numLogicalRegs) {
-        // Subtract off the base FP offset.
-//        arch_reg = arch_reg - numLogicalIntRegs;
-
         return floatRenameMap[arch_reg].physical_reg;
     } else {
         // Subtract off the misc registers offset.
@@ -233,51 +212,23 @@ SimpleRenameMap::lookup(RegIndex arch_reg)
     }
 }
 
-// In this implementation the miscellaneous registers do not actually rename,
-// so this function does not allow you to try to change their mappings.
 void
 SimpleRenameMap::setEntry(RegIndex arch_reg, PhysRegIndex renamed_reg)
 {
+    // In this implementation the miscellaneous registers do not
+    // actually rename, so this function does not allow you to try to
+    // change their mappings.
     if (arch_reg < numLogicalIntRegs) {
         DPRINTF(Rename, "Rename Map: Integer register %i being set to %i.\n",
                 (int)arch_reg, renamed_reg);
 
         intRenameMap[arch_reg].physical_reg = renamed_reg;
     } else if (arch_reg < numLogicalIntRegs + numLogicalFloatRegs) {
-
-
         DPRINTF(Rename, "Rename Map: Float register %i being set to %i.\n",
                 (int)arch_reg - numLogicalIntRegs, renamed_reg);
 
         floatRenameMap[arch_reg].physical_reg = renamed_reg;
     }
-
-    //assert(arch_reg < (numLogicalIntRegs + numLogicalFloatRegs));
-}
-
-void
-SimpleRenameMap::squash(vector<RegIndex> freed_regs,
-                        vector<UnmapInfo> unmaps)
-{
-    panic("Not sure this function should be called.");
-
-    // Not sure the rename map should be able to access the free list
-    // like this.
-    while (!freed_regs.empty()) {
-        RegIndex free_register = freed_regs.back();
-
-        if (free_register < numPhysicalIntRegs) {
-            freeList->addIntReg(free_register);
-        } else {
-            // Subtract off the base FP dependence tag.
-            free_register = free_register - numPhysicalIntRegs;
-            freeList->addFloatReg(free_register);
-        }
-
-        freed_regs.pop_back();
-    }
-
-    // Take unmap info and roll back the rename map.
 }
 
 int
diff --git a/cpu/o3/rename_map.hh b/cpu/o3/rename_map.hh
index 3ecbe45c3..d7e49ae83 100644
--- a/cpu/o3/rename_map.hh
+++ b/cpu/o3/rename_map.hh
@@ -101,9 +101,6 @@ class SimpleRenameMap
      */
     void setEntry(RegIndex arch_reg, PhysRegIndex renamed_reg);
 
-    void squash(std::vector<RegIndex> freed_regs,
-                std::vector<UnmapInfo> unmaps);
-
     int numFreeEntries();
 
   private:
@@ -153,7 +150,7 @@ class SimpleRenameMap
     };
 
-    //Change this to private
-  public:
+    // Internal rename map storage; not part of the public interface.
+  private:
     /** Integer rename map. */
     std::vector<RenameEntry> intRenameMap;
 
diff --git a/cpu/o3/rob.hh b/cpu/o3/rob.hh
index 0748850ea..e05eebe5a 100644
--- a/cpu/o3/rob.hh
+++ b/cpu/o3/rob.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -53,9 +53,7 @@ class ROB
     enum Status {
         Running,
         Idle,
-        ROBSquashing,
-        DcacheMissStall,
-        DcacheMissComplete
+        ROBSquashing
     };
 
     /** SMT ROB Sharing Policy */
@@ -112,7 +110,7 @@ class ROB
      *  no guarantee as to the return value if the ROB is empty.
      *  @retval Pointer to the DynInst that is at the head of the ROB.
      */
-    DynInstPtr readHeadInst();
+//    DynInstPtr readHeadInst();
 
     /** Returns a pointer to the head instruction of a specific thread within
      *  the ROB.
@@ -124,7 +122,7 @@ class ROB
      *  no guarantee as to the return value if the ROB is empty.
      *  @retval Pointer to the DynInst that is at the tail of the ROB.
      */
-    DynInstPtr readTailInst();
+//    DynInstPtr readTailInst();
 
     /** Returns a pointer to the tail instruction of a specific thread within
      *  the ROB.
@@ -133,7 +131,7 @@ class ROB
     DynInstPtr readTailInst(unsigned tid);
 
     /** Retires the head instruction, removing it from the ROB. */
-    void retireHead();
+//    void retireHead();
 
     /** Retires the head instruction of a specific thread, removing it from the
      *  ROB.
@@ -141,7 +139,7 @@ class ROB
     void retireHead(unsigned tid);
 
     /** Is the oldest instruction across all threads ready. */
-    bool isHeadReady();
+//    bool isHeadReady();
 
     /** Is the oldest instruction across a particular thread ready. */
     bool isHeadReady(unsigned tid);
@@ -200,35 +198,35 @@ class ROB
     void updateTail();
 
     /** Reads the PC of the oldest head instruction. */
-    uint64_t readHeadPC();
+//    uint64_t readHeadPC();
 
     /** Reads the PC of the head instruction of a specific thread. */
-    uint64_t readHeadPC(unsigned tid);
+//    uint64_t readHeadPC(unsigned tid);
 
     /** Reads the next PC of the oldest head instruction. */
-    uint64_t readHeadNextPC();
+//    uint64_t readHeadNextPC();
 
     /** Reads the next PC of the head instruction of a specific thread. */
-    uint64_t readHeadNextPC(unsigned tid);
+//    uint64_t readHeadNextPC(unsigned tid);
 
     /** Reads the sequence number of the oldest head instruction. */
-    InstSeqNum readHeadSeqNum();
+//    InstSeqNum readHeadSeqNum();
 
     /** Reads the sequence number of the head instruction of a specific thread.
      */
-    InstSeqNum readHeadSeqNum(unsigned tid);
+//    InstSeqNum readHeadSeqNum(unsigned tid);
 
     /** Reads the PC of the youngest tail instruction. */
-    uint64_t readTailPC();
+//    uint64_t readTailPC();
 
     /** Reads the PC of the tail instruction of a specific thread. */
-    uint64_t readTailPC(unsigned tid);
+//    uint64_t readTailPC(unsigned tid);
 
     /** Reads the sequence number of the youngest tail instruction. */
-    InstSeqNum readTailSeqNum();
+//    InstSeqNum readTailSeqNum();
 
     /** Reads the sequence number of tail instruction of a specific thread. */
-    InstSeqNum readTailSeqNum(unsigned tid);
+//    InstSeqNum readTailSeqNum(unsigned tid);
 
     /** Checks if the ROB is still in the process of squashing instructions.
      *  @retval Whether or not the ROB is done squashing.
diff --git a/cpu/o3/rob_impl.hh b/cpu/o3/rob_impl.hh
index 02a4bfbee..25e0c80fd 100644
--- a/cpu/o3/rob_impl.hh
+++ b/cpu/o3/rob_impl.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -201,20 +201,15 @@ template <class Impl>
 void
 ROB<Impl>::insertInst(DynInstPtr &inst)
 {
-    // Make sure we have the right number of instructions.
     //assert(numInstsInROB == countInsts());
-
-    // Make sure the instruction is valid.
     assert(inst);
 
     DPRINTF(ROB, "Adding inst PC %#x to the ROB.\n", inst->readPC());
 
-    // If the ROB is full then exit.
     assert(numInstsInROB != numEntries);
 
     int tid = inst->threadNumber;
 
-    // Place into ROB
     instList[tid].push_back(inst);
 
     //Set Up head iterator if this is the 1st instruction in the ROB
@@ -228,10 +223,8 @@ ROB<Impl>::insertInst(DynInstPtr &inst)
     tail = instList[tid].end();
     tail--;
 
-    // Mark as set in ROB
     inst->setInROB();
 
-    // Increment ROB count
     ++numInstsInROB;
     ++threadEntries[tid];
 
@@ -242,6 +235,7 @@ ROB<Impl>::insertInst(DynInstPtr &inst)
 
 // Whatever calls this function needs to ensure that it properly frees up
 // registers prior to this function.
+/*
 template <class Impl>
 void
 ROB<Impl>::retireHead()
@@ -249,7 +243,6 @@ ROB<Impl>::retireHead()
     //assert(numInstsInROB == countInsts());
     assert(numInstsInROB > 0);
 
-    // Get the head ROB instruction's TID.
     int tid = (*head)->threadNumber;
 
     retireHead(tid);
@@ -258,6 +251,7 @@ ROB<Impl>::retireHead()
         tail = instList[tid].end();
     }
 }
+*/
 
 template <class Impl>
 void
@@ -271,18 +265,15 @@ ROB<Impl>::retireHead(unsigned tid)
 
     DynInstPtr head_inst = (*head_it);
 
-    // Make certain this can retire.
     assert(head_inst->readyToCommit());
 
     DPRINTF(ROB, "[tid:%u]: Retiring head instruction, "
             "instruction PC %#x,[sn:%lli]\n", tid, head_inst->readPC(),
             head_inst->seqNum);
 
-    // Keep track of how many instructions are in the ROB.
     --numInstsInROB;
     --threadEntries[tid];
 
-    //Mark DynInstFlags
     head_inst->removeInROB();
     head_inst->setCommitted();
 
@@ -291,12 +282,12 @@ ROB<Impl>::retireHead(unsigned tid)
     //Update "Global" Head of ROB
     updateHead();
 
-    // A special case is needed if the instruction being retired is the
-    // only instruction in the ROB; otherwise the tail iterator will become
-    // invalidated.
+    // @todo: A special case is needed if the instruction being
+    // retired is the only instruction in the ROB; otherwise the tail
+    // iterator will become invalidated.
     cpu->removeFrontInst(head_inst);
 }
-
+/*
 template <class Impl>
 bool
 ROB<Impl>::isHeadReady()
@@ -307,7 +298,7 @@ ROB<Impl>::isHeadReady()
 
     return false;
 }
-
+*/
 template <class Impl>
 bool
 ROB<Impl>::isHeadReady(unsigned tid)
@@ -537,7 +528,7 @@ ROB<Impl>::squash(InstSeqNum squash_num,unsigned tid)
         doSquash(tid);
     }
 }
-
+/*
 template <class Impl>
 typename Impl::DynInstPtr
 ROB<Impl>::readHeadInst()
@@ -549,7 +540,7 @@ ROB<Impl>::readHeadInst()
         return dummyInst;
     }
 }
-
+*/
 template <class Impl>
 typename Impl::DynInstPtr
 ROB<Impl>::readHeadInst(unsigned tid)
@@ -564,7 +555,7 @@ ROB<Impl>::readHeadInst(unsigned tid)
         return dummyInst;
     }
 }
-
+/*
 template <class Impl>
 uint64_t
 ROB<Impl>::readHeadPC()
@@ -608,7 +599,6 @@ ROB<Impl>::readHeadNextPC(unsigned tid)
     return (*head_thread)->readNextPC();
 }
 
-
 template <class Impl>
 InstSeqNum
 ROB<Impl>::readHeadSeqNum()
@@ -637,7 +627,7 @@ ROB<Impl>::readTailInst()
 
     return (*tail);
 }
-
+*/
 template <class Impl>
 typename Impl::DynInstPtr
 ROB<Impl>::readTailInst(unsigned tid)
@@ -650,7 +640,7 @@ ROB<Impl>::readTailInst(unsigned tid)
     return *tail_thread;
 }
 
-
+/*
 template <class Impl>
 uint64_t
 ROB<Impl>::readTailPC()
@@ -698,4 +688,4 @@ ROB<Impl>::readTailSeqNum(unsigned tid)
 
     return (*tail_thread)->seqNum;
 }
-
+*/
diff --git a/cpu/o3/scoreboard.cc b/cpu/o3/scoreboard.cc
index 87b0aee94..b0e433620 100644
--- a/cpu/o3/scoreboard.cc
+++ b/cpu/o3/scoreboard.cc
@@ -99,6 +99,7 @@ Scoreboard::unsetReg(PhysRegIndex ready_reg)
     if (ready_reg == zeroRegIdx ||
         ready_reg == (zeroRegIdx + numPhysicalIntRegs)) {
         // Don't do anything if int or fp zero reg.
+        return;
     }
 
     regScoreBoard[ready_reg] = 0;
diff --git a/cpu/o3/store_set.cc b/cpu/o3/store_set.cc
index a685646f3..0c957c8c7 100644
--- a/cpu/o3/store_set.cc
+++ b/cpu/o3/store_set.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -278,11 +278,6 @@ StoreSet::issued(Addr issued_PC, InstSeqNum issued_seq_num, bool is_store)
 void
 StoreSet::squash(InstSeqNum squashed_num, unsigned tid)
 {
-    // Not really sure how to do this well.
-    // Generally this is small enough that it should be okay; short circuit
-    // evaluation should take care of invalid entries.
-    // Maybe keep a list of valid LFST's?  Really ugly either way...
-
     DPRINTF(StoreSet, "StoreSet: Squashing until inum %i\n",
             squashed_num);
 
diff --git a/cpu/o3/thread_state.hh b/cpu/o3/thread_state.hh
index 17719bdeb..2c9788e4b 100644
--- a/cpu/o3/thread_state.hh
+++ b/cpu/o3/thread_state.hh
@@ -1,3 +1,30 @@
+/*
+ * Copyright (c) 2006 The Regents of The University of Michigan
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
 
 #ifndef __CPU_O3_THREAD_STATE_HH__
 #define __CPU_O3_THREAD_STATE_HH__
@@ -15,27 +42,17 @@ class EndQuiesceEvent;
 class FunctionProfile;
 class ProfileNode;
 #else
-class Process;
 class FunctionalMemory;
+class Process;
 #endif
 
-// In the new CPU case this may be quite small...It depends on what I define
-// ThreadState to be.  Currently it's only the state that exists within
-// ExecContext basically.  Leaves the interface and manipulation up to the
-// CPU.  Not sure this is useful/flexible...probably can be if I can avoid
-// including state here that parts of the pipeline can't modify directly,
-// or at least don't let them.  The only problem is for state that's needed
-// per thread, per structure.  I.e. rename table, memreqs.
-// On the other hand, it might be nice to not have to pay the extra pointer
-// lookup to get frequently used state such as a memreq (that isn't used much
-// elsewhere)...
-
-// Maybe this ozone thread state should only really have committed state?
-// I need to think about why I'm using this and what it's useful for.  Clearly
-// has benefits for SMT; basically serves same use as CPUExecContext.
-// Makes the ExecContext proxy easier.  Gives organization/central access point
-// to state of a thread that can be accessed normally (i.e. not in-flight
-// stuff within a OoO processor).  Does this need an XC proxy within it?
+/**
+ * Class that has various thread state, such as the status, the
+ * current instruction being processed, whether or not the thread has
+ * a trap pending or is being externally updated, the ExecContext
+ * proxy pointer, etc.  It also handles anything related to a specific
+ * thread's process, such as syscalls and checking valid addresses.
+ */
 template <class Impl>
 struct O3ThreadState : public ThreadState {
     typedef ExecContext::Status Status;
@@ -43,7 +60,7 @@ struct O3ThreadState : public ThreadState {
 
     Status _status;
 
-    // Current instruction?
+    // Current instruction
     TheISA::MachInst inst;
   private:
     FullCPU *cpu;
@@ -80,51 +97,11 @@ struct O3ThreadState : public ThreadState {
     void setStatus(Status new_status) { _status = new_status; }
 
 #if !FULL_SYSTEM
-
-    Fault dummyTranslation(MemReqPtr &req)
-    {
-#if 0
-        assert((req->vaddr >> 48 & 0xffff) == 0);
-#endif
-
-        // put the asid in the upper 16 bits of the paddr
-        req->paddr = req->vaddr & ~((Addr)0xffff << sizeof(Addr) * 8 - 16);
-        req->paddr = req->paddr | (Addr)req->asid << sizeof(Addr) * 8 - 16;
-        return NoFault;
-    }
-    Fault translateInstReq(MemReqPtr &req)
-    {
-        return dummyTranslation(req);
-    }
-    Fault translateDataReadReq(MemReqPtr &req)
-    {
-        return dummyTranslation(req);
-    }
-    Fault translateDataWriteReq(MemReqPtr &req)
-    {
-        return dummyTranslation(req);
-    }
-
     bool validInstAddr(Addr addr)
     { return process->validInstAddr(addr); }
 
     bool validDataAddr(Addr addr)
     { return process->validDataAddr(addr); }
-#else
-    Fault translateInstReq(MemReqPtr &req)
-    {
-        return cpu->itb->translate(req);
-    }
-
-    Fault translateDataReadReq(MemReqPtr &req)
-    {
-        return cpu->dtb->translate(req, false);
-    }
-
-    Fault translateDataWriteReq(MemReqPtr &req)
-    {
-        return cpu->dtb->translate(req, true);
-    }
 #endif
 
     bool misspeculating() { return false; }
-- 
cgit v1.2.3


From eeeee7c58f26fac9fe9b8606e26ef8e99a28e399 Mon Sep 17 00:00:00 2001
From: Kevin Lim <ktlim@umich.edu>
Date: Tue, 23 May 2006 14:38:16 -0400
Subject: Add extra flags to help new CPU handle various instructions.
 IsIprAccess flag may go away in the future (op class can be used to tell
 this), and the CPU still needs a specific way to identify/deal with syscalls.

arch/alpha/isa/decoder.isa:
    Added a few extra flags to help the new CPU identify various classes of instructions without having to force certain behaviors for all CPUs.
cpu/base_dyn_inst.hh:
cpu/static_inst.hh:
    Added extra flags.
cpu/o3/iew_impl.hh:
cpu/o3/inst_queue_impl.hh:
    Handle store conditionals specially.
cpu/o3/lsq_unit_impl.hh:
    Extra flag tells whether the instruction is a store conditional.
cpu/o3/rename_impl.hh:
    Handle IPR accesses and store conditionals specially.

--HG--
extra : convert_revision : 39debec4fa5341ae8a8ab5650bd12730aeb6c04f
---
 cpu/o3/iew_impl.hh        |  8 ++++----
 cpu/o3/inst_queue_impl.hh |  1 +
 cpu/o3/lsq_unit_impl.hh   |  7 +++----
 cpu/o3/rename_impl.hh     | 12 ++++++++++--
 4 files changed, 18 insertions(+), 10 deletions(-)

(limited to 'cpu/o3')

diff --git a/cpu/o3/iew_impl.hh b/cpu/o3/iew_impl.hh
index 59f4055a6..cf28f2efc 100644
--- a/cpu/o3/iew_impl.hh
+++ b/cpu/o3/iew_impl.hh
@@ -1100,10 +1100,10 @@ DefaultIEW<Impl>::dispatchInsts(unsigned tid)
 
             ++iewDispStoreInsts;
 
-            if (inst->isNonSpeculative()) {
-                // Non-speculative stores (namely store conditionals)
-                // need to be set as "canCommit()" so that commit can
-                // process them when they reach the head of commit.
+            if (inst->isStoreConditional()) {
+                // Store conditionals need to be set as "canCommit()"
+                // so that commit can process them when they reach the
+                // head of commit.
                 inst->setCanCommit();
                 instQueue.insertNonSpec(inst);
                 add_to_iq = false;
diff --git a/cpu/o3/inst_queue_impl.hh b/cpu/o3/inst_queue_impl.hh
index ed57ac257..71541b4f8 100644
--- a/cpu/o3/inst_queue_impl.hh
+++ b/cpu/o3/inst_queue_impl.hh
@@ -1041,6 +1041,7 @@ InstructionQueue<Impl>::doSquash(unsigned tid)
 
             // Remove the instruction from the dependency list.
             if (!squashed_inst->isNonSpeculative() &&
+                !squashed_inst->isStoreConditional() &&
                 !squashed_inst->isMemBarrier() &&
                 !squashed_inst->isWriteBarrier()) {
 
diff --git a/cpu/o3/lsq_unit_impl.hh b/cpu/o3/lsq_unit_impl.hh
index f0b4405ed..7974ddaad 100644
--- a/cpu/o3/lsq_unit_impl.hh
+++ b/cpu/o3/lsq_unit_impl.hh
@@ -424,10 +424,9 @@ LSQUnit<Impl>::executeStore(DynInstPtr &store_inst)
 
     assert(store_fault == NoFault);
 
-    if (store_inst->isNonSpeculative()) {
-        // Nonspeculative accesses (namely store conditionals)
-        // need to set themselves as able to writeback if we
-        // haven't had a fault by here.
+    if (store_inst->isStoreConditional()) {
+        // Store conditionals need to set themselves as able to
+        // writeback if we haven't had a fault by here.
         storeQueue[store_idx].canWB = true;
 
         ++storesToWB;
diff --git a/cpu/o3/rename_impl.hh b/cpu/o3/rename_impl.hh
index 081581c92..b4f1077d1 100644
--- a/cpu/o3/rename_impl.hh
+++ b/cpu/o3/rename_impl.hh
@@ -594,7 +594,14 @@ DefaultRename<Impl>::renameInsts(unsigned tid)
         // serializeAfter marks the next instruction as serializeBefore.
         // serializeBefore makes the instruction wait in rename until the ROB
         // is empty.
-        if (inst->isSerializeBefore() && !inst->isSerializeHandled()) {
+
+        // In this model, IPR accesses are serialize before
+        // instructions, and store conditionals are serialize after
+        // instructions.  This is mainly due to lack of support for
+        // out-of-order operations of either of those classes of
+        // instructions.
+        if ((inst->isIprAccess() || inst->isSerializeBefore()) &&
+            !inst->isSerializeHandled()) {
             DPRINTF(Rename, "Serialize before instruction encountered.\n");
 
             if (!inst->isTempSerializeBefore()) {
@@ -613,7 +620,8 @@ DefaultRename<Impl>::renameInsts(unsigned tid)
             blockThisCycle = true;
 
             break;
-        } else if (inst->isSerializeAfter() && !inst->isSerializeHandled()) {
+        } else if ((inst->isStoreConditional() || inst->isSerializeAfter()) &&
+                   !inst->isSerializeHandled()) {
             DPRINTF(Rename, "Serialize after instruction encountered.\n");
 
             renamedSerializing++;
-- 
cgit v1.2.3


From ff3d16ca1f7d83ce7932868d2bf1cb3e526562ea Mon Sep 17 00:00:00 2001
From: Kevin Lim <ktlim@umich.edu>
Date: Tue, 23 May 2006 16:51:16 -0400
Subject: Move kernel stats out of CPU and into XC.

arch/alpha/ev5.cc:
    Move kernel stats out of CPU and into XC.  Also be sure to check if the kernel stats exist prior to using them.

--HG--
extra : convert_revision : 565cd7026410fd7d8586f953d9b328c2e67a9473
---
 cpu/o3/alpha_cpu.hh      | 11 +++++----
 cpu/o3/alpha_cpu_impl.hh | 62 ++++++++++++++++++++++++++++++++++--------------
 2 files changed, 50 insertions(+), 23 deletions(-)

(limited to 'cpu/o3')

diff --git a/cpu/o3/alpha_cpu.hh b/cpu/o3/alpha_cpu.hh
index 78ad5f7d8..5c89e3462 100644
--- a/cpu/o3/alpha_cpu.hh
+++ b/cpu/o3/alpha_cpu.hh
@@ -35,6 +35,9 @@
 #include "sim/byteswap.hh"
 
 class EndQuiesceEvent;
+namespace Kernel {
+    class Statistics;
+};
 
 template <class Impl>
 class AlphaFullCPU : public FullO3CPU<Impl>
@@ -60,11 +63,6 @@ class AlphaFullCPU : public FullO3CPU<Impl>
 
         O3ThreadState<Impl> *thread;
 
-        Tick lastActivate;
-        Tick lastSuspend;
-
-        EndQuiesceEvent *quiesceEvent;
-
         virtual BaseCPU *getCpuPtr() { return cpu; }
 
         virtual void setCpuId(int id) { cpu->cpu_id = id; }
@@ -81,6 +79,9 @@ class AlphaFullCPU : public FullO3CPU<Impl>
         virtual AlphaITB *getITBPtr() { return cpu->itb; }
 
         virtual AlphaDTB * getDTBPtr() { return cpu->dtb; }
+
+        virtual Kernel::Statistics *getKernelStats()
+        { return thread->kernelStats; }
 #else
         virtual Process *getProcessPtr() { return thread->process; }
 #endif
diff --git a/cpu/o3/alpha_cpu_impl.hh b/cpu/o3/alpha_cpu_impl.hh
index 58b2b3548..91cd3d9e6 100644
--- a/cpu/o3/alpha_cpu_impl.hh
+++ b/cpu/o3/alpha_cpu_impl.hh
@@ -31,7 +31,6 @@
 #include "base/statistics.hh"
 #include "base/timebuf.hh"
 #include "cpu/checker/exec_context.hh"
-#include "cpu/quiesce_event.hh"
 #include "mem/mem_interface.hh"
 #include "sim/sim_events.hh"
 #include "sim/stats.hh"
@@ -44,6 +43,8 @@
 #if FULL_SYSTEM
 #include "arch/alpha/osfpal.hh"
 #include "arch/isa_traits.hh"
+#include "cpu/quiesce_event.hh"
+#include "kern/kernel_stats.hh"
 #endif
 
 using namespace TheISA;
@@ -101,11 +102,12 @@ AlphaFullCPU<Impl>::AlphaFullCPU(Params *params)
         alpha_xc_proxy->cpu = this;
         alpha_xc_proxy->thread = this->thread[i];
 
-        alpha_xc_proxy->quiesceEvent =
+#if FULL_SYSTEM
+        this->thread[i]->quiesceEvent =
             new EndQuiesceEvent(xc_proxy);
-        alpha_xc_proxy->lastActivate = 0;
-        alpha_xc_proxy->lastSuspend = 0;
-
+        this->thread[i]->lastActivate = 0;
+        this->thread[i]->lastSuspend = 0;
+#endif
         this->thread[i]->xcProxy = xc_proxy;
 
         this->execContexts.push_back(xc_proxy);
@@ -181,6 +183,9 @@ AlphaFullCPU<Impl>::AlphaXC::takeOverFrom(ExecContext *old_context)
     if (thread->quiesceEvent) {
         thread->quiesceEvent->xc = this;
     }
+
+    // Transfer kernel stats from one CPU to the other.
+    thread->kernelStats = old_context->getKernelStats();
 //    storeCondFailures = 0;
     cpu->lockFlag = false;
 #endif
@@ -200,7 +205,9 @@ AlphaFullCPU<Impl>::AlphaXC::activate(int delay)
     if (thread->status() == ExecContext::Active)
         return;
 
-    lastActivate = curTick;
+#if FULL_SYSTEM
+    thread->lastActivate = curTick;
+#endif
 
     if (thread->status() == ExecContext::Unallocated) {
         cpu->activateWhenReady(thread->tid);
@@ -222,8 +229,10 @@ AlphaFullCPU<Impl>::AlphaXC::suspend()
     if (thread->status() == ExecContext::Suspended)
         return;
 
-    lastActivate = curTick;
-    lastSuspend = curTick;
+#if FULL_SYSTEM
+    thread->lastActivate = curTick;
+    thread->lastSuspend = curTick;
+#endif
 /*
 #if FULL_SYSTEM
     // Don't change the status from active if there are pending interrupts
@@ -266,38 +275,55 @@ AlphaFullCPU<Impl>::AlphaXC::halt()
 template <class Impl>
 void
 AlphaFullCPU<Impl>::AlphaXC::regStats(const std::string &name)
-{}
+{
+#if FULL_SYSTEM
+    thread->kernelStats = new Kernel::Statistics(cpu->system);
+    thread->kernelStats->regStats(name + ".kern");
+#endif
+}
 
 template <class Impl>
 void
 AlphaFullCPU<Impl>::AlphaXC::serialize(std::ostream &os)
-{}
+{
+#if FULL_SYSTEM
+    if (thread->kernelStats)
+        thread->kernelStats->serialize(os);
+#endif
+
+}
 
 template <class Impl>
 void
 AlphaFullCPU<Impl>::AlphaXC::unserialize(Checkpoint *cp, const std::string &section)
-{}
+{
+#if FULL_SYSTEM
+    if (thread->kernelStats)
+        thread->kernelStats->unserialize(cp, section);
+#endif
+
+}
 
 #if FULL_SYSTEM
 template <class Impl>
 EndQuiesceEvent *
 AlphaFullCPU<Impl>::AlphaXC::getQuiesceEvent()
 {
-    return quiesceEvent;
+    return thread->quiesceEvent;
 }
 
 template <class Impl>
 Tick
 AlphaFullCPU<Impl>::AlphaXC::readLastActivate()
 {
-    return lastActivate;
+    return thread->lastActivate;
 }
 
 template <class Impl>
 Tick
 AlphaFullCPU<Impl>::AlphaXC::readLastSuspend()
 {
-    return lastSuspend;
+    return thread->lastSuspend;
 }
 
 template <class Impl>
@@ -595,7 +621,7 @@ AlphaFullCPU<Impl>::hwrei(unsigned tid)
     // Need to clear the lock flag upon returning from an interrupt.
     this->lockFlag = false;
 
-    this->kernelStats->hwrei();
+    this->thread[tid]->kernelStats->hwrei();
 
     this->checkInterrupts = true;
 
@@ -607,9 +633,9 @@ template <class Impl>
 bool
 AlphaFullCPU<Impl>::simPalCheck(int palFunc, unsigned tid)
 {
-    if (this->kernelStats)
-        this->kernelStats->callpal(palFunc,
-                                   this->execContexts[tid]);
+    if (this->thread[tid]->kernelStats)
+        this->thread[tid]->kernelStats->callpal(palFunc,
+                                                this->execContexts[tid]);
 
     switch (palFunc) {
       case PAL::halt:
-- 
cgit v1.2.3


From 358cf1b11765024309fe986262bb3a3d16c8a720 Mon Sep 17 00:00:00 2001
From: Kevin Lim <ktlim@umich.edu>
Date: Tue, 23 May 2006 17:03:43 -0400
Subject: Rework how instructions are scheduled and executed. The "execute"
 portion of IEW is really just the last cycle of execution, at which point
 execute() gets called.  Execution begins inside the IQ, when it schedules FUs
 for specific instructions.  As a result, the Execute stage should just pull
 all completing instructions out of the IQ stage and execute them. Limiting
 the number of writebacks outstanding must still be done.

cpu/o3/iew_impl.hh:
    Rework how instructions are scheduled and executed.  There shouldn't be a specific "width" from issue to execute because issue does the scheduling of the functional units (really the beginning of the execution).
cpu/o3/inst_queue.hh:
cpu/o3/inst_queue_impl.hh:
    Rework how instructions are scheduled and executed.

--HG--
extra : convert_revision : bbf1a8a4c0a2f2a938bdd78d74493048fd3b4b55
---
 cpu/o3/iew_impl.hh        |  5 +++--
 cpu/o3/inst_queue.hh      |  4 ++++
 cpu/o3/inst_queue_impl.hh | 22 +++++++++++++++++-----
 3 files changed, 24 insertions(+), 7 deletions(-)

(limited to 'cpu/o3')

diff --git a/cpu/o3/iew_impl.hh b/cpu/o3/iew_impl.hh
index 59f4055a6..c22850131 100644
--- a/cpu/o3/iew_impl.hh
+++ b/cpu/o3/iew_impl.hh
@@ -1232,13 +1232,14 @@ DefaultIEW<Impl>::executeInsts()
 #endif
 
     // Execute/writeback any instructions that are available.
+    int insts_to_execute = fromIssue->size;
     int inst_num = 0;
-    for ( ; inst_num < issueWidth && fromIssue->insts[inst_num];
+    for (; inst_num < insts_to_execute;
           ++inst_num) {
 
         DPRINTF(IEW, "Execute: Executing instructions from IQ.\n");
 
-        DynInstPtr inst = fromIssue->insts[inst_num];
+        DynInstPtr inst = instQueue.getInstToExecute();
 
         DPRINTF(IEW, "Execute: Processing PC %#x, [tid:%i] [sn:%i].\n",
                 inst->readPC(), inst->threadNumber,inst->seqNum);
diff --git a/cpu/o3/inst_queue.hh b/cpu/o3/inst_queue.hh
index 6bdf4ddc2..518de73d9 100644
--- a/cpu/o3/inst_queue.hh
+++ b/cpu/o3/inst_queue.hh
@@ -171,6 +171,8 @@ class InstructionQueue
      */
     void insertBarrier(DynInstPtr &barr_inst);
 
+    DynInstPtr getInstToExecute();
+
     /**
      * Records the instruction as the producer of a register without
      * adding it to the rest of the IQ.
@@ -272,6 +274,8 @@ class InstructionQueue
     /** List of all the instructions in the IQ (some of which may be issued). */
     std::list<DynInstPtr> instList[Impl::MaxThreads];
 
+    std::list<DynInstPtr> instsToExecute;
+
     /**
      * Struct for comparing entries to be added to the priority queue.  This
      * gives reverse ordering to the instructions in terms of sequence
diff --git a/cpu/o3/inst_queue_impl.hh b/cpu/o3/inst_queue_impl.hh
index ed57ac257..412d59768 100644
--- a/cpu/o3/inst_queue_impl.hh
+++ b/cpu/o3/inst_queue_impl.hh
@@ -588,6 +588,16 @@ InstructionQueue<Impl>::insertBarrier(DynInstPtr &barr_inst)
     insertNonSpec(barr_inst);
 }
 
+template <class Impl>
+typename Impl::DynInstPtr
+InstructionQueue<Impl>::getInstToExecute()
+{
+    assert(!instsToExecute.empty());
+    DynInstPtr inst = instsToExecute.front();
+    instsToExecute.pop_front();
+    return inst;
+}
+
 template <class Impl>
 void
 InstructionQueue<Impl>::addToOrderList(OpClass op_class)
@@ -662,9 +672,11 @@ InstructionQueue<Impl>::processFUCompletion(DynInstPtr &inst, int fu_idx)
     // @todo: This could break if there's multiple multi-cycle ops
     // finishing on this cycle.  Maybe implement something like
     // instToCommit in iew_impl.hh.
-    int &size = issueToExecuteQueue->access(0)->size;
+    issueToExecuteQueue->access(0)->size++;
+    instsToExecute.push_back(inst);
+//    int &size = issueToExecuteQueue->access(0)->size;
 
-    issueToExecuteQueue->access(0)->insts[size++] = inst;
+//    issueToExecuteQueue->access(0)->insts[size++] = inst;
 }
 
 // @todo: Figure out a better way to remove the squashed items from the
@@ -690,9 +702,8 @@ InstructionQueue<Impl>::scheduleReadyInsts()
     ListOrderIt order_it = listOrder.begin();
     ListOrderIt order_end_it = listOrder.end();
     int total_issued = 0;
-    int exec_queue_slot = i2e_info->size;
 
-    while (exec_queue_slot < totalWidth && total_issued < totalWidth &&
+    while (total_issued < totalWidth &&
            order_it != order_end_it) {
         OpClass op_class = (*order_it).queueType;
 
@@ -733,8 +744,9 @@ InstructionQueue<Impl>::scheduleReadyInsts()
 
         if (idx == -2 || idx != -1) {
             if (op_latency == 1) {
-                i2e_info->insts[exec_queue_slot++] = issuing_inst;
+//                i2e_info->insts[exec_queue_slot++] = issuing_inst;
                 i2e_info->size++;
+                instsToExecute.push_back(issuing_inst);
 
                 // Add the FU onto the list of FU's to be freed next
                 // cycle if we used one.
-- 
cgit v1.2.3