diff options
Diffstat (limited to 'cpu')
42 files changed, 2678 insertions, 730 deletions
diff --git a/cpu/base_cpu.cc b/cpu/base_cpu.cc index 3ee7a3892..988c7a602 100644 --- a/cpu/base_cpu.cc +++ b/cpu/base_cpu.cc @@ -37,6 +37,8 @@ #include "sim/param.hh" #include "sim/sim_events.hh" +#include "base/trace.hh" + using namespace std; vector<BaseCPU *> BaseCPU::cpuList; @@ -46,6 +48,7 @@ vector<BaseCPU *> BaseCPU::cpuList; // been initialized int maxThreadsPerCPU = 1; +extern void debug_break(); #ifdef FULL_SYSTEM BaseCPU::BaseCPU(const string &_name, int _number_of_threads, Counter max_insts_any_thread, @@ -64,9 +67,16 @@ BaseCPU::BaseCPU(const string &_name, int _number_of_threads, : SimObject(_name), number_of_threads(_number_of_threads) #endif { + DPRINTF(FullCPU, "BaseCPU: Creating object, mem address %#x.\n", this); + + debug_break(); + // add self to global list of CPUs cpuList.push_back(this); + DPRINTF(FullCPU, "BaseCPU: CPU added to cpuList, mem address %#x.\n", + this); + if (number_of_threads > maxThreadsPerCPU) maxThreadsPerCPU = number_of_threads; diff --git a/cpu/base_dyn_inst.cc b/cpu/base_dyn_inst.cc index c527eb08b..74f6b8a6c 100644 --- a/cpu/base_dyn_inst.cc +++ b/cpu/base_dyn_inst.cc @@ -83,7 +83,7 @@ BaseDynInst<Impl>::BaseDynInst(MachInst machInst, Addr inst_PC, seqNum = seq_num; - specMemWrite = false; +// specMemWrite = false; canIssue = false; issued = false; @@ -95,7 +95,7 @@ BaseDynInst<Impl>::BaseDynInst(MachInst machInst, Addr inst_PC, blockingInst = false; recoverInst = false; specMode = false; - btbMissed = false; +// btbMissed = false; // Eventually make this a parameter. threadNumber = 0; // Also make this a parameter. @@ -139,12 +139,12 @@ BaseDynInst<Impl>::BaseDynInst(StaticInstPtr<ISA> &_staticInst) effAddr = MemReq::inval_addr; physEffAddr = MemReq::inval_addr; - specMemWrite = false; +// specMemWrite = false; blockingInst = false; recoverInst = false; specMode = false; - btbMissed = false; +// btbMissed = false; // Make sure to have the renamed register entries set to the same // as the normal register entries. 
It will allow the IQ to work diff --git a/cpu/base_dyn_inst.hh b/cpu/base_dyn_inst.hh index fe30b5195..171721e61 100644 --- a/cpu/base_dyn_inst.hh +++ b/cpu/base_dyn_inst.hh @@ -146,7 +146,10 @@ class BaseDynInst : public FastAlloc, public RefCounted bool threadsyncWait; /** If the BTB missed. */ - bool btbMissed; +// bool btbMissed; + + /** The global history of this instruction (branch). */ +// unsigned globalHistory; /** The thread this instruction is from. */ short threadNumber; @@ -212,7 +215,7 @@ class BaseDynInst : public FastAlloc, public RefCounted static int instcount; /** Did this instruction do a spec write? */ - bool specMemWrite; +// bool specMemWrite; private: /** Physical register index of the destination registers of this @@ -287,15 +290,22 @@ class BaseDynInst : public FastAlloc, public RefCounted /** Returns whether the instruction was predicted taken or not. */ bool predTaken() { -// DPRINTF(FullCPU, "PC: %08p\n", PC); -// DPRINTF(FullCPU, "predPC: %08p\n", predPC); - return( predPC != (PC + sizeof(MachInst) ) ); } /** Returns whether the instruction mispredicted. */ bool mispredicted() { return (predPC != nextPC); } +/* + unsigned readGlobalHist() { + return globalHistory; + } + + void setGlobalHist(unsigned history) { + globalHistory = history; + } +*/ + // // Instruction types. Forward checks to StaticInst object. // @@ -452,7 +462,7 @@ class BaseDynInst : public FastAlloc, public RefCounted OpClass opClass() const { return staticInst->opClass(); } /** Returns whether or not the BTB missed. */ - bool btbMiss() const { return btbMissed; } +// bool btbMiss() const { return btbMissed; } /** Returns the branch target address. 
*/ Addr branchTarget() const { return staticInst->branchTarget(PC); } @@ -579,8 +589,8 @@ BaseDynInst<Impl>::write(T data, Addr addr, unsigned flags, uint64_t *res) storeSize = sizeof(T); storeData = data; - if (specMode) - specMemWrite = true; +// if (specMode) +// specMemWrite = true; MemReqPtr req = new MemReq(addr, xc, sizeof(T), flags); diff --git a/cpu/beta_cpu/2bit_local_pred.cc b/cpu/beta_cpu/2bit_local_pred.cc index 88c39a9b0..ef7f23d49 100644 --- a/cpu/beta_cpu/2bit_local_pred.cc +++ b/cpu/beta_cpu/2bit_local_pred.cc @@ -75,18 +75,34 @@ DefaultBP::getLocalIndex(Addr &branch_addr) bool DefaultBP::lookup(Addr &branch_addr) { + bool taken; uint8_t local_prediction; unsigned local_predictor_idx = getLocalIndex(branch_addr); DPRINTF(Fetch, "Branch predictor: Looking up index %#x\n", local_predictor_idx); + assert(local_predictor_idx < localPredictorSize); + local_prediction = localCtrs[local_predictor_idx].read(); DPRINTF(Fetch, "Branch predictor: prediction is %i.\n", (int)local_prediction); - return getPrediction(local_prediction); + taken = getPrediction(local_prediction); + +#if 0 + // Speculative update. 
+ if (taken) { + DPRINTF(Fetch, "Branch predictor: Branch updated as taken.\n"); + localCtrs[local_predictor_idx].increment(); + } else { + DPRINTF(Fetch, "Branch predictor: Branch updated as not taken.\n"); + localCtrs[local_predictor_idx].decrement(); + } +#endif + + return taken; } void @@ -100,11 +116,17 @@ DefaultBP::update(Addr &branch_addr, bool taken) DPRINTF(Fetch, "Branch predictor: Looking up index %#x\n", local_predictor_idx); + assert(local_predictor_idx < localPredictorSize); + + // Increment or decrement twice to undo speculative update, then + // properly update if (taken) { DPRINTF(Fetch, "Branch predictor: Branch updated as taken.\n"); localCtrs[local_predictor_idx].increment(); +// localCtrs[local_predictor_idx].increment(); } else { DPRINTF(Fetch, "Branch predictor: Branch updated as not taken.\n"); localCtrs[local_predictor_idx].decrement(); +// localCtrs[local_predictor_idx].decrement(); } } diff --git a/cpu/beta_cpu/alpha_dyn_inst.hh b/cpu/beta_cpu/alpha_dyn_inst.hh index 4e1cebd11..c964762db 100644 --- a/cpu/beta_cpu/alpha_dyn_inst.hh +++ b/cpu/beta_cpu/alpha_dyn_inst.hh @@ -19,19 +19,19 @@ template <class Impl> class AlphaDynInst : public BaseDynInst<Impl> { public: - // Typedef for the CPU. + /** Typedef for the CPU. */ typedef typename Impl::FullCPU FullCPU; - //Typedef to get the ISA. + /** Typedef to get the ISA. */ typedef typename Impl::ISA ISA; - /// Binary machine instruction type. + /** Binary machine instruction type. */ typedef typename ISA::MachInst MachInst; - /// Memory address type. + /** Memory address type. */ typedef typename ISA::Addr Addr; - /// Logical register index type. + /** Logical register index type. */ typedef typename ISA::RegIndex RegIndex; - /// Integer register index type. + /** Integer register index type. 
*/ typedef typename ISA::IntReg IntReg; enum { @@ -54,6 +54,7 @@ class AlphaDynInst : public BaseDynInst<Impl> return fault; } + public: uint64_t readUniq(); void setUniq(uint64_t val); diff --git a/cpu/beta_cpu/alpha_full_cpu.hh b/cpu/beta_cpu/alpha_full_cpu.hh index 0e094b122..e01eba3bf 100644 --- a/cpu/beta_cpu/alpha_full_cpu.hh +++ b/cpu/beta_cpu/alpha_full_cpu.hh @@ -29,6 +29,8 @@ class AlphaFullCPU : public FullBetaCPU<Impl> #endif public: + void regStats(); + #ifdef FULL_SYSTEM bool inPalMode(); @@ -66,14 +68,17 @@ class AlphaFullCPU : public FullBetaCPU<Impl> req->paddr = req->paddr | (Addr)req->asid << sizeof(Addr) * 8 - 16; return No_Fault; } + Fault translateInstReq(MemReqPtr &req) { return dummyTranslation(req); } + Fault translateDataReadReq(MemReqPtr &req) { return dummyTranslation(req); } + Fault translateDataWriteReq(MemReqPtr &req) { return dummyTranslation(req); @@ -81,73 +86,6 @@ class AlphaFullCPU : public FullBetaCPU<Impl> #endif - template <class T> - Fault read(MemReqPtr &req, T &data) - { -#if defined(TARGET_ALPHA) && defined(FULL_SYSTEM) - if (req->flags & LOCKED) { - MiscRegFile *cregs = &req->xc->regs.miscRegs; - cregs->lock_addr = req->paddr; - cregs->lock_flag = true; - } -#endif - - Fault error; - error = mem->read(req, data); - data = htoa(data); - return error; - } - - template <class T> - Fault write(MemReqPtr &req, T &data) - { -#if defined(TARGET_ALPHA) && defined(FULL_SYSTEM) - - MiscRegFile *cregs; - - // If this is a store conditional, act appropriately - if (req->flags & LOCKED) { - cregs = &xc->regs.miscRegs; - - if (req->flags & UNCACHEABLE) { - // Don't update result register (see stq_c in isa_desc) - req->result = 2; - req->xc->storeCondFailures = 0;//Needed? 
[RGD] - } else { - req->result = cregs->lock_flag; - if (!cregs->lock_flag || - ((cregs->lock_addr & ~0xf) != (req->paddr & ~0xf))) { - cregs->lock_flag = false; - if (((++req->xc->storeCondFailures) % 100000) == 0) { - std::cerr << "Warning: " - << req->xc->storeCondFailures - << " consecutive store conditional failures " - << "on cpu " << cpu_id - << std::endl; - } - return No_Fault; - } - else req->xc->storeCondFailures = 0; - } - } - - // Need to clear any locked flags on other proccessors for - // this address. Only do this for succsful Store Conditionals - // and all other stores (WH64?). Unsuccessful Store - // Conditionals would have returned above, and wouldn't fall - // through. - for (int i = 0; i < system->execContexts.size(); i++){ - cregs = &system->execContexts[i]->regs.miscRegs; - if ((cregs->lock_addr & ~0xf) == (req->paddr & ~0xf)) { - cregs->lock_flag = false; - } - } - -#endif - - return mem->write(req, (T)htoa(data)); - } - // Later on may want to remove this misc stuff from the regfile and // have it handled at this level. Might prove to be an issue when // trying to rename source/destination registers... @@ -240,6 +178,76 @@ class AlphaFullCPU : public FullBetaCPU<Impl> // Called by initCPU. Implement as I please. 
void initIPRs(RegFile *regs); #endif + + + template <class T> + Fault read(MemReqPtr &req, T &data) + { +#if defined(TARGET_ALPHA) && defined(FULL_SYSTEM) + if (req->flags & LOCKED) { + MiscRegFile *cregs = &req->xc->regs.miscRegs; + cregs->lock_addr = req->paddr; + cregs->lock_flag = true; + } +#endif + + Fault error; + error = mem->read(req, data); + data = htoa(data); + return error; + } + + + template <class T> + Fault write(MemReqPtr &req, T &data) + { +#if defined(TARGET_ALPHA) && defined(FULL_SYSTEM) + + MiscRegFile *cregs; + + // If this is a store conditional, act appropriately + if (req->flags & LOCKED) { + cregs = &xc->regs.miscRegs; + + if (req->flags & UNCACHEABLE) { + // Don't update result register (see stq_c in isa_desc) + req->result = 2; + req->xc->storeCondFailures = 0;//Needed? [RGD] + } else { + req->result = cregs->lock_flag; + if (!cregs->lock_flag || + ((cregs->lock_addr & ~0xf) != (req->paddr & ~0xf))) { + cregs->lock_flag = false; + if (((++req->xc->storeCondFailures) % 100000) == 0) { + std::cerr << "Warning: " + << req->xc->storeCondFailures + << " consecutive store conditional failures " + << "on cpu " << cpu_id + << std::endl; + } + return No_Fault; + } + else req->xc->storeCondFailures = 0; + } + } + + // Need to clear any locked flags on other proccessors for + // this address. Only do this for succsful Store Conditionals + // and all other stores (WH64?). Unsuccessful Store + // Conditionals would have returned above, and wouldn't fall + // through. 
+ for (int i = 0; i < system->execContexts.size(); i++){ + cregs = &system->execContexts[i]->regs.miscRegs; + if ((cregs->lock_addr & ~0xf) == (req->paddr & ~0xf)) { + cregs->lock_flag = false; + } + } + +#endif + + return mem->write(req, (T)htoa(data)); + } + }; #endif // __ALPHA_FULL_CPU_HH__ diff --git a/cpu/beta_cpu/alpha_full_cpu_builder.cc b/cpu/beta_cpu/alpha_full_cpu_builder.cc index 5fe96d656..f37081232 100644 --- a/cpu/beta_cpu/alpha_full_cpu_builder.cc +++ b/cpu/beta_cpu/alpha_full_cpu_builder.cc @@ -81,17 +81,38 @@ Param<unsigned> issueWidth; Param<unsigned> executeWidth; Param<unsigned> executeIntWidth; Param<unsigned> executeFloatWidth; +Param<unsigned> executeBranchWidth; +Param<unsigned> executeMemoryWidth; Param<unsigned> iewToCommitDelay; Param<unsigned> renameToROBDelay; Param<unsigned> commitWidth; Param<unsigned> squashWidth; +#if 0 Param<unsigned> localPredictorSize; Param<unsigned> localPredictorCtrBits; +#endif +Param<unsigned> local_predictor_size; +Param<unsigned> local_ctr_bits; +Param<unsigned> local_history_table_size; +Param<unsigned> local_history_bits; +Param<unsigned> global_predictor_size; +Param<unsigned> global_ctr_bits; +Param<unsigned> global_history_bits; +Param<unsigned> choice_predictor_size; +Param<unsigned> choice_ctr_bits; + Param<unsigned> BTBEntries; Param<unsigned> BTBTagSize; +Param<unsigned> RASSize; + +Param<unsigned> LQEntries; +Param<unsigned> SQEntries; +Param<unsigned> LFSTSize; +Param<unsigned> SSITSize; + Param<unsigned> numPhysIntRegs; Param<unsigned> numPhysFloatRegs; Param<unsigned> numIQEntries; @@ -168,6 +189,8 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(BaseFullCPU) INIT_PARAM(executeWidth, "Execute width"), INIT_PARAM(executeIntWidth, "Integer execute width"), INIT_PARAM(executeFloatWidth, "Floating point execute width"), + INIT_PARAM(executeBranchWidth, "Branch execute width"), + INIT_PARAM(executeMemoryWidth, "Memory execute width"), INIT_PARAM(iewToCommitDelay, "Issue/Execute/Writeback to commit " "delay"), @@ 
-175,12 +198,30 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(BaseFullCPU) INIT_PARAM(commitWidth, "Commit width"), INIT_PARAM(squashWidth, "Squash width"), +#if 0 INIT_PARAM(localPredictorSize, "Size of the local predictor in entries. " "Must be a power of 2."), INIT_PARAM(localPredictorCtrBits, "Number of bits per counter for bpred"), +#endif + INIT_PARAM(local_predictor_size, "Size of local predictor"), + INIT_PARAM(local_ctr_bits, "Bits per counter"), + INIT_PARAM(local_history_table_size, "Size of local history table"), + INIT_PARAM(local_history_bits, "Bits for the local history"), + INIT_PARAM(global_predictor_size, "Size of global predictor"), + INIT_PARAM(global_ctr_bits, "Bits per counter"), + INIT_PARAM(global_history_bits, "Bits of history"), + INIT_PARAM(choice_predictor_size, "Size of choice predictor"), + INIT_PARAM(choice_ctr_bits, "Bits of choice counters"), + INIT_PARAM(BTBEntries, "Number of BTB entries"), INIT_PARAM(BTBTagSize, "Size of the BTB tags, in bits"), + INIT_PARAM(RASSize, "RAS size"), + + INIT_PARAM(LQEntries, "Number of load queue entries"), + INIT_PARAM(SQEntries, "Number of store queue entries"), + INIT_PARAM(LFSTSize, "Last fetched store table size"), + INIT_PARAM(SSITSize, "Store set ID table size"), INIT_PARAM(numPhysIntRegs, "Number of physical integer registers"), INIT_PARAM(numPhysFloatRegs, "Number of physical floating point " @@ -277,17 +318,37 @@ CREATE_SIM_OBJECT(BaseFullCPU) params.executeWidth = executeWidth; params.executeIntWidth = executeIntWidth; params.executeFloatWidth = executeFloatWidth; + params.executeBranchWidth = executeBranchWidth; + params.executeMemoryWidth = executeMemoryWidth; params.iewToCommitDelay = iewToCommitDelay; params.renameToROBDelay = renameToROBDelay; params.commitWidth = commitWidth; params.squashWidth = squashWidth; - +#if 0 params.localPredictorSize = localPredictorSize; params.localPredictorCtrBits = localPredictorCtrBits; +#endif + params.local_predictor_size = local_predictor_size; + 
params.local_ctr_bits = local_ctr_bits; + params.local_history_table_size = local_history_table_size; + params.local_history_bits = local_history_bits; + params.global_predictor_size = global_predictor_size; + params.global_ctr_bits = global_ctr_bits; + params.global_history_bits = global_history_bits; + params.choice_predictor_size = choice_predictor_size; + params.choice_ctr_bits = choice_ctr_bits; + params.BTBEntries = BTBEntries; params.BTBTagSize = BTBTagSize; + params.RASSize = RASSize; + + params.LQEntries = LQEntries; + params.SQEntries = SQEntries; + params.SSITSize = SSITSize; + params.LFSTSize = LFSTSize; + params.numPhysIntRegs = numPhysIntRegs; params.numPhysFloatRegs = numPhysFloatRegs; params.numIQEntries = numIQEntries; diff --git a/cpu/beta_cpu/alpha_full_cpu_impl.hh b/cpu/beta_cpu/alpha_full_cpu_impl.hh index 8bfc0777e..ee8f9f33b 100644 --- a/cpu/beta_cpu/alpha_full_cpu_impl.hh +++ b/cpu/beta_cpu/alpha_full_cpu_impl.hh @@ -27,6 +27,19 @@ AlphaFullCPU<Impl>::AlphaFullCPU(Params ¶ms) rob.setCPU(this); } +template <class Impl> +void +AlphaFullCPU<Impl>::regStats() +{ + // Register stats for everything that has stats. + fullCPURegStats(); + fetch.regStats(); + decode.regStats(); + rename.regStats(); + iew.regStats(); + commit.regStats(); +} + #ifndef FULL_SYSTEM template <class Impl> @@ -92,6 +105,14 @@ AlphaFullCPU<Impl>::squashStages() rob.squash(rob_head); commit.setSquashing(); + + // Now hack the time buffer to clear the sequence numbers in the places + // where the stages might read it.? + for (int i = 0; i < 5; ++i) + { + timeBuffer.access(-i)->commitInfo.doneSeqNum = 0; + } + } #endif // FULL_SYSTEM @@ -178,7 +199,7 @@ template <class Impl> uint64_t * AlphaFullCPU<Impl>::getIpr() { - return regs.ipr; + return regFile.getIpr(); } template <class Impl> @@ -564,7 +585,7 @@ AlphaFullCPU<Impl>::setIntrFlag(int val) regs.intrflag = val; } -// Maybe have this send back from IEW stage to squash and update PC. 
+// Can force commit stage to squash and stuff. template <class Impl> Fault AlphaFullCPU<Impl>::hwrei() diff --git a/cpu/beta_cpu/alpha_params.hh b/cpu/beta_cpu/alpha_params.hh index 92dfd35f5..ecde4b016 100644 --- a/cpu/beta_cpu/alpha_params.hh +++ b/cpu/beta_cpu/alpha_params.hh @@ -72,6 +72,8 @@ class AlphaSimpleParams : public BaseFullCPU::Params unsigned executeWidth; unsigned executeIntWidth; unsigned executeFloatWidth; + unsigned executeBranchWidth; + unsigned executeMemoryWidth; // // Commit @@ -84,11 +86,26 @@ class AlphaSimpleParams : public BaseFullCPU::Params // // Branch predictor (BP & BTB) // +/* unsigned localPredictorSize; unsigned localPredictorCtrBits; +*/ + + unsigned local_predictor_size; + unsigned local_ctr_bits; + unsigned local_history_table_size; + unsigned local_history_bits; + unsigned global_predictor_size; + unsigned global_ctr_bits; + unsigned global_history_bits; + unsigned choice_predictor_size; + unsigned choice_ctr_bits; + unsigned BTBEntries; unsigned BTBTagSize; + unsigned RASSize; + // // Load store queue // @@ -96,6 +113,12 @@ class AlphaSimpleParams : public BaseFullCPU::Params unsigned SQEntries; // + // Memory dependence + // + unsigned SSITSize; + unsigned LFSTSize; + + // // Miscellaneous // unsigned numPhysIntRegs; diff --git a/cpu/beta_cpu/bpred_unit.cc b/cpu/beta_cpu/bpred_unit.cc index 6de2def44..c4a79fbbe 100644 --- a/cpu/beta_cpu/bpred_unit.cc +++ b/cpu/beta_cpu/bpred_unit.cc @@ -1,5 +1,6 @@ #include "cpu/beta_cpu/bpred_unit_impl.hh" #include "cpu/beta_cpu/alpha_impl.hh" +#include "cpu/beta_cpu/alpha_dyn_inst.hh" -template DefaultBPredUnit<AlphaSimpleImpl>; +template TwobitBPredUnit<AlphaSimpleImpl>; diff --git a/cpu/beta_cpu/bpred_unit.hh b/cpu/beta_cpu/bpred_unit.hh index 71191f5b7..53c7146c5 100644 --- a/cpu/beta_cpu/bpred_unit.hh +++ b/cpu/beta_cpu/bpred_unit.hh @@ -4,9 +4,15 @@ // For Addr type. 
#include "arch/alpha/isa_traits.hh" +#include "base/statistics.hh" +#include "cpu/inst_seq.hh" #include "cpu/beta_cpu/2bit_local_pred.hh" +#include "cpu/beta_cpu/tournament_pred.hh" #include "cpu/beta_cpu/btb.hh" +#include "cpu/beta_cpu/ras.hh" + +#include <list> /** * Basically a wrapper class to hold both the branch predictor @@ -18,34 +24,86 @@ * object, and be able to call the constructors on the BP and BTB. */ template<class Impl> -class DefaultBPredUnit +class TwobitBPredUnit { public: typedef typename Impl::Params Params; + typedef typename Impl::DynInstPtr DynInstPtr; + + TwobitBPredUnit(Params ¶ms); + + void regStats(); + + bool predict(DynInstPtr &inst, Addr &PC); + + void squash(const InstSeqNum &squashed_sn, const Addr &corr_target, + bool actually_taken); - DefaultBPredUnit(Params ¶ms); + void squash(const InstSeqNum &squashed_sn); + + void update(const InstSeqNum &done_sn); bool BPLookup(Addr &inst_PC) { return BP.lookup(inst_PC); } + unsigned BPReadGlobalHist() + { return 0; } + bool BTBValid(Addr &inst_PC) { return BTB.valid(inst_PC); } Addr BTBLookup(Addr &inst_PC) { return BTB.lookup(inst_PC); } - void BPUpdate(Addr &inst_PC, bool taken) + // Will want to include global history. 
+ void BPUpdate(Addr &inst_PC, unsigned global_history, bool taken) { BP.update(inst_PC, taken); } void BTBUpdate(Addr &inst_PC, Addr &target_PC) { BTB.update(inst_PC, target_PC); } private: + struct PredictorHistory { + PredictorHistory(const InstSeqNum &seq_num, const Addr &inst_PC, + const bool pred_taken) + : seqNum(seq_num), PC(inst_PC), predTaken(pred_taken), + globalHistory(0), usedRAS(0), wasCall(0), RASIndex(0), + RASTarget(0) + { } + + InstSeqNum seqNum; + + Addr PC; + + bool predTaken; + + unsigned globalHistory; + + bool usedRAS; + + bool wasCall; + + unsigned RASIndex; + + Addr RASTarget; + }; + + std::list<PredictorHistory> predHist; DefaultBP BP; DefaultBTB BTB; + ReturnAddrStack RAS; + + Stats::Scalar<> lookups; + Stats::Scalar<> condPredicted; + Stats::Scalar<> condIncorrect; + Stats::Scalar<> BTBLookups; + Stats::Scalar<> BTBHits; + Stats::Scalar<> BTBCorrect; + Stats::Scalar<> usedRAS; + Stats::Scalar<> RASIncorrect; }; #endif // __BPRED_UNIT_HH__ diff --git a/cpu/beta_cpu/bpred_unit_impl.hh b/cpu/beta_cpu/bpred_unit_impl.hh index 47415ce9b..02c613d34 100644 --- a/cpu/beta_cpu/bpred_unit_impl.hh +++ b/cpu/beta_cpu/bpred_unit_impl.hh @@ -1,13 +1,247 @@ #include "cpu/beta_cpu/bpred_unit.hh" +#include "base/traceflags.hh" +#include "base/trace.hh" template<class Impl> -DefaultBPredUnit<Impl>::DefaultBPredUnit(Params ¶ms) - : BP(params.localPredictorSize, - params.localPredictorCtrBits, +TwobitBPredUnit<Impl>::TwobitBPredUnit(Params ¶ms) + : BP(params.local_predictor_size, + params.local_ctr_bits, params.instShiftAmt), BTB(params.BTBEntries, params.BTBTagSize, - params.instShiftAmt) + params.instShiftAmt), + RAS(params.RASSize) { } + +template <class Impl> +void +TwobitBPredUnit<Impl>::regStats() +{ + lookups + .name(name() + ".BPredUnit.lookups") + .desc("Number of BP lookups") + ; + + condPredicted + .name(name() + ".BPredUnit.condPredicted") + .desc("Number of conditional branches predicted") + ; + + condIncorrect + .name(name() + 
".BPredUnit.condIncorrect") + .desc("Number of conditional branches incorrect") + ; + + BTBLookups + .name(name() + ".BPredUnit.BTBLookups") + .desc("Number of BTB lookups") + ; + + BTBHits + .name(name() + ".BPredUnit.BTBHits") + .desc("Number of BTB hits") + ; + + BTBCorrect + .name(name() + ".BPredUnit.BTBCorrect") + .desc("Number of correct BTB predictions (this stat may not " + "work properly.") + ; + + usedRAS + .name(name() + ".BPredUnit.usedRAS") + .desc("Number of times the RAS was used.") + ; + + RASIncorrect + .name(name() + ".BPredUnit.RASInCorrect") + .desc("Number of incorrect RAS predictions.") + ; +} + +template <class Impl> +bool +TwobitBPredUnit<Impl>::predict(DynInstPtr &inst, Addr &PC) +{ + // See if branch predictor predicts taken. + // If so, get its target addr either from the BTB or the RAS. + // Once that's done, speculatively update the predictor? + // Save off record of branch stuff so the RAS can be fixed + // up once it's done. + + bool pred_taken = false; + Addr target; + + ++lookups; + + if (inst->isUncondCtrl()) { + DPRINTF(Fetch, "BranchPred: Unconditional control.\n"); + pred_taken = true; + } else { + ++condPredicted; + + pred_taken = BPLookup(PC); + + DPRINTF(Fetch, "BranchPred: Branch predictor predicted %i for PC %#x" + "\n", pred_taken, inst->readPC()); + } + + PredictorHistory predict_record(inst->seqNum, PC, pred_taken); + + // Now lookup in the BTB or RAS. + if (pred_taken) { + if (inst->isReturn()) { + ++usedRAS; + + // If it's a function return call, then look up the address + // in the RAS. + target = RAS.top(); + + // Record the top entry of the RAS, and its index. 
+ predict_record.usedRAS = true; + predict_record.RASIndex = RAS.topIdx(); + predict_record.RASTarget = target; + + RAS.pop(); + + DPRINTF(Fetch, "BranchPred: Instruction %#x is a return, RAS " + "predicted target: %#x, RAS index: %i.\n", + inst->readPC(), target, predict_record.RASIndex); + } else { + ++BTBLookups; + + if (inst->isCall()) { + RAS.push(PC+sizeof(MachInst)); + + // Record that it was a call so that the top RAS entry can + // be popped off if the speculation is incorrect. + predict_record.wasCall = true; + + DPRINTF(Fetch, "BranchPred: Instruction %#x was a call, " + "adding %#x to the RAS.\n", + inst->readPC(), PC+sizeof(MachInst)); + } + + if (BTB.valid(PC)) { + ++BTBHits; + + //If it's anything else, use the BTB to get the target addr. + target = BTB.lookup(PC); + + DPRINTF(Fetch, "BranchPred: Instruction %#x predicted target " + "is %#x.\n", inst->readPC(), target); + + } else { + DPRINTF(Fetch, "BranchPred: BTB doesn't have a valid entry." + "\n"); + pred_taken = false; + } + + } + } + + if (pred_taken) { + // Set the PC and the instruction's predicted target. + PC = target; + inst->setPredTarg(target); + } else { + PC = PC + sizeof(MachInst); + inst->setPredTarg(PC); + } + + predHist.push_front(predict_record); + + assert(!predHist.empty()); + + return pred_taken; +} + +template <class Impl> +void +TwobitBPredUnit<Impl>::update(const InstSeqNum &done_sn) +{ + DPRINTF(Fetch, "BranchPred: Commiting branches until sequence number " + "%i.\n", done_sn); + + while (!predHist.empty() && predHist.back().seqNum <= done_sn) { + assert(!predHist.empty()); + + // Update the branch predictor with the correct results of branches. 
+ BP.update(predHist.back().PC, predHist.back().predTaken); + + predHist.pop_back(); + } +} + +template <class Impl> +void +TwobitBPredUnit<Impl>::squash(const InstSeqNum &squashed_sn) +{ + while (!predHist.empty() && predHist.front().seqNum > squashed_sn) { + if (predHist.front().usedRAS) { + DPRINTF(Fetch, "BranchPred: Restoring top of RAS to: %i, " + "target: %#x.\n", + predHist.front().RASIndex, + predHist.front().RASTarget); + + RAS.restore(predHist.front().RASIndex, + predHist.front().RASTarget); + } else if (predHist.front().wasCall) { + DPRINTF(Fetch, "BranchPred: Removing speculative entry added " + "to the RAS.\n"); + + RAS.pop(); + } + + predHist.pop_front(); + } +} + +template <class Impl> +void +TwobitBPredUnit<Impl>::squash(const InstSeqNum &squashed_sn, + const Addr &corr_target, + const bool actually_taken) +{ + // Now that we know that a branch was mispredicted, we need to undo + // all the branches that have been seen up until this branch and + // fix up everything. + + ++condIncorrect; + + DPRINTF(Fetch, "BranchPred: Squashing from sequence number %i, " + "setting target to %#x.\n", + squashed_sn, corr_target); + + while (!predHist.empty() && predHist.front().seqNum > squashed_sn) { + + if (predHist.front().usedRAS) { + DPRINTF(Fetch, "BranchPred: Restoring top of RAS to: %i, " + "target: %#x.\n", + predHist.front().RASIndex, + predHist.front().RASTarget); + + RAS.restore(predHist.front().RASIndex, + predHist.front().RASTarget); + } else if (predHist.front().wasCall) { + DPRINTF(Fetch, "BranchPred: Removing speculative entry added " + "to the RAS.\n"); + + RAS.pop(); + } + + predHist.pop_front(); + } + + predHist.front().predTaken = actually_taken; + + if (predHist.front().usedRAS) { + ++RASIncorrect; + } + + BP.update(predHist.front().PC, actually_taken); + + BTB.update(predHist.front().PC, corr_target); +} diff --git a/cpu/beta_cpu/btb.cc b/cpu/beta_cpu/btb.cc index b49f30482..bceaa66d1 100644 --- a/cpu/beta_cpu/btb.cc +++ b/cpu/beta_cpu/btb.cc 
@@ -50,6 +50,8 @@ DefaultBTB::valid(const Addr &inst_PC) Addr inst_tag = getTag(inst_PC); + assert(btb_idx < numEntries); + if (btb[btb_idx].valid && inst_tag == btb[btb_idx].tag) { return true; } else { @@ -67,6 +69,8 @@ DefaultBTB::lookup(const Addr &inst_PC) Addr inst_tag = getTag(inst_PC); + assert(btb_idx < numEntries); + if (btb[btb_idx].valid && inst_tag == btb[btb_idx].tag) { return btb[btb_idx].target; } else { @@ -79,6 +83,8 @@ DefaultBTB::update(const Addr &inst_PC, const Addr &target) { unsigned btb_idx = getIndex(inst_PC); + assert(btb_idx < numEntries); + btb[btb_idx].valid = true; btb[btb_idx].target = target; btb[btb_idx].tag = getTag(inst_PC); diff --git a/cpu/beta_cpu/comm.hh b/cpu/beta_cpu/comm.hh index 849a6c797..e327a83b9 100644 --- a/cpu/beta_cpu/comm.hh +++ b/cpu/beta_cpu/comm.hh @@ -9,6 +9,7 @@ using namespace std; // Find better place to put this typedef. +// The impl might be the best place for this. typedef short int PhysRegIndex; template<class Impl> @@ -45,6 +46,14 @@ struct SimpleIEWSimpleCommit { int size; DynInstPtr insts[Impl::MaxWidth + 1]; + + bool squash; + bool branchMispredict; + bool branchTaken; + uint64_t mispredPC; + uint64_t nextPC; + unsigned globalHist; + InstSeqNum squashedSeqNum; }; template<class Impl> @@ -63,10 +72,15 @@ struct TimeBufStruct { bool predIncorrect; uint64_t branchAddr; + InstSeqNum doneSeqNum; + + // Might want to package this kind of branch stuff into a single + // struct as it is used pretty frequently. bool branchMispredict; bool branchTaken; uint64_t mispredPC; uint64_t nextPC; + unsigned globalHist; }; decodeComm decodeInfo; @@ -84,17 +98,10 @@ struct TimeBufStruct { renameComm renameInfo; struct iewComm { - bool squash; bool stall; // Also eventually include skid buffer space. 
unsigned freeIQEntries; - - bool branchMispredict; - bool branchTaken; - uint64_t mispredPC; - uint64_t nextPC; - InstSeqNum squashedSeqNum; }; iewComm iewInfo; @@ -108,6 +115,7 @@ struct TimeBufStruct { bool branchTaken; uint64_t mispredPC; uint64_t nextPC; + unsigned globalHist; // Think of better names here. // Will need to be a variety of sizes... diff --git a/cpu/beta_cpu/commit.hh b/cpu/beta_cpu/commit.hh index 981d9e78f..f1a185143 100644 --- a/cpu/beta_cpu/commit.hh +++ b/cpu/beta_cpu/commit.hh @@ -59,6 +59,8 @@ class SimpleCommit public: SimpleCommit(Params ¶ms); + void regStats(); + void setCPU(FullCPU *cpu_ptr); void setTimeBuffer(TimeBuffer<TimeStruct> *tb_ptr); @@ -142,6 +144,17 @@ class SimpleCommit /** Commit width, in instructions. */ unsigned commitWidth; + + Stats::Scalar<> commitCommittedInsts; + Stats::Scalar<> commitSquashedInsts; + Stats::Scalar<> commitSquashEvents; + Stats::Scalar<> commitNonSpecStalls; + Stats::Scalar<> commitCommittedBranches; + Stats::Scalar<> commitCommittedLoads; + Stats::Scalar<> commitCommittedMemRefs; + Stats::Scalar<> branchMispredicts; + + Stats::Distribution<> n_committed_dist; }; #endif // __SIMPLE_COMMIT_HH__ diff --git a/cpu/beta_cpu/commit_impl.hh b/cpu/beta_cpu/commit_impl.hh index 45b8bc7de..9a69c9259 100644 --- a/cpu/beta_cpu/commit_impl.hh +++ b/cpu/beta_cpu/commit_impl.hh @@ -23,6 +23,51 @@ SimpleCommit<Impl>::SimpleCommit(Params ¶ms) template <class Impl> void +SimpleCommit<Impl>::regStats() +{ + commitCommittedInsts + .name(name() + ".commitCommittedInsts") + .desc("The number of committed instructions") + .prereq(commitCommittedInsts); + commitSquashedInsts + .name(name() + ".commitSquashedInsts") + .desc("The number of squashed insts skipped by commit") + .prereq(commitSquashedInsts); + commitSquashEvents + .name(name() + ".commitSquashEvents") + .desc("The number of times commit is told to squash") + .prereq(commitSquashEvents); + commitNonSpecStalls + .name(name() + ".commitNonSpecStalls") + 
.desc("The number of times commit has been forced to stall to " + "communicate backwards") + .prereq(commitNonSpecStalls); + commitCommittedBranches + .name(name() + ".commitCommittedBranches") + .desc("The number of committed branches") + .prereq(commitCommittedBranches); + commitCommittedLoads + .name(name() + ".commitCommittedLoads") + .desc("The number of committed loads") + .prereq(commitCommittedLoads); + commitCommittedMemRefs + .name(name() + ".commitCommittedMemRefs") + .desc("The number of committed memory references") + .prereq(commitCommittedMemRefs); + branchMispredicts + .name(name() + ".branchMispredicts") + .desc("The number of times a branch was mispredicted") + .prereq(branchMispredicts); + n_committed_dist + .init(0,commitWidth,1) + .name(name() + ".COM:committed_per_cycle") + .desc("Number of insts commited each cycle") + .flags(Stats::pdf) + ; +} + +template <class Impl> +void SimpleCommit<Impl>::setCPU(FullCPU *cpu_ptr) { DPRINTF(Commit, "Commit: Setting CPU pointer.\n"); @@ -143,12 +188,12 @@ SimpleCommit<Impl>::commit() // Should I also check if the commit stage is telling the ROB to squah? // This might be necessary to keep the same timing between the IQ and // the ROB... - if (robInfoFromIEW->iewInfo.squash) { + if (fromIEW->squash) { DPRINTF(Commit, "Commit: Squashing instructions in the ROB.\n"); _status = ROBSquashing; - InstSeqNum squashed_inst = robInfoFromIEW->iewInfo.squashedSeqNum; + InstSeqNum squashed_inst = fromIEW->squashedSeqNum; rob->squash(squashed_inst); @@ -162,15 +207,19 @@ SimpleCommit<Impl>::commit() // ROB is in the process of squashing. 
toIEW->commitInfo.robSquashing = true; - toIEW->commitInfo.branchMispredict = - robInfoFromIEW->iewInfo.branchMispredict; + toIEW->commitInfo.branchMispredict = fromIEW->branchMispredict; + + toIEW->commitInfo.branchTaken = fromIEW->branchTaken; + + toIEW->commitInfo.nextPC = fromIEW->nextPC; - toIEW->commitInfo.branchTaken = - robInfoFromIEW->iewInfo.branchTaken; + toIEW->commitInfo.mispredPC = fromIEW->mispredPC; - toIEW->commitInfo.nextPC = robInfoFromIEW->iewInfo.nextPC; + toIEW->commitInfo.globalHist = fromIEW->globalHist; - toIEW->commitInfo.mispredPC = robInfoFromIEW->iewInfo.mispredPC; + if (toIEW->commitInfo.branchMispredict) { + ++branchMispredicts; + } } if (_status != ROBSquashing) { @@ -237,6 +286,8 @@ SimpleCommit<Impl>::commitInsts() // inst in the ROB without affecting any other stages. rob->retireHead(); + ++commitSquashedInsts; + } else { // Increment the total number of non-speculative instructions // executed. @@ -249,7 +300,7 @@ SimpleCommit<Impl>::commitInsts() bool commit_success = commitHead(head_inst, num_committed); // Update what instruction we are looking at if the commit worked. - if(commit_success) { + if (commit_success) { ++num_committed; // Send back which instruction has been committed. @@ -258,7 +309,11 @@ SimpleCommit<Impl>::commitInsts() // sequence number instead (copy). toIEW->commitInfo.doneSeqNum = head_inst->seqNum; - cpu->instDone(); + ++commitCommittedInsts; + + if (!head_inst->isNop()) { + cpu->instDone(); + } } else { break; } @@ -267,6 +322,8 @@ SimpleCommit<Impl>::commitInsts() // Update the pointer to read the next instruction in the ROB. 
head_inst = rob->readHeadInst(); } + + n_committed_dist.sample(num_committed); } template <class Impl> @@ -276,18 +333,13 @@ SimpleCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num) // Make sure instruction is valid assert(head_inst); - Fault fault = No_Fault; - - // If the head instruction is a store or a load, then execute it - // because this simple model does no speculative memory access. - // Hopefully this covers all memory references. - // Also check if it's nonspeculative. Or a nop. Then it will be - // executed only when it reaches the head of the ROB. Actually - // executing a nop is a bit overkill... + // If the instruction is not executed yet, then it is a non-speculative + // or store inst. Signal backwards that it should be executed. if (!head_inst->isExecuted()) { // Keep this number correct. We have not yet actually executed // and committed this instruction. cpu->funcExeInst--; + if (head_inst->isStore() || head_inst->isNonSpeculative()) { DPRINTF(Commit, "Commit: Encountered a store or non-speculative " "instruction at the head of the ROB, PC %#x.\n", @@ -299,6 +351,8 @@ SimpleCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num) // it is executed. head_inst->clearCanCommit(); + ++commitNonSpecStalls; + return false; } else { panic("Commit: Trying to commit un-executed instruction " @@ -306,19 +360,6 @@ SimpleCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num) } } - // Check if memory access was successful. - if (fault != No_Fault) { - // Handle data cache miss here. In the future, set the status - // to data cache miss, then exit the stage. Have an event - // that handles commiting the head instruction, then setting - // the stage back to running, when the event is run. 
(just - // make sure that event is commit's run for that cycle) - panic("Commit: Load/store instruction failed, not sure what " - "to do.\n"); - // Also will want to clear the instruction's fault after being - // handled here so it's not handled again below. - } - // Now check if it's one of the special trap or barrier or // serializing instructions. if (head_inst->isThreadSync() || @@ -335,39 +376,43 @@ SimpleCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num) // Check if the instruction caused a fault. If so, trap. if (head_inst->getFault() != No_Fault) { + if (!head_inst->isNop()) { #ifdef FULL_SYSTEM - cpu->trap(fault); + cpu->trap(fault); #else // !FULL_SYSTEM - if (!head_inst->isNop()) { panic("fault (%d) detected @ PC %08p", head_inst->getFault(), head_inst->PC); - } #endif // FULL_SYSTEM + } } // Check if we're really ready to commit. If not then return false. // I'm pretty sure all instructions should be able to commit if they've // reached this far. For now leave this in as a check. if(!rob->isHeadReady()) { - DPRINTF(Commit, "Commit: Unable to commit head instruction!\n"); + panic("Commit: Unable to commit head instruction!\n"); return false; } // If it's a branch, then send back branch prediction update info // to the fetch stage. // This should be handled in the iew stage if a mispredict happens... -#if 0 + if (head_inst->isControl()) { +#if 0 toIEW->nextPC = head_inst->readPC(); //Maybe switch over to BTB incorrect. toIEW->btbMissed = head_inst->btbMiss(); toIEW->target = head_inst->nextPC; //Maybe also include global history information. //This simple version will have no branch prediction however. - } #endif + ++commitCommittedBranches; + } + + #if 0 // Check if the instruction has a destination register. 
// If so add the previous physical register of its logical register's @@ -383,8 +428,12 @@ SimpleCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num) // the LDSTQ will already have been told that a store has reached the head // of the ROB. Consider including communication if it's a store as well // to keep things orthagonal. - if (head_inst->isLoad()) { - toIEW->commitInfo.commitIsLoad = true; + if (head_inst->isMemRef()) { + ++commitCommittedMemRefs; + if (head_inst->isLoad()) { + toIEW->commitInfo.commitIsLoad = true; + ++commitCommittedLoads; + } } // Now that the instruction is going to be committed, finalize its diff --git a/cpu/beta_cpu/cpu_policy.hh b/cpu/beta_cpu/cpu_policy.hh index ec8460b77..1479eb191 100644 --- a/cpu/beta_cpu/cpu_policy.hh +++ b/cpu/beta_cpu/cpu_policy.hh @@ -22,7 +22,7 @@ template<class Impl> struct SimpleCPUPolicy { - typedef DefaultBPredUnit<Impl> BPredUnit; + typedef TwobitBPredUnit<Impl> BPredUnit; typedef PhysRegFile<Impl> RegFile; typedef SimpleFreeList FreeList; typedef SimpleRenameMap RenameMap; diff --git a/cpu/beta_cpu/decode.hh b/cpu/beta_cpu/decode.hh index be88a4b36..64e87290e 100644 --- a/cpu/beta_cpu/decode.hh +++ b/cpu/beta_cpu/decode.hh @@ -49,6 +49,8 @@ class SimpleDecode public: SimpleDecode(Params ¶ms); + void regStats(); + void setCPU(FullCPU *cpu_ptr); void setTimeBuffer(TimeBuffer<TimeStruct> *tb_ptr); @@ -128,6 +130,15 @@ class SimpleDecode * group of instructions, it can restart at the proper instruction. 
*/ unsigned numInst; + + Stats::Scalar<> decodeIdleCycles; + Stats::Scalar<> decodeBlockedCycles; + Stats::Scalar<> decodeUnblockCycles; + Stats::Scalar<> decodeSquashCycles; + Stats::Scalar<> decodeBranchMispred; + Stats::Scalar<> decodeControlMispred; + Stats::Scalar<> decodeDecodedInsts; + Stats::Scalar<> decodeSquashedInsts; }; #endif // __SIMPLE_DECODE_HH__ diff --git a/cpu/beta_cpu/decode_impl.hh b/cpu/beta_cpu/decode_impl.hh index d0f46eaa5..8b20bf8bc 100644 --- a/cpu/beta_cpu/decode_impl.hh +++ b/cpu/beta_cpu/decode_impl.hh @@ -16,6 +16,45 @@ SimpleDecode<Impl>::SimpleDecode(Params ¶ms) _status = Idle; } +template <class Impl> +void +SimpleDecode<Impl>::regStats() +{ + decodeIdleCycles + .name(name() + ".decodeIdleCycles") + .desc("Number of cycles decode is idle") + .prereq(decodeIdleCycles); + decodeBlockedCycles + .name(name() + ".decodeBlockedCycles") + .desc("Number of cycles decode is blocked") + .prereq(decodeBlockedCycles); + decodeUnblockCycles + .name(name() + ".decodeUnblockCycles") + .desc("Number of cycles decode is unblocking") + .prereq(decodeUnblockCycles); + decodeSquashCycles + .name(name() + ".decodeSquashCycles") + .desc("Number of cycles decode is squashing") + .prereq(decodeSquashCycles); + decodeBranchMispred + .name(name() + ".decodeBranchMispred") + .desc("Number of times decode detected a branch misprediction") + .prereq(decodeBranchMispred); + decodeControlMispred + .name(name() + ".decodeControlMispred") + .desc("Number of times decode detected an instruction incorrectly" + " predicted as a control") + .prereq(decodeControlMispred); + decodeDecodedInsts + .name(name() + ".decodeDecodedInsts") + .desc("Number of instructions handled by decode") + .prereq(decodeDecodedInsts); + decodeSquashedInsts + .name(name() + ".decodeSquashedInsts") + .desc("Number of squashed instructions handled by decode") + .prereq(decodeSquashedInsts); +} + template<class Impl> void SimpleDecode<Impl>::setCPU(FullCPU *cpu_ptr) @@ -91,7 +130,7 @@ 
SimpleDecode<Impl>::unblock() // If there's still information in the skid buffer, then // continue to tell previous stages to stall. They will be - // able to restart once the skid buffer is empty. + // able to restart once the skid buffer is empty. if (!skidBuffer.empty()) { toFetch->decodeInfo.stall = true; } else { @@ -110,9 +149,12 @@ SimpleDecode<Impl>::squash(DynInstPtr &inst) "detected at decode.\n"); Addr new_PC = inst->nextPC; + toFetch->decodeInfo.branchMispredict = true; + toFetch->decodeInfo.doneSeqNum = inst->seqNum; toFetch->decodeInfo.predIncorrect = true; toFetch->decodeInfo.squash = true; toFetch->decodeInfo.nextPC = new_PC; + toFetch->decodeInfo.branchTaken = true; // Set status to squashing. _status = Squashing; @@ -164,6 +206,8 @@ SimpleDecode<Impl>::tick() // buffer were used. Remove those instructions and handle // the rest of unblocking. if (_status == Unblocking) { + ++decodeUnblockCycles; + if (fromFetch->size > 0) { // Add the current inputs to the skid buffer so they can be // reprocessed when this stage unblocks. @@ -173,6 +217,8 @@ SimpleDecode<Impl>::tick() unblock(); } } else if (_status == Blocked) { + ++decodeBlockedCycles; + if (fromFetch->size > 0) { block(); } @@ -197,6 +243,8 @@ SimpleDecode<Impl>::tick() squash(); } } else if (_status == Squashing) { + ++decodeSquashCycles; + if (!fromCommit->commitInfo.squash && !fromCommit->commitInfo.robSquashing) { _status = Running; @@ -228,17 +276,16 @@ SimpleDecode<Impl>::decode() // Check fetch queue to see if instructions are available. // If no available instructions, do nothing, unless this stage is // currently unblocking. - if (!fromFetch->insts[0] && _status != Unblocking) { + if (fromFetch->size == 0 && _status != Unblocking) { DPRINTF(Decode, "Decode: Nothing to do, breaking out early.\n"); // Should I change the status to idle? + ++decodeIdleCycles; return; } + // Might be better to use a base DynInst * instead? 
DynInstPtr inst; - // Instead have a class member variable that records which instruction - // was the last one that was ended on. At the tick() stage, it can - // check if that's equal to 0. If not, then don't pop stuff off. unsigned to_rename_index = 0; int insts_available = _status == Unblocking ? @@ -264,18 +311,10 @@ SimpleDecode<Impl>::decode() } #endif - // Check to make sure that instructions coming from fetch are valid. - // Normally at this stage the branch target of PC-relative branches - // should be computed here. However in this simple model all - // computation will take place at execute. Hence doneTargCalc() - // will always be false. while (insts_available > 0) { DPRINTF(Decode, "Decode: Sending instruction to rename.\n"); - // Might create some sort of accessor to get an instruction - // on a per thread basis. Or might be faster to just get - // a pointer to an array or list of instructions and use that - // within this code. + inst = _status == Unblocking ? skidBuffer.front().insts[numInst] : fromFetch->insts[numInst]; @@ -287,6 +326,8 @@ SimpleDecode<Impl>::decode() "squashed, skipping.\n", inst->seqNum, inst->readPC()); + ++decodeSquashedInsts; + ++numInst; --insts_available; @@ -305,16 +346,22 @@ SimpleDecode<Impl>::decode() if (inst->predTaken() && !inst->isControl()) { panic("Instruction predicted as a branch!"); + ++decodeControlMispred; // Might want to set some sort of boolean and just do // a check at the end squash(inst); break; } - // Ensure that the predicted branch target is the actual branch - // target if possible (branches that are PC relative). - if (inst->isControl() && inst->doneTargCalc()) { + // Go ahead and compute any PC-relative branches. 
+ + if (inst->isDirectCtrl() && inst->isUncondCtrl() && + inst->numDestRegs() == 0 && inst->numSrcRegs() == 0) { + inst->execute(); + inst->setExecuted(); + if (inst->mispredicted()) { + ++decodeBranchMispred; // Might want to set some sort of boolean and just do // a check at the end squash(inst); @@ -322,6 +369,11 @@ SimpleDecode<Impl>::decode() } } + // Normally can check if a direct branch has the right target + // addr (either the immediate, or the branch PC + 4) and redirect + // fetch if it's incorrect. + + // Also check if instructions have no source registers. Mark // them as ready to issue at any time. Not sure if this check // should exist here or at a later stage; however it doesn't matter @@ -334,6 +386,7 @@ SimpleDecode<Impl>::decode() // Increment which instruction we're looking at. ++numInst; ++to_rename_index; + ++decodeDecodedInsts; --insts_available; } diff --git a/cpu/beta_cpu/fetch.hh b/cpu/beta_cpu/fetch.hh index e59a9df7f..4cfc2f167 100644 --- a/cpu/beta_cpu/fetch.hh +++ b/cpu/beta_cpu/fetch.hh @@ -14,6 +14,7 @@ #include "sim/eventq.hh" #include "cpu/pc_event.hh" #include "mem/mem_interface.hh" +#include "base/statistics.hh" /** * SimpleFetch class to fetch a single instruction each cycle. SimpleFetch @@ -59,6 +60,8 @@ class SimpleFetch /** SimpleFetch constructor. */ SimpleFetch(Params ¶ms); + void regStats(); + void setCPU(FullCPU *cpu_ptr); void setTimeBuffer(TimeBuffer<TimeStruct> *time_buffer); @@ -73,9 +76,13 @@ class SimpleFetch // private: // Figure out PC vs next PC and how it should be updated - void squash(Addr newPC); + void squash(const Addr &new_PC); private: + inline void doSquash(const Addr &new_PC); + + void squashFromDecode(const Addr &new_PC, const InstSeqNum &seq_num); + /** * Looks up in the branch predictor to see if the next PC should be * either next PC+=MachInst or a branch target. @@ -84,7 +91,27 @@ class SimpleFetch * the next PC will be. * @return Whether or not a branch was predicted as taken. 
*/ - bool lookupAndUpdateNextPC(Addr &next_PC); + bool lookupAndUpdateNextPC(DynInstPtr &inst, Addr &next_PC); + + // Might not want this function... +// inline void recordGlobalHist(DynInstPtr &inst); + + /** + * Fetches the cache line that contains fetch_PC. Returns any + * fault that happened. Puts the data into the class variable + * cacheData. + * @params fetch_PC The PC address that is being fetched from. + * @return Any fault that occured. + */ + Fault fetchCacheLine(Addr fetch_PC); + + // Align an address (typically a PC) to the start of an I-cache block. + // We fold in the PISA 64- to 32-bit conversion here as well. + Addr icacheBlockAlignPC(Addr addr) + { + addr = ISA::realPCToFetchPC(addr); + return (addr & ~(cacheBlkMask)); + } public: class CacheCompletionEvent : public Event @@ -99,7 +126,7 @@ class SimpleFetch virtual const char *description(); }; - CacheCompletionEvent cacheCompletionEvent; +// CacheCompletionEvent cacheCompletionEvent; private: /** Pointer to the FullCPU. */ @@ -152,20 +179,32 @@ class SimpleFetch unsigned fetchWidth; /** Cache block size. */ - int blkSize; + int cacheBlkSize; /** Mask to get a cache block's address. */ - Addr cacheBlockMask; + Addr cacheBlkMask; /** The instruction being fetched. */ - MachInst inst; +// MachInst inst; + + /** The cache line being fetched. */ + uint8_t *cacheData; /** Size of instructions. */ int instSize; /** Icache stall statistics. 
*/ -// Stats::Scalar<> icacheStallCycles; -// Counter lastIcacheStall; + Counter lastIcacheStall; + + Stats::Scalar<> icacheStallCycles; + Stats::Scalar<> fetchedInsts; + Stats::Scalar<> predictedBranches; + Stats::Scalar<> fetchCycles; + Stats::Scalar<> fetchSquashCycles; + Stats::Scalar<> fetchBlockedCycles; + Stats::Scalar<> fetchedCacheLines; + + Stats::Distribution<> fetch_nisn_dist; }; #endif //__SIMPLE_FETCH_HH__ diff --git a/cpu/beta_cpu/fetch_impl.hh b/cpu/beta_cpu/fetch_impl.hh index 93f7bf6d2..8c9cf9f41 100644 --- a/cpu/beta_cpu/fetch_impl.hh +++ b/cpu/beta_cpu/fetch_impl.hh @@ -1,10 +1,8 @@ -// Todo: Add in branch prediction. With probe path, should -// be able to specify -// size of data to fetch. Will be able to get full cache line. - -// Remove this later. +// Remove this later; used only for debugging. #define OPCODE(X) (X >> 26) & 0x3f + +#include "arch/alpha/byte_swap.hh" #include "cpu/exetrace.hh" #include "mem/base_mem.hh" #include "mem/mem_interface.hh" @@ -37,15 +35,14 @@ SimpleFetch<Impl>::CacheCompletionEvent::description() template<class Impl> SimpleFetch<Impl>::SimpleFetch(Params ¶ms) - : cacheCompletionEvent(this), + : //cacheCompletionEvent(this), icacheInterface(params.icacheInterface), branchPred(params), decodeToFetchDelay(params.decodeToFetchDelay), renameToFetchDelay(params.renameToFetchDelay), iewToFetchDelay(params.iewToFetchDelay), commitToFetchDelay(params.commitToFetchDelay), - fetchWidth(params.fetchWidth), - inst(0) + fetchWidth(params.fetchWidth) { // Set status to idle. _status = Idle; @@ -62,13 +59,63 @@ SimpleFetch<Impl>::SimpleFetch(Params ¶ms) memReq->data = new uint8_t[64]; // Size of cache block. - blkSize = icacheInterface ? icacheInterface->getBlockSize() : 64; + cacheBlkSize = icacheInterface ? icacheInterface->getBlockSize() : 64; // Create mask to get rid of offset bits. - cacheBlockMask = (blkSize - 1); + cacheBlkMask = (cacheBlkSize - 1); // Get the size of an instruction. 
instSize = sizeof(MachInst); + + // Create space to store a cache line. + cacheData = new uint8_t[cacheBlkSize]; +} + +template <class Impl> +void +SimpleFetch<Impl>::regStats() +{ + icacheStallCycles + .name(name() + ".icacheStallCycles") + .desc("Number of cycles fetch is stalled on an Icache miss") + .prereq(icacheStallCycles); + + fetchedInsts + .name(name() + ".fetchedInsts") + .desc("Number of instructions fetch has processed") + .prereq(fetchedInsts); + predictedBranches + .name(name() + ".predictedBranches") + .desc("Number of branches that fetch has predicted taken") + .prereq(predictedBranches); + fetchCycles + .name(name() + ".fetchCycles") + .desc("Number of cycles fetch has run and was not squashing or" + " blocked") + .prereq(fetchCycles); + fetchSquashCycles + .name(name() + ".fetchSquashCycles") + .desc("Number of cycles fetch has spent squashing") + .prereq(fetchSquashCycles); + fetchBlockedCycles + .name(name() + ".fetchBlockedCycles") + .desc("Number of cycles fetch has spent blocked") + .prereq(fetchBlockedCycles); + fetchedCacheLines + .name(name() + ".fetchedCacheLines") + .desc("Number of cache lines fetched") + .prereq(fetchedCacheLines); + + fetch_nisn_dist + .init(/* base value */ 0, + /* last value */ fetchWidth, + /* bucket size */ 1) + .name(name() + ".FETCH:rate_dist") + .desc("Number of instructions fetched each cycle (Total)") + .flags(Stats::pdf) + ; + + branchPred.regStats(); } template<class Impl> @@ -122,19 +169,40 @@ SimpleFetch<Impl>::processCacheCompletion() _status = IcacheMissComplete; } -template<class Impl> +#if 0 +template <class Impl> +inline void +SimpleFetch<Impl>::recordGlobalHist(DynInstPtr &inst) +{ + inst->setGlobalHist(branchPred.BPReadGlobalHist()); +} +#endif + +template <class Impl> bool -SimpleFetch<Impl>::lookupAndUpdateNextPC(Addr &next_PC) +SimpleFetch<Impl>::lookupAndUpdateNextPC(DynInstPtr &inst, Addr &next_PC) { -#if 1 // Do branch prediction check here. 
- bool predict_taken = branchPred.BPLookup(next_PC); - Addr predict_target; + // A bit of a misnomer...next_PC is actually the current PC until + // this function updates it. + bool predict_taken; + + if (!inst->isControl()) { + next_PC = next_PC + instSize; + inst->setPredTarg(next_PC); + return false; + } + + predict_taken = branchPred.predict(inst, next_PC); + +#if 0 + predict_taken = branchPred.BPLookup(next_PC) DPRINTF(Fetch, "Fetch: Branch predictor predicts taken? %i\n", predict_taken); - if (branchPred.BTBValid(next_PC)) { + // Only check the BTB if the BP has predicted taken. + if (predict_taken && branchPred.BTBValid(next_PC)) { predict_target = branchPred.BTBLookup(next_PC); DPRINTF(Fetch, "Fetch: BTB target is %#x.\n", predict_target); } else { @@ -142,42 +210,135 @@ SimpleFetch<Impl>::lookupAndUpdateNextPC(Addr &next_PC) DPRINTF(Fetch, "Fetch: BTB does not have a valid entry.\n"); } - // Now update the PC to fetch the next instruction in the cache - // line. - if (!predict_taken) { - next_PC = next_PC + instSize; - return false; - } else { - next_PC = predict_target; - return true; - } #endif + if (predict_taken) { + ++predictedBranches; + } -#if 0 - next_PC = next_PC + instSize; - return false; -#endif + return predict_taken; } -template<class Impl> -void -SimpleFetch<Impl>::squash(Addr new_PC) +template <class Impl> +Fault +SimpleFetch<Impl>::fetchCacheLine(Addr fetch_PC) +{ + // Check if the instruction exists within the cache. + // If it does, then proceed on to read the instruction and the rest + // of the instructions in the cache line until either the end of the + // cache line or a predicted taken branch is encountered. + +#ifdef FULL_SYSTEM + // Flag to say whether or not address is physical addr. + unsigned flags = cpu->inPalMode() ? PHYSICAL : 0; +#else + unsigned flags = 0; +#endif // FULL_SYSTEM + + Fault fault = No_Fault; + + // Align the fetch PC so it's at the start of a cache block. 
+ fetch_PC = icacheBlockAlignPC(fetch_PC); + + // Setup the memReq to do a read of the first isntruction's address. + // Set the appropriate read size and flags as well. + memReq->cmd = Read; + memReq->reset(fetch_PC, cacheBlkSize, flags); + + // Translate the instruction request. + // Should this function be + // in the CPU class ? Probably...ITB/DTB should exist within the + // CPU. + + fault = cpu->translateInstReq(memReq); + + // In the case of faults, the fetch stage may need to stall and wait + // on what caused the fetch (ITB or Icache miss). + + // If translation was successful, attempt to read the first + // instruction. + if (fault == No_Fault) { + DPRINTF(Fetch, "Fetch: Doing instruction read.\n"); + fault = cpu->mem->read(memReq, cacheData); + // This read may change when the mem interface changes. + + fetchedCacheLines++; + } + + // Now do the timing access to see whether or not the instruction + // exists within the cache. + if (icacheInterface && fault == No_Fault) { + DPRINTF(Fetch, "Fetch: Doing timing memory access.\n"); + memReq->completionEvent = NULL; + + memReq->time = curTick; + + MemAccessResult result = icacheInterface->access(memReq); + + // If the cache missed (in this model functional and timing + // memories are different), then schedule an event to wake + // up this stage once the cache miss completes. + if (result != MA_HIT && icacheInterface->doEvents()) { + memReq->completionEvent = new CacheCompletionEvent(this); +// lastIcacheStall = curTick; + + // How does current model work as far as individual + // stages scheduling/unscheduling? + // Perhaps have only the main CPU scheduled/unscheduled, + // and have it choose what stages to run appropriately. 
+ + DPRINTF(Fetch, "Fetch: Stalling due to icache miss.\n"); + _status = IcacheMissStall; + } + } + + return fault; +} + +template <class Impl> +inline void +SimpleFetch<Impl>::doSquash(const Addr &new_PC) { DPRINTF(Fetch, "Fetch: Squashing, setting PC to: %#x.\n", new_PC); cpu->setNextPC(new_PC + instSize); cpu->setPC(new_PC); - _status = Squashing; - // Clear the icache miss if it's outstanding. if (_status == IcacheMissStall && icacheInterface) { + DPRINTF(Fetch, "Fetch: Squashing outstanding Icache miss.\n"); // @todo: Use an actual thread number here. icacheInterface->squash(0); } - // Tell the CPU to remove any instructions that aren't currently - // in the ROB (instructions in flight that were killed). + _status = Squashing; + + ++fetchSquashCycles; +} + +template<class Impl> +void +SimpleFetch<Impl>::squashFromDecode(const Addr &new_PC, + const InstSeqNum &seq_num) +{ + DPRINTF(Fetch, "Fetch: Squashing from decode.\n"); + + doSquash(new_PC); + + // Tell the CPU to remove any instructions that are in flight between + // fetch and decode. + cpu->removeInstsUntil(seq_num); + +} + +template <class Impl> +void +SimpleFetch<Impl>::squash(const Addr &new_PC) +{ + DPRINTF(Fetch, "Fetch: Squash from commit.\n"); + + doSquash(new_PC); + + // Tell the CPU to remove any instructions that are not in the ROB. cpu->removeInstsNotInROB(); } @@ -185,7 +346,6 @@ template<class Impl> void SimpleFetch<Impl>::tick() { -#if 1 // Check squash signals from commit. if (fromCommit->commitInfo.squash) { DPRINTF(Fetch, "Fetch: Squashing instructions due to squash " @@ -196,13 +356,18 @@ SimpleFetch<Impl>::tick() // Also check if there's a mispredict that happened. 
if (fromCommit->commitInfo.branchMispredict) { - branchPred.BPUpdate(fromCommit->commitInfo.mispredPC, - fromCommit->commitInfo.branchTaken); - branchPred.BTBUpdate(fromCommit->commitInfo.mispredPC, - fromCommit->commitInfo.nextPC); + branchPred.squash(fromCommit->commitInfo.doneSeqNum, + fromCommit->commitInfo.nextPC, + fromCommit->commitInfo.branchTaken); + } else { + branchPred.squash(fromCommit->commitInfo.doneSeqNum); } return; + } else if (fromCommit->commitInfo.doneSeqNum) { + // Update the branch predictor if it wasn't a squashed instruction + // that was braodcasted. + branchPred.update(fromCommit->commitInfo.doneSeqNum); } // Check ROB squash signals from commit. @@ -211,6 +376,8 @@ SimpleFetch<Impl>::tick() // Continue to squash. _status = Squashing; + + ++fetchSquashCycles; return; } @@ -220,22 +387,22 @@ SimpleFetch<Impl>::tick() "from decode.\n"); // Update the branch predictor. - if (fromCommit->decodeInfo.branchMispredict) { - branchPred.BPUpdate(fromDecode->decodeInfo.mispredPC, - fromDecode->decodeInfo.branchTaken); - branchPred.BTBUpdate(fromDecode->decodeInfo.mispredPC, - fromDecode->decodeInfo.nextPC); + if (fromDecode->decodeInfo.branchMispredict) { + branchPred.squash(fromDecode->decodeInfo.doneSeqNum, + fromDecode->decodeInfo.nextPC, + fromDecode->decodeInfo.branchTaken); + } else { + branchPred.squash(fromDecode->decodeInfo.doneSeqNum); } if (_status != Squashing) { // Squash unless we're already squashing? - squash(fromDecode->decodeInfo.nextPC); + squashFromDecode(fromDecode->decodeInfo.nextPC, + fromDecode->decodeInfo.doneSeqNum); return; } } - - // Check if any of the stall signals are high. if (fromDecode->decodeInfo.stall || fromRename->renameInfo.stall || @@ -253,12 +420,15 @@ SimpleFetch<Impl>::tick() fromCommit->commitInfo.stall); _status = Blocked; + + ++fetchBlockedCycles; return; } else if (_status == Blocked) { // Unblock stage if status is currently blocked and none of the // stall signals are being held high. 
_status = Running; + ++fetchBlockedCycles; return; } @@ -273,74 +443,15 @@ SimpleFetch<Impl>::tick() // Switch status to running _status = Running; + + ++fetchSquashCycles; } else if (_status != IcacheMissStall) { DPRINTF(Fetch, "Fetch: Running stage.\n"); - fetch(); - } -#endif - -#if 0 - if (_status != Blocked && - _status != Squashing && - _status != IcacheMissStall) { - DPRINTF(Fetch, "Fetch: Running stage.\n"); + ++fetchCycles; fetch(); - } else if (_status == Blocked) { - // If still being told to stall, do nothing. - if (fromDecode->decodeInfo.stall || - fromRename->renameInfo.stall || - fromIEW->iewInfo.stall || - fromCommit->commitInfo.stall) - { - DPRINTF(Fetch, "Fetch: Stalling stage.\n"); - DPRINTF(Fetch, "Fetch: Statuses: Decode: %i Rename: %i IEW: %i " - "Commit: %i\n", - fromDecode->decodeInfo.stall, - fromRename->renameInfo.stall, - fromIEW->iewInfo.stall, - fromCommit->commitInfo.stall); - } else { - - DPRINTF(Fetch, "Fetch: Done blocking.\n"); - _status = Running; - } - - if (fromCommit->commitInfo.squash) { - DPRINTF(Fetch, "Fetch: Squashing instructions due to squash " - "from commit.\n"); - squash(fromCommit->commitInfo.nextPC); - return; - } else if (fromDecode->decodeInfo.squash) { - DPRINTF(Fetch, "Fetch: Squashing instructions due to squash " - "from decode.\n"); - squash(fromDecode->decodeInfo.nextPC); - return; - } else if (fromCommit->commitInfo.robSquashing) { - DPRINTF(Fetch, "Fetch: ROB is still squashing.\n"); - _status = Squashing; - return; - } - } else if (_status == Squashing) { - // If there are no squash signals then change back to running. - // Note that when a squash starts happening, commitInfo.squash will - // be high. But if the squash is still in progress, then only - // commitInfo.robSquashing will be high. 
- if (!fromCommit->commitInfo.squash && - !fromCommit->commitInfo.robSquashing) { - - DPRINTF(Fetch, "Fetch: Done squashing.\n"); - _status = Running; - } else if (fromCommit->commitInfo.squash) { - // If there's a new squash, then start squashing again. - squash(fromCommit->commitInfo.nextPC); - } else { - // Purely a debugging statement. - DPRINTF(Fetch, "Fetch: ROB still squashing.\n"); - } } -#endif } template<class Impl> @@ -351,13 +462,6 @@ SimpleFetch<Impl>::fetch() // Start actual fetch ////////////////////////////////////////// -#ifdef FULL_SYSTEM - // Flag to say whether or not address is physical addr. - unsigned flags = cpu->inPalMode() ? PHYSICAL : 0; -#else - unsigned flags = 0; -#endif // FULL_SYSTEM - // The current PC. Addr fetch_PC = cpu->readPC(); @@ -379,64 +483,14 @@ SimpleFetch<Impl>::fetch() "instruction, starting at PC %08p.\n", fetch_PC); - // Otherwise check if the instruction exists within the cache. - // If it does, then proceed on to read the instruction and the rest - // of the instructions in the cache line until either the end of the - // cache line or a predicted taken branch is encountered. - // Note that this simply checks if the first instruction exists - // within the cache, assuming the rest of the cache line also exists - // within the cache. - - // Setup the memReq to do a read of the first isntruction's address. - // Set the appropriate read size and flags as well. - memReq->cmd = Read; - memReq->reset(fetch_PC, instSize, flags); - - // Translate the instruction request. - // Should this function be - // in the CPU class ? Probably...ITB/DTB should exist within the - // CPU. - - fault = cpu->translateInstReq(memReq); - - // In the case of faults, the fetch stage may need to stall and wait - // on what caused the fetch (ITB or Icache miss). - - // If translation was successful, attempt to read the first - // instruction. 
- if (fault == No_Fault) { - DPRINTF(Fetch, "Fetch: Doing instruction read.\n"); - fault = cpu->mem->read(memReq, inst); - // This read may change when the mem interface changes. - } - - // Now do the timing access to see whether or not the instruction - // exists within the cache. - if (icacheInterface && fault == No_Fault) { - DPRINTF(Fetch, "Fetch: Doing timing memory access.\n"); - memReq->completionEvent = NULL; - - memReq->time = curTick; - - MemAccessResult result = icacheInterface->access(memReq); - - // If the cache missed (in this model functional and timing - // memories are different), then schedule an event to wake - // up this stage once the cache miss completes. - if (result != MA_HIT && icacheInterface->doEvents()) { - memReq->completionEvent = &cacheCompletionEvent; -// lastIcacheStall = curTick; - - // How does current model work as far as individual - // stages scheduling/unscheduling? - // Perhaps have only the main CPU scheduled/unscheduled, - // and have it choose what stages to run appropriately. + fault = fetchCacheLine(fetch_PC); + } - DPRINTF(Fetch, "Fetch: Stalling due to icache miss.\n"); - _status = IcacheMissStall; - return; - } - } + // If we had a stall due to an icache miss, then return. It'd + // be nicer if this were handled through the kind of fault that + // is returned by the function. + if (_status == IcacheMissStall) { + return; } // As far as timing goes, the CPU will need to send an event through @@ -446,11 +500,15 @@ SimpleFetch<Impl>::fetch() Addr next_PC = fetch_PC; InstSeqNum inst_seq; + MachInst inst; + unsigned offset = fetch_PC & cacheBlkMask; + unsigned fetched; - // If the read of the first instruction was successful, then grab the - // instructions from the rest of the cache line and put them into the - // queue heading to decode. 
if (fault == No_Fault) { + // If the read of the first instruction was successful, then grab the + // instructions from the rest of the cache line and put them into the + // queue heading to decode. + DPRINTF(Fetch, "Fetch: Adding instructions to queue to decode.\n"); ////////////////////////// @@ -461,124 +519,59 @@ SimpleFetch<Impl>::fetch() // ended this fetch block. bool predicted_branch = false; - // Might want to keep track of various stats. -// numLinesFetched++; - - // Get a sequence number. - inst_seq = cpu->getAndIncrementInstSeq(); - - // Update the next PC; it either is PC+sizeof(MachInst), or - // branch_target. Check whether or not a branch was taken. - predicted_branch = lookupAndUpdateNextPC(next_PC); - - // Because the first instruction was already fetched, create the - // DynInst and put it into the queue to decode. - DynInstPtr instruction = new DynInst(inst, fetch_PC, next_PC, - inst_seq, cpu); - - DPRINTF(Fetch, "Fetch: Instruction %i created, with PC %#x\n", - inst_seq, instruction->readPC()); - DPRINTF(Fetch, "Fetch: Instruction opcode is: %03p\n", - OPCODE(inst)); - - instruction->traceData = - Trace::getInstRecord(curTick, cpu->xcBase(), cpu, - instruction->staticInst, - instruction->readPC(), 0); - - cpu->addInst(instruction); - - // Write the instruction to the first slot in the queue - // that heads to decode. - toDecode->insts[0] = instruction; - - toDecode->size++; - - fetch_PC = next_PC; - - ////////////////////////// - // Fetch other instructions - ////////////////////////// - - // Obtain the index into the cache line by getting only the low - // order bits. Will need to do shifting as well. - int line_index = fetch_PC & cacheBlockMask; - - // Take instructions and put them into the queue heading to decode. - // Then read the next instruction in the cache line. Continue - // until either all of the fetch bandwidth is used (not an issue for - // non-SMT), or the end of the cache line is reached. 
Note that - // this assumes standard cachelines, and not something like a trace - // cache where lines might not end at cache-line size aligned - // addresses. - // @todo: Fix the horrible amount of translates/reads that must - // take place due to reading an entire cacheline. Ideally it - // should all take place at once, return an array of binary - // instructions, which can then be used to get all the instructions - // needed. Figure out if I can roll it back into one loop. - for (int fetched = 1; - line_index < blkSize && + for (fetched = 0; + offset < cacheBlkSize && fetched < fetchWidth && !predicted_branch; - line_index+=instSize, ++fetched) + ++fetched) { - // Reset the mem request to setup the read of the next - // instruction. - memReq->reset(fetch_PC, instSize, flags); - // Translate the instruction request. - fault = cpu->translateInstReq(memReq); + // Get a sequence number. + inst_seq = cpu->getAndIncrementInstSeq(); - // Read instruction. - if (fault == No_Fault) { - fault = cpu->mem->read(memReq, inst); - } + // Make sure this is a valid index. + assert(offset <= cacheBlkSize - instSize); - // Check if there was a fault. - if (fault != No_Fault) { - panic("Fetch: Read of instruction faulted when it should " - "succeed; most likely exceeding cache line.\n"); - } + // Get the instruction from the array of the cache line. + inst = htoa(*reinterpret_cast<MachInst *> + (&cacheData[offset])); - // Get a sequence number. - inst_seq = cpu->getAndIncrementInstSeq(); + // Create a new DynInst from the instruction fetched. + DynInstPtr instruction = new DynInst(inst, fetch_PC, next_PC, + inst_seq, cpu); - predicted_branch = lookupAndUpdateNextPC(next_PC); + DPRINTF(Fetch, "Fetch: Instruction %i created, with PC %#x\n", + inst_seq, instruction->readPC()); - // Create the actual DynInst. Parameters are: - // DynInst(instruction, PC, predicted PC, CPU pointer). 
- // Because this simple model has no branch prediction, the - // predicted PC will simply be PC+sizeof(MachInst). - // Update to actually use a branch predictor to predict the - // target in the future. - DynInstPtr instruction = - new DynInst(inst, fetch_PC, next_PC, inst_seq, cpu); + DPRINTF(Fetch, "Fetch: Instruction opcode is: %03p\n", + OPCODE(inst)); instruction->traceData = Trace::getInstRecord(curTick, cpu->xcBase(), cpu, instruction->staticInst, instruction->readPC(), 0); - DPRINTF(Fetch, "Fetch: Instruction %i created, with PC %#x\n", - inst_seq, instruction->readPC()); - DPRINTF(Fetch, "Fetch: Instruction opcode is: %03p\n", - OPCODE(inst)); + predicted_branch = lookupAndUpdateNextPC(instruction, next_PC); + // Add instruction to the CPU's list of instructions. cpu->addInst(instruction); - // Write the instruction to the proper slot in the queue + // Write the instruction to the first slot in the queue // that heads to decode. toDecode->insts[fetched] = instruction; toDecode->size++; - // Might want to keep track of various stats. -// numInstsFetched++; + // Increment stat of fetched instructions. + ++fetchedInsts; - // Update the PC with the next PC. + // Move to the next instruction, unless we have a branch. fetch_PC = next_PC; + + offset+= instSize; } + fetch_nisn_dist.sample(fetched); } // Now that fetching is completed, update the PC to signify what the next @@ -592,6 +585,12 @@ SimpleFetch<Impl>::fetch() cpu->setPC(next_PC); cpu->setNextPC(next_PC + instSize); } else { + // If the issue was an icache miss, then we can just return and + // wait until it is handled. + if (_status == IcacheMissStall) { + return; + } + // Handle the fault. // This stage will not be able to continue until all the ROB // slots are empty, at which point the fault can be handled. 
diff --git a/cpu/beta_cpu/free_list.hh b/cpu/beta_cpu/free_list.hh index 0d2b2c421..e8e75f7ec 100644 --- a/cpu/beta_cpu/free_list.hh +++ b/cpu/beta_cpu/free_list.hh @@ -6,11 +6,9 @@ #include "arch/alpha/isa_traits.hh" #include "cpu/beta_cpu/comm.hh" +#include "base/traceflags.hh" #include "base/trace.hh" -// Question: Do I even need the number of logical registers? -// How to avoid freeing registers instantly? Same with ROB entries. - /** * FreeList class that simply holds the list of free integer and floating * point registers. Can request for a free register of either type, and @@ -153,8 +151,6 @@ SimpleFreeList::addIntReg(PhysRegIndex freed_reg) assert(!freeIntRegsScoreboard[freed_reg]); freeIntRegsScoreboard[freed_reg] = 1; - //Might want to add in a check for whether or not this register is - //already in there. A bit vector or something similar would be useful. freeIntRegs.push(freed_reg); } @@ -167,8 +163,6 @@ SimpleFreeList::addFloatReg(PhysRegIndex freed_reg) assert(!freeFloatRegsScoreboard[freed_reg]); freeFloatRegsScoreboard[freed_reg] = 1; - //Might want to add in a check for whether or not this register is - //already in there. A bit vector or something similar would be useful. freeFloatRegs.push(freed_reg); } diff --git a/cpu/beta_cpu/full_cpu.cc b/cpu/beta_cpu/full_cpu.cc index abeb4cb87..d5228601c 100644 --- a/cpu/beta_cpu/full_cpu.cc +++ b/cpu/beta_cpu/full_cpu.cc @@ -168,6 +168,13 @@ FullBetaCPU<Impl>::~FullBetaCPU() template <class Impl> void +FullBetaCPU<Impl>::fullCPURegStats() +{ + // Register any of the FullCPU's stats here. +} + +template <class Impl> +void FullBetaCPU<Impl>::tick() { DPRINTF(FullCPU, "\n\nFullCPU: Ticking main, FullBetaCPU.\n"); @@ -424,19 +431,17 @@ template <class Impl> void FullBetaCPU<Impl>::removeFrontInst(DynInstPtr &inst) { - DynInstPtr inst_to_delete; + DynInstPtr inst_to_remove; - // The front instruction should be the same one being asked to be deleted. 
+ // The front instruction should be the same one being asked to be removed. assert(instList.front() == inst); // Remove the front instruction. - inst_to_delete = inst; + inst_to_remove = inst; instList.pop_front(); - DPRINTF(FullCPU, "FullCPU: Deleting committed instruction %#x, PC %#x\n", - inst_to_delete, inst_to_delete->readPC()); - -// delete inst_to_delete; + DPRINTF(FullCPU, "FullCPU: Removing committed instruction %#x, PC %#x\n", + inst_to_remove, inst_to_remove->readPC()); } template <class Impl> @@ -453,6 +458,33 @@ FullBetaCPU<Impl>::removeInstsNotInROB() template <class Impl> void +FullBetaCPU<Impl>::removeInstsUntil(const InstSeqNum &seq_num) +{ + DPRINTF(FullCPU, "FullCPU: Deleting instructions from instruction " + "list.\n"); + + DynInstPtr inst_to_delete; + + while (instList.back()->seqNum > seq_num) { + assert(!instList.empty()); + + // Obtain the pointer to the instruction. + inst_to_delete = instList.back(); + + DPRINTF(FullCPU, "FullCPU: Removing instruction %i, PC %#x\n", + inst_to_delete->seqNum, inst_to_delete->readPC()); + + // Remove the instruction from the list. + instList.pop_back(); + + // Mark it as squashed. + inst_to_delete->setSquashed(); + } + +} + +template <class Impl> +void FullBetaCPU<Impl>::removeAllInsts() { instList.clear(); diff --git a/cpu/beta_cpu/full_cpu.hh b/cpu/beta_cpu/full_cpu.hh index cf753ad67..bde7e5bbf 100644 --- a/cpu/beta_cpu/full_cpu.hh +++ b/cpu/beta_cpu/full_cpu.hh @@ -115,6 +115,8 @@ class FullBetaCPU : public BaseFullCPU void init(); + void fullCPURegStats(); + void activateContext(int thread_num, int delay); void suspendContext(int thread_num); void deallocateContext(int thread_num); @@ -205,6 +207,9 @@ class FullBetaCPU : public BaseFullCPU /** Remove all instructions that are not currently in the ROB. */ void removeInstsNotInROB(); + /** Remove all instructions younger than the given sequence number. */ + void removeInstsUntil(const InstSeqNum &seq_num); + /** Remove all instructions from the list. 
*/ void removeAllInsts(); diff --git a/cpu/beta_cpu/iew.hh b/cpu/beta_cpu/iew.hh index de408ef0c..90bd39e7f 100644 --- a/cpu/beta_cpu/iew.hh +++ b/cpu/beta_cpu/iew.hh @@ -9,6 +9,7 @@ #include "base/timebuf.hh" #include "cpu/beta_cpu/comm.hh" +#include "base/statistics.hh" //Can IEW even stall? Space should be available/allocated already...maybe //if there's not enough write ports on the ROB or waiting for CDB @@ -50,7 +51,9 @@ class SimpleIEW public: void squash(); - void squash(DynInstPtr &inst); + void squashDueToBranch(DynInstPtr &inst); + + void squashDueToMem(DynInstPtr &inst); void block(); @@ -59,6 +62,8 @@ class SimpleIEW public: SimpleIEW(Params ¶ms); + void regStats(); + void setCPU(FullCPU *cpu_ptr); void setTimeBuffer(TimeBuffer<TimeStruct> *tb_ptr); @@ -76,6 +81,10 @@ class SimpleIEW void iew(); private: + void dispatchInsts(); + + void executeInsts(); + //Interfaces to objects inside and outside of IEW. /** Time buffer interface. */ TimeBuffer<TimeStruct> *timeBuffer; @@ -159,9 +168,23 @@ class SimpleIEW */ unsigned cyclesSquashing; - //Will implement later - //Load queue interface (probably one and the same) - //Store queue interface + Stats::Scalar<> iewIdleCycles; + Stats::Scalar<> iewSquashCycles; + Stats::Scalar<> iewBlockCycles; + Stats::Scalar<> iewUnblockCycles; +// Stats::Scalar<> iewWBInsts; + Stats::Scalar<> iewDispatchedInsts; + Stats::Scalar<> iewDispSquashedInsts; + Stats::Scalar<> iewDispLoadInsts; + Stats::Scalar<> iewDispStoreInsts; + Stats::Scalar<> iewDispNonSpecInsts; + Stats::Scalar<> iewIQFullEvents; + Stats::Scalar<> iewExecutedInsts; + Stats::Scalar<> iewExecLoadInsts; + Stats::Scalar<> iewExecStoreInsts; + Stats::Scalar<> iewExecSquashedInsts; + Stats::Scalar<> memOrderViolationEvents; + Stats::Scalar<> predictedTakenIncorrect; }; #endif diff --git a/cpu/beta_cpu/iew_impl.hh b/cpu/beta_cpu/iew_impl.hh index 521ce77f6..2bfd6bae9 100644 --- a/cpu/beta_cpu/iew_impl.hh +++ b/cpu/beta_cpu/iew_impl.hh @@ -38,6 +38,79 @@ 
SimpleIEW<Impl, IQ>::SimpleIEW(Params ¶ms) instQueue.setIssueToExecuteQueue(&issueToExecQueue); } +template <class Impl, class IQ> +void +SimpleIEW<Impl, IQ>::regStats() +{ + instQueue.regStats(); + + iewIdleCycles + .name(name() + ".iewIdleCycles") + .desc("Number of cycles IEW is idle"); + + iewSquashCycles + .name(name() + ".iewSquashCycles") + .desc("Number of cycles IEW is squashing"); + + iewBlockCycles + .name(name() + ".iewBlockCycles") + .desc("Number of cycles IEW is blocking"); + + iewUnblockCycles + .name(name() + ".iewUnblockCycles") + .desc("Number of cycles IEW is unblocking"); + +// iewWBInsts; + + iewDispatchedInsts + .name(name() + ".iewDispatchedInsts") + .desc("Number of instructions dispatched to IQ"); + + iewDispSquashedInsts + .name(name() + ".iewDispSquashedInsts") + .desc("Number of squashed instructions skipped by dispatch"); + + iewDispLoadInsts + .name(name() + ".iewDispLoadInsts") + .desc("Number of dispatched load instructions"); + + iewDispStoreInsts + .name(name() + ".iewDispStoreInsts") + .desc("Number of dispatched store instructions"); + + iewDispNonSpecInsts + .name(name() + ".iewDispNonSpecInsts") + .desc("Number of dispatched non-speculative instructions"); + + iewIQFullEvents + .name(name() + ".iewIQFullEvents") + .desc("Number of times the IQ has become full, causing a stall"); + + iewExecutedInsts + .name(name() + ".iewExecutedInsts") + .desc("Number of executed instructions"); + + iewExecLoadInsts + .name(name() + ".iewExecLoadInsts") + .desc("Number of load instructions executed"); + + iewExecStoreInsts + .name(name() + ".iewExecStoreInsts") + .desc("Number of store instructions executed"); + + iewExecSquashedInsts + .name(name() + ".iewExecSquashedInsts") + .desc("Number of squashed instructions skipped in execute"); + + memOrderViolationEvents + .name(name() + ".memOrderViolationEvents") + .desc("Number of memory order violations"); + + predictedTakenIncorrect + .name(name() + ".predictedTakenIncorrect") + .desc("Number 
of branches that were predicted taken incorrectly"); +} + template<class Impl, class IQ> void SimpleIEW<Impl, IQ>::setCPU(FullCPU *cpu_ptr) @@ -158,7 +231,7 @@ SimpleIEW<Impl, IQ>::squash() template<class Impl, class IQ> void -SimpleIEW<Impl, IQ>::squash(DynInstPtr &inst) +SimpleIEW<Impl, IQ>::squashDueToBranch(DynInstPtr &inst) { DPRINTF(IEW, "IEW: Squashing from a specific instruction, PC: %#x.\n", inst->PC); @@ -167,119 +240,39 @@ SimpleIEW<Impl, IQ>::squash(DynInstPtr &inst) _status = Squashing; // Tell rename to squash through the time buffer. - toRename->iewInfo.squash = true; + toCommit->squash = true; // Also send PC update information back to prior stages. - toRename->iewInfo.squashedSeqNum = inst->seqNum; - toRename->iewInfo.mispredPC = inst->readPC(); - toRename->iewInfo.nextPC = inst->readCalcTarg(); - toRename->iewInfo.branchMispredict = true; + toCommit->squashedSeqNum = inst->seqNum; + toCommit->mispredPC = inst->readPC(); + toCommit->nextPC = inst->readCalcTarg(); + toCommit->branchMispredict = true; // Prediction was incorrect, so send back inverse. - toRename->iewInfo.branchTaken = !(inst->predTaken()); + toCommit->branchTaken = inst->readCalcTarg() != + (inst->readPC() + sizeof(MachInst)); +// toCommit->globalHist = inst->readGlobalHist(); } template<class Impl, class IQ> void -SimpleIEW<Impl, IQ>::tick() +SimpleIEW<Impl, IQ>::squashDueToMem(DynInstPtr &inst) { - // Considering putting all the state-determining stuff in this section. - - // Try to fill up issue queue with as many instructions as bandwidth - // allows. - // Decode should try to execute as many instructions as its bandwidth - // will allow, as long as it is not currently blocked. - - // Check if the stage is in a running status. - if (_status != Blocked && _status != Squashing) { - DPRINTF(IEW, "IEW: Status is not blocked, attempting to run " - "stage.\n"); - iew(); - - // If it's currently unblocking, check to see if it should switch - // to running. 
- if (_status == Unblocking) { - unblock(); - } - } else if (_status == Squashing) { - - DPRINTF(IEW, "IEW: Still squashing.\n"); - - // Check if stage should remain squashing. Stop squashing if the - // squash signal clears. - if (!fromCommit->commitInfo.squash && - !fromCommit->commitInfo.robSquashing) { - DPRINTF(IEW, "IEW: Done squashing, changing status to " - "running.\n"); - - _status = Running; - instQueue.stopSquash(); - } else { - instQueue.doSquash(); - } - - // Also should advance its own time buffers if the stage ran. - // Not sure about this... -// issueToExecQueue.advance(); - } else if (_status == Blocked) { - // Continue to tell previous stage to stall. - toRename->iewInfo.stall = true; - - // Check if possible stall conditions have cleared. - if (!fromCommit->commitInfo.stall && - !instQueue.isFull()) { - DPRINTF(IEW, "IEW: Stall signals cleared, going to unblock.\n"); - _status = Unblocking; - } - - // If there's still instructions coming from rename, continue to - // put them on the skid buffer. - if (fromRename->insts[0]) { - block(); - } - - if (fromCommit->commitInfo.squash || - fromCommit->commitInfo.robSquashing) { - squash(); - } - } - - // @todo: Maybe put these at the beginning, so if it's idle it can - // return early. - // Write back number of free IQ entries here. - toRename->iewInfo.freeIQEntries = instQueue.numFreeEntries(); - - // Check the committed load/store signals to see if there's a load - // or store to commit. Also check if it's being told to execute a - // nonspeculative instruction. 
- if (fromCommit->commitInfo.commitIsStore) { - ldstQueue.commitStores(fromCommit->commitInfo.doneSeqNum); - } else if (fromCommit->commitInfo.commitIsLoad) { - ldstQueue.commitLoads(fromCommit->commitInfo.doneSeqNum); - } - - if (fromCommit->commitInfo.nonSpecSeqNum != 0) { - instQueue.scheduleNonSpec(fromCommit->commitInfo.nonSpecSeqNum); - } + DPRINTF(IEW, "IEW: Squashing from a specific instruction, PC: %#x.\n", + inst->PC); + // Perhaps leave the squashing up to the ROB stage to tell it when to + // squash? + _status = Squashing; - DPRINTF(IEW, "IEW: IQ has %i free entries.\n", - instQueue.numFreeEntries()); + // Tell rename to squash through the time buffer. + toCommit->squash = true; + // Also send PC update information back to prior stages. + toCommit->squashedSeqNum = inst->seqNum; + toCommit->nextPC = inst->readCalcTarg(); } -template<class Impl, class IQ> +template <class Impl, class IQ> void -SimpleIEW<Impl, IQ>::iew() +SimpleIEW<Impl, IQ>::dispatchInsts() { - // Might want to put all state checks in the tick() function. - // Check if being told to stall from commit. - if (fromCommit->commitInfo.stall) { - block(); - return; - } else if (fromCommit->commitInfo.squash || - fromCommit->commitInfo.robSquashing) { - // Also check if commit is telling this stage to squash. - squash(); - return; - } - //////////////////////////////////////// // DISPATCH/ISSUE stage //////////////////////////////////////// @@ -289,29 +282,36 @@ SimpleIEW<Impl, IQ>::iew() // Check if there are any instructions coming from rename, and we're. // not squashing. - if (fromRename->insts[0] && _status != Squashing) { + if (fromRename->size > 0) { + int insts_to_add = fromRename->size; // Loop through the instructions, putting them in the instruction // queue. - for (int inst_num = 0; inst_num < issueReadWidth; ++inst_num) + for (int inst_num = 0; inst_num < insts_to_add; ++inst_num) { DynInstPtr inst = fromRename->insts[inst_num]; // Make sure there's a valid instruction there. 
- if (!inst) - break; + assert(inst); DPRINTF(IEW, "IEW: Issue: Adding PC %#x to IQ.\n", inst->readPC()); - // If it's a memory reference, don't put it in the - // instruction queue. These will only be executed at commit. - // Do the same for nonspeculative instructions and nops. // Be sure to mark these instructions as ready so that the // commit stage can go ahead and execute them, and mark // them as issued so the IQ doesn't reprocess them. if (inst->isSquashed()) { + ++iewDispSquashedInsts; continue; + } else if (instQueue.isFull()) { + DPRINTF(IEW, "IEW: Issue: IQ has become full.\n"); + // Call function to start blocking. + block(); + // Tell previous stage to stall. + toRename->iewInfo.stall = true; + + ++iewIQFullEvents; + break; } else if (inst->isLoad()) { DPRINTF(IEW, "IEW: Issue: Memory instruction " "encountered, adding to LDSTQ.\n"); @@ -320,6 +320,7 @@ SimpleIEW<Impl, IQ>::iew() // memory access. ldstQueue.insertLoad(inst); + ++iewDispLoadInsts; } else if (inst->isStore()) { ldstQueue.insertStore(inst); @@ -327,10 +328,15 @@ SimpleIEW<Impl, IQ>::iew() // the commit stage will try committing it, and then // once commit realizes it's a store it will send back // a signal to this stage to issue and execute that - // store. + // store. Change to be a bit that says the instruction + // has extra work to do at commit. inst->setCanCommit(); instQueue.insertNonSpec(inst); + + ++iewDispStoreInsts; + ++iewDispNonSpecInsts; + continue; } else if (inst->isNonSpeculative()) { DPRINTF(IEW, "IEW: Issue: Nonspeculative instruction " @@ -342,6 +348,8 @@ SimpleIEW<Impl, IQ>::iew() // Specificall insert it as nonspeculative. 
instQueue.insertNonSpec(inst); + ++iewDispNonSpecInsts; + continue; } else if (inst->isNop()) { DPRINTF(IEW, "IEW: Issue: Nop instruction encountered " @@ -352,25 +360,35 @@ SimpleIEW<Impl, IQ>::iew() inst->setCanCommit(); instQueue.advanceTail(inst); + + continue; + } else if (inst->isExecuted()) { + DPRINTF(IEW, "IEW: Issue: Executed branch encountered, " + "skipping.\n"); + + assert(inst->isDirectCtrl()); + + inst->setIssued(); + inst->setCanCommit(); + + instQueue.advanceTail(inst); + continue; - } else if (instQueue.isFull()) { - DPRINTF(IEW, "IEW: Issue: IQ has become full.\n"); - // Call function to start blocking. - block(); - // Tell previous stage to stall. - toRename->iewInfo.stall = true; - break; } // If the instruction queue is not full, then add the // instruction. instQueue.insert(fromRename->insts[inst_num]); + + ++iewDispatchedInsts; } } +} - // Have the instruction queue try to schedule any ready instructions. - instQueue.scheduleReadyInsts(); - +template <class Impl, class IQ> +void +SimpleIEW<Impl, IQ>::executeInsts() +{ //////////////////////////////////////// //EXECUTE/WRITEBACK stage //////////////////////////////////////// @@ -389,9 +407,10 @@ SimpleIEW<Impl, IQ>::iew() // Execute/writeback any instructions that are available. for (int inst_num = 0; fu_usage < executeWidth && /* Haven't exceeded available FU's. */ - inst_num < issueWidth && /* Haven't exceeded issue width. */ - fromIssue->insts[inst_num]; /* There are available instructions. */ + inst_num < issueWidth && + fromIssue->insts[inst_num]; ++inst_num) { + DPRINTF(IEW, "IEW: Execute: Executing instructions from IQ.\n"); // Get instruction from issue's queue. @@ -410,6 +429,8 @@ SimpleIEW<Impl, IQ>::iew() toCommit->insts[inst_num] = inst; + ++iewExecSquashedInsts; + continue; } @@ -428,14 +449,20 @@ SimpleIEW<Impl, IQ>::iew() // Tell the LDSTQ to execute this instruction (if it is a load). 
if (inst->isLoad()) { ldstQueue.executeLoad(inst); + + ++iewExecLoadInsts; } else if (inst->isStore()) { ldstQueue.executeStore(); + + ++iewExecStoreInsts; } else { panic("IEW: Unexpected memory type!\n"); } } else { inst->execute(); + + ++iewExecutedInsts; } // First check the time slot that this instruction will write @@ -464,25 +491,148 @@ SimpleIEW<Impl, IQ>::iew() inst->nextPC); // If incorrect, then signal the ROB that it must be squashed. - squash(inst); + squashDueToBranch(inst); + + if (inst->predTaken()) { + predictedTakenIncorrect++; + } } else if (ldstQueue.violation()) { fetch_redirect = true; + // Get the DynInst that caused the violation. DynInstPtr violator = ldstQueue.getMemDepViolator(); DPRINTF(IEW, "IEW: LDSTQ detected a violation. Violator PC: " "%#x, inst PC: %#x. Addr is: %#x.\n", violator->readPC(), inst->readPC(), inst->physEffAddr); + // Tell the instruction queue that a violation has occured. instQueue.violation(inst, violator); - squash(inst); - // Otherwise check if there was a memory ordering violation. - // If there was, then signal ROB that it must be squashed. Also - // signal IQ that there was a violation. + // Squash. + squashDueToMem(inst); + + ++memOrderViolationEvents; } } } +} + +template<class Impl, class IQ> +void +SimpleIEW<Impl, IQ>::tick() +{ + // Considering putting all the state-determining stuff in this section. + + // Try to fill up issue queue with as many instructions as bandwidth + // allows. + // Decode should try to execute as many instructions as its bandwidth + // will allow, as long as it is not currently blocked. + + // Check if the stage is in a running status. + if (_status != Blocked && _status != Squashing) { + DPRINTF(IEW, "IEW: Status is not blocked, attempting to run " + "stage.\n"); + iew(); + + // If it's currently unblocking, check to see if it should switch + // to running. 
+ if (_status == Unblocking) { + unblock(); + + ++iewUnblockCycles; + } + } else if (_status == Squashing) { + + DPRINTF(IEW, "IEW: Still squashing.\n"); + + // Check if stage should remain squashing. Stop squashing if the + // squash signal clears. + if (!fromCommit->commitInfo.squash && + !fromCommit->commitInfo.robSquashing) { + DPRINTF(IEW, "IEW: Done squashing, changing status to " + "running.\n"); + + _status = Running; + instQueue.stopSquash(); + } else { + instQueue.doSquash(); + } + + ++iewSquashCycles; + + // Also should advance its own time buffers if the stage ran. + // Not sure about this... +// issueToExecQueue.advance(); + } else if (_status == Blocked) { + // Continue to tell previous stage to stall. + toRename->iewInfo.stall = true; + + // Check if possible stall conditions have cleared. + if (!fromCommit->commitInfo.stall && + !instQueue.isFull()) { + DPRINTF(IEW, "IEW: Stall signals cleared, going to unblock.\n"); + _status = Unblocking; + } + + // If there's still instructions coming from rename, continue to + // put them on the skid buffer. + if (fromRename->size == 0) { + block(); + } + + if (fromCommit->commitInfo.squash || + fromCommit->commitInfo.robSquashing) { + squash(); + } + + ++iewBlockCycles; + } + + // @todo: Maybe put these at the beginning, so if it's idle it can + // return early. + // Write back number of free IQ entries here. + toRename->iewInfo.freeIQEntries = instQueue.numFreeEntries(); + + // Check the committed load/store signals to see if there's a load + // or store to commit. Also check if it's being told to execute a + // nonspeculative instruction. 
+ if (fromCommit->commitInfo.commitIsStore) { + ldstQueue.commitStores(fromCommit->commitInfo.doneSeqNum); + } else if (fromCommit->commitInfo.commitIsLoad) { + ldstQueue.commitLoads(fromCommit->commitInfo.doneSeqNum); + } + + if (fromCommit->commitInfo.nonSpecSeqNum != 0) { + instQueue.scheduleNonSpec(fromCommit->commitInfo.nonSpecSeqNum); + } + + DPRINTF(IEW, "IEW: IQ has %i free entries.\n", + instQueue.numFreeEntries()); +} + +template<class Impl, class IQ> +void +SimpleIEW<Impl, IQ>::iew() +{ + // Might want to put all state checks in the tick() function. + // Check if being told to stall from commit. + if (fromCommit->commitInfo.stall) { + block(); + return; + } else if (fromCommit->commitInfo.squash || + fromCommit->commitInfo.robSquashing) { + // Also check if commit is telling this stage to squash. + squash(); + return; + } + + dispatchInsts(); + + // Have the instruction queue try to schedule any ready instructions. + instQueue.scheduleReadyInsts(); + + executeInsts(); // Loop through the head of the time buffer and wake any dependents. // These instructions are about to write back. In the simple model @@ -491,7 +641,7 @@ SimpleIEW<Impl, IQ>::iew() // Also mark scoreboard that this instruction is finally complete. // Either have IEW have direct access to rename map, or have this as // part of backwards communication. - for (int inst_num = 0; inst_num < executeWidth && + for (int inst_num = 0; inst_num < issueWidth && toCommit->insts[inst_num]; inst_num++) { DynInstPtr inst = toCommit->insts[inst_num]; diff --git a/cpu/beta_cpu/inst_queue.cc b/cpu/beta_cpu/inst_queue.cc index 43b0a4572..c4fd077bc 100644 --- a/cpu/beta_cpu/inst_queue.cc +++ b/cpu/beta_cpu/inst_queue.cc @@ -5,3 +5,6 @@ // Force instantiation of InstructionQueue. 
template InstructionQueue<AlphaSimpleImpl>; + +unsigned +InstructionQueue<AlphaSimpleImpl>::DependencyEntry::mem_alloc_counter = 0; diff --git a/cpu/beta_cpu/inst_queue.hh b/cpu/beta_cpu/inst_queue.hh index a170979cb..6fcce70a4 100644 --- a/cpu/beta_cpu/inst_queue.hh +++ b/cpu/beta_cpu/inst_queue.hh @@ -7,14 +7,10 @@ #include <stdint.h> #include <vector> +#include "base/statistics.hh" #include "base/timebuf.hh" #include "cpu/inst_seq.hh" -//Perhaps have a better separation between the data structure underlying -//and the actual algorithm. -//somewhat nasty to try to have a nice ordering. -// Consider moving to STL list or slist for the LL stuff. - /** * A standard instruction queue class. It holds instructions in an * array, holds the ordering of the instructions within a linked list, @@ -74,6 +70,8 @@ class InstructionQueue InstructionQueue(Params ¶ms); + void regStats(); + void setCPU(FullCPU *cpu); void setIssueToExecuteQueue(TimeBuffer<IssueStruct> *i2eQueue); @@ -98,6 +96,7 @@ class InstructionQueue void violation(DynInstPtr &store, DynInstPtr &faulting_load); + // Change this to take in the sequence number void squash(); void doSquash(); @@ -159,7 +158,7 @@ class InstructionQueue ReadyInstQueue readyBranchInsts; /** List of ready memory instructions. */ - ReadyInstQueue readyMemInsts; +// ReadyInstQueue readyMemInsts; /** List of ready miscellaneous instructions. */ ReadyInstQueue readyMiscInsts; @@ -228,9 +227,6 @@ class InstructionQueue /** The sequence number of the squashed instruction. */ InstSeqNum squashedSeqNum; - /** Iterator that points to the oldest instruction in the IQ. */ -// ListIt head; - /** Iterator that points to the youngest instruction in the IQ. */ ListIt tail; @@ -261,6 +257,9 @@ class InstructionQueue void insert(DynInstPtr &new_inst); void remove(DynInstPtr &inst_to_remove); + + // Debug variable, remove when done testing. + static unsigned mem_alloc_counter; }; /** Array of linked lists. 
Each linked list is a list of all the @@ -285,6 +284,25 @@ class InstructionQueue void dumpDependGraph(); void addIfReady(DynInstPtr &inst); + + Stats::Scalar<> iqInstsAdded; + Stats::Scalar<> iqNonSpecInstsAdded; +// Stats::Scalar<> iqIntInstsAdded; + Stats::Scalar<> iqIntInstsIssued; +// Stats::Scalar<> iqFloatInstsAdded; + Stats::Scalar<> iqFloatInstsIssued; +// Stats::Scalar<> iqBranchInstsAdded; + Stats::Scalar<> iqBranchInstsIssued; +// Stats::Scalar<> iqMemInstsAdded; + Stats::Scalar<> iqMemInstsIssued; +// Stats::Scalar<> iqMiscInstsAdded; + Stats::Scalar<> iqMiscInstsIssued; + Stats::Scalar<> iqSquashedInstsIssued; + Stats::Scalar<> iqLoopSquashStalls; + Stats::Scalar<> iqSquashedInstsExamined; + Stats::Scalar<> iqSquashedOperandsExamined; + Stats::Scalar<> iqSquashedNonSpecRemoved; + }; #endif //__INST_QUEUE_HH__ diff --git a/cpu/beta_cpu/inst_queue_impl.hh b/cpu/beta_cpu/inst_queue_impl.hh index 03e3fed33..c688181ed 100644 --- a/cpu/beta_cpu/inst_queue_impl.hh +++ b/cpu/beta_cpu/inst_queue_impl.hh @@ -24,15 +24,13 @@ InstructionQueue<Impl>::InstructionQueue(Params ¶ms) numEntries(params.numIQEntries), intWidth(params.executeIntWidth), floatWidth(params.executeFloatWidth), + branchWidth(params.executeBranchWidth), + memoryWidth(params.executeMemoryWidth), totalWidth(params.issueWidth), numPhysIntRegs(params.numPhysIntRegs), numPhysFloatRegs(params.numPhysFloatRegs), commitToIEWDelay(params.commitToIEWDelay) { - // HACK: HARDCODED NUMBER. REMOVE LATER AND ADD TO PARAMETER. - branchWidth = 1; - memoryWidth = 1; - DPRINTF(IQ, "IQ: Int width is %i.\n", params.executeIntWidth); // Initialize the number of free IQ entries. 
@@ -68,6 +66,87 @@ InstructionQueue<Impl>::InstructionQueue(Params ¶ms) template <class Impl> void +InstructionQueue<Impl>::regStats() +{ + iqInstsAdded + .name(name() + ".iqInstsAdded") + .desc("Number of instructions added to the IQ (excludes non-spec)") + .prereq(iqInstsAdded); + + iqNonSpecInstsAdded + .name(name() + ".iqNonSpecInstsAdded") + .desc("Number of non-speculative instructions added to the IQ") + .prereq(iqNonSpecInstsAdded); + +// iqIntInstsAdded; + + iqIntInstsIssued + .name(name() + ".iqIntInstsIssued") + .desc("Number of integer instructions issued") + .prereq(iqIntInstsIssued); + +// iqFloatInstsAdded; + + iqFloatInstsIssued + .name(name() + ".iqFloatInstsIssued") + .desc("Number of float instructions issued") + .prereq(iqFloatInstsIssued); + +// iqBranchInstsAdded; + + iqBranchInstsIssued + .name(name() + ".iqBranchInstsIssued") + .desc("Number of branch instructions issued") + .prereq(iqBranchInstsIssued); + +// iqMemInstsAdded; + + iqMemInstsIssued + .name(name() + ".iqMemInstsIssued") + .desc("Number of memory instructions issued") + .prereq(iqMemInstsIssued); + +// iqMiscInstsAdded; + + iqMiscInstsIssued + .name(name() + ".iqMiscInstsIssued") + .desc("Number of miscellaneous instructions issued") + .prereq(iqMiscInstsIssued); + + iqSquashedInstsIssued + .name(name() + ".iqSquashedInstsIssued") + .desc("Number of squashed instructions issued") + .prereq(iqSquashedInstsIssued); + + iqLoopSquashStalls + .name(name() + ".iqLoopSquashStalls") + .desc("Number of times issue loop had to restart due to squashed " + "inst; mainly for profiling") + .prereq(iqLoopSquashStalls); + + iqSquashedInstsExamined + .name(name() + ".iqSquashedInstsExamined") + .desc("Number of squashed instructions iterated over during squash;" + " mainly for profiling") + .prereq(iqSquashedInstsExamined); + + iqSquashedOperandsExamined + .name(name() + ".iqSquashedOperandsExamined") + .desc("Number of squashed operands that are examined and possibly " + "removed from graph") 
+ .prereq(iqSquashedOperandsExamined); + + iqSquashedNonSpecRemoved + .name(name() + ".iqSquashedNonSpecRemoved") + .desc("Number of squashed non-spec instructions that were removed") + .prereq(iqSquashedNonSpecRemoved); + + // Tell mem dependence unit to reg stats as well. + memDepUnit.regStats(); +} + +template <class Impl> +void InstructionQueue<Impl>::setCPU(FullCPU *cpu_ptr) { cpu = cpu_ptr; @@ -161,10 +240,14 @@ InstructionQueue<Impl>::insert(DynInstPtr &new_inst) // unit. if (new_inst->isMemRef()) { memDepUnit.insert(new_inst); + // Uh..forgot to look it up and put it on the proper dependency list + // if the instruction should not go yet. + } else { + // If the instruction is ready then add it to the ready list. + addIfReady(new_inst); } - // If the instruction is ready then add it to the ready list. - addIfReady(new_inst); + ++iqInstsAdded; assert(freeEntries == (numEntries - countInsts())); } @@ -219,13 +302,16 @@ InstructionQueue<Impl>::insertNonSpec(DynInstPtr &inst) // If it's a memory instruction, add it to the memory dependency // unit. if (inst->isMemRef()) { - memDepUnit.insert(inst); + memDepUnit.insertNonSpec(inst); } + + ++iqNonSpecInstsAdded; } // Slightly hack function to advance the tail iterator in the case that // the IEW stage issues an instruction that is not added to the IQ. This // is needed in case a long chain of such instructions occurs. +// I don't think this is used anymore. 
template <class Impl> void InstructionQueue<Impl>::advanceTail(DynInstPtr &inst) @@ -288,7 +374,7 @@ InstructionQueue<Impl>::scheduleReadyInsts() bool insts_available = !readyBranchInsts.empty() || !readyIntInsts.empty() || !readyFloatInsts.empty() || - !readyMemInsts.empty() || + !memDepUnit.empty() || !readyMiscInsts.empty() || !squashedInsts.empty(); @@ -327,6 +413,9 @@ InstructionQueue<Impl>::scheduleReadyInsts() if (int_head_inst->isSquashed()) { readyIntInsts.pop(); + + ++iqLoopSquashStalls; + continue; } @@ -344,6 +433,9 @@ InstructionQueue<Impl>::scheduleReadyInsts() if (float_head_inst->isSquashed()) { readyFloatInsts.pop(); + + ++iqLoopSquashStalls; + continue; } else if (float_head_inst->seqNum < oldest_inst) { oldest_inst = float_head_inst->seqNum; @@ -361,6 +453,9 @@ InstructionQueue<Impl>::scheduleReadyInsts() if (branch_head_inst->isSquashed()) { readyBranchInsts.pop(); + + ++iqLoopSquashStalls; + continue; } else if (branch_head_inst->seqNum < oldest_inst) { oldest_inst = branch_head_inst->seqNum; @@ -370,15 +465,18 @@ InstructionQueue<Impl>::scheduleReadyInsts() } - if (!readyMemInsts.empty() && + if (!memDepUnit.empty() && memory_issued < memoryWidth) { insts_available = true; - mem_head_inst = readyMemInsts.top(); + mem_head_inst = memDepUnit.top(); if (mem_head_inst->isSquashed()) { - readyMemInsts.pop(); + memDepUnit.pop(); + + ++iqLoopSquashStalls; + continue; } else if (mem_head_inst->seqNum < oldest_inst) { oldest_inst = mem_head_inst->seqNum; @@ -395,6 +493,9 @@ InstructionQueue<Impl>::scheduleReadyInsts() if (misc_head_inst->isSquashed()) { readyMiscInsts.pop(); + + ++iqLoopSquashStalls; + continue; } else if (misc_head_inst->seqNum < oldest_inst) { oldest_inst = misc_head_inst->seqNum; @@ -450,9 +551,7 @@ InstructionQueue<Impl>::scheduleReadyInsts() case Memory: issuing_inst = mem_head_inst; - memDepUnit.issue(mem_head_inst); - - readyMemInsts.pop(); + memDepUnit.pop(); ++memory_issued; DPRINTF(IQ, "IQ: Issuing memory instruction PC 
%#x.\n", issuing_inst->readPC()); @@ -461,6 +560,9 @@ InstructionQueue<Impl>::scheduleReadyInsts() case Misc: issuing_inst = misc_head_inst; readyMiscInsts.pop(); + + ++iqMiscInstsIssued; + DPRINTF(IQ, "IQ: Issuing a miscellaneous instruction PC %#x.\n", issuing_inst->readPC()); break; @@ -476,6 +578,7 @@ InstructionQueue<Impl>::scheduleReadyInsts() if (list_with_oldest != None) { i2e_info->insts[total_issued] = issuing_inst; + i2e_info->size++; issuing_inst->setIssued(); @@ -485,12 +588,21 @@ InstructionQueue<Impl>::scheduleReadyInsts() assert(freeEntries == (numEntries - countInsts())); } + + iqIntInstsIssued += int_issued; + iqFloatInstsIssued += float_issued; + iqBranchInstsIssued += branch_issued; + iqMemInstsIssued += memory_issued; + iqSquashedInstsIssued += squashed_issued; } template <class Impl> void InstructionQueue<Impl>::scheduleNonSpec(const InstSeqNum &inst) { + DPRINTF(IQ, "IQ: Marking nonspeculative instruction with sequence " + "number %i as ready to execute.\n", inst); + non_spec_it_t inst_it = nonSpecInsts.find(inst); assert(inst_it != nonSpecInsts.end()); @@ -499,7 +611,11 @@ InstructionQueue<Impl>::scheduleNonSpec(const InstSeqNum &inst) (*inst_it).second->setCanIssue(); // Now schedule the instruction. - addIfReady((*inst_it).second); + if (!(*inst_it).second->isMemRef()) { + addIfReady((*inst_it).second); + } else { + memDepUnit.nonSpecInstReady((*inst_it).second); + } nonSpecInsts.erase(inst_it); } @@ -552,6 +668,7 @@ InstructionQueue<Impl>::doSquash() // hasn't already been squashed in the IQ. if (!squashed_inst->isIssued() && !squashed_inst->isSquashedInIQ()) { + // Remove the instruction from the dependency list. // Hack for now: These below don't add themselves to the // dependency list, so don't try to remove them. @@ -576,7 +693,15 @@ InstructionQueue<Impl>::doSquash() src_reg < numPhysRegs) { dependGraph[src_reg].remove(squashed_inst); } + + ++iqSquashedOperandsExamined; } + + // Might want to remove producers as well. 
+ } else { + nonSpecInsts.erase(squashed_inst->seqNum); + + ++iqSquashedNonSpecRemoved; } // Might want to also clear out the head of the dependency graph. @@ -590,11 +715,8 @@ InstructionQueue<Impl>::doSquash() squashed_inst->readPC()); } - if (squashed_inst->isNonSpeculative() || squashed_inst->isStore()) { - nonSpecInsts.erase(squashed_inst->seqNum); - } - --squashIt; + ++iqSquashedInstsExamined; } } @@ -665,6 +787,8 @@ InstructionQueue<Impl>::wakeDependents(DynInstPtr &completed_inst) dependGraph[dest_reg].next = curr->next; + DependencyEntry::mem_alloc_counter--; + delete curr; } @@ -749,13 +873,9 @@ InstructionQueue<Impl>::createDependency(DynInstPtr &new_inst) } dependGraph[dest_reg].inst = new_inst; -#if 0 - if (dependGraph[dest_reg].next) { - panic("Dependency chain of dest reg %i is not empty.\n", - dest_reg); - } -#endif + assert(!dependGraph[dest_reg].next); + // Mark the scoreboard to say it's not yet ready. regScoreboard[dest_reg] = false; } @@ -776,6 +896,8 @@ InstructionQueue<Impl>::DependencyEntry::insert(DynInstPtr &new_inst) // Then actually add it to the chain. this->next = new_entry; + + ++mem_alloc_counter; } template <class Impl> @@ -805,6 +927,8 @@ InstructionQueue<Impl>::DependencyEntry::remove(DynInstPtr &inst_to_remove) // Now remove this instruction from the list. prev->next = curr->next; + --mem_alloc_counter; + delete curr; } @@ -855,12 +979,26 @@ InstructionQueue<Impl>::addIfReady(DynInstPtr &inst) DPRINTF(IQ, "IQ: Checking if memory instruction can issue.\n"); + // Message to the mem dependence unit that this instruction has + // its registers ready. + + memDepUnit.regsReady(inst); + +#if 0 if (memDepUnit.readyToIssue(inst)) { DPRINTF(IQ, "IQ: Memory instruction is ready to issue, " "putting it onto the ready list, PC %#x.\n", inst->readPC()); readyMemInsts.push(inst); + } else { + // Make dependent on the store. 
+ // Will need some way to get the store instruction it should + // be dependent upon; then when the store issues it can + // put the instruction on the ready list. + // Yet another tree? + assert(0 && "Instruction has no way to actually issue"); } +#endif } else if (inst->isInteger()) { @@ -923,7 +1061,7 @@ InstructionQueue<Impl>::dumpLists() cprintf("Ready branch list size: %i\n", readyBranchInsts.size()); - cprintf("Ready memory list size: %i\n", readyMemInsts.size()); +// cprintf("Ready memory list size: %i\n", readyMemInsts.size()); cprintf("Ready misc list size: %i\n", readyMiscInsts.size()); diff --git a/cpu/beta_cpu/mem_dep_unit.hh b/cpu/beta_cpu/mem_dep_unit.hh index 4821c63b7..e43543e09 100644 --- a/cpu/beta_cpu/mem_dep_unit.hh +++ b/cpu/beta_cpu/mem_dep_unit.hh @@ -6,6 +6,7 @@ #include <map> #include "cpu/inst_seq.hh" +#include "base/statistics.hh" /** * Memory dependency unit class. This holds the memory dependence predictor. @@ -25,16 +26,17 @@ class MemDepUnit { typedef typename Impl::DynInstPtr DynInstPtr; public: - typedef typename std::set<InstSeqNum>::iterator sn_it_t; - typedef typename std::map<InstSeqNum, vector<InstSeqNum> >::iterator - dep_it_t; - - public: MemDepUnit(Params ¶ms); + void regStats(); + void insert(DynInstPtr &inst); - bool readyToIssue(DynInstPtr &inst); + void insertNonSpec(DynInstPtr &inst); + + void regsReady(DynInstPtr &inst); + + void nonSpecInstReady(DynInstPtr &inst); void issue(DynInstPtr &inst); @@ -44,19 +46,83 @@ class MemDepUnit { void violation(DynInstPtr &store_inst, DynInstPtr &violating_load); + // Will want to make this operation relatively fast. Right now it + // kind of sucks. + DynInstPtr &top(); + + void pop(); + + inline bool empty() + { return readyInsts.empty(); } + + private: + typedef typename std::set<InstSeqNum>::iterator sn_it_t; + typedef typename std::map<InstSeqNum, DynInstPtr>::iterator dyn_it_t; + + // Forward declarations so that the following two typedefs work. 
+ class Dependency; + class ltDependency; + + typedef typename std::set<Dependency, ltDependency>::iterator dep_it_t; + typedef typename std::map<InstSeqNum, vector<dep_it_t> >::iterator + sd_it_t; + + struct Dependency { + Dependency(const InstSeqNum &_seqNum) + : seqNum(_seqNum), regsReady(0), memDepReady(0) + { } + + Dependency(const InstSeqNum &_seqNum, bool _regsReady, + bool _memDepReady) + : seqNum(_seqNum), regsReady(_regsReady), + memDepReady(_memDepReady) + { } + + InstSeqNum seqNum; + mutable bool regsReady; + mutable bool memDepReady; + mutable sd_it_t storeDep; + }; + + struct ltDependency { + bool operator() (const Dependency &lhs, const Dependency &rhs) + { + return lhs.seqNum < rhs.seqNum; + } + }; + + + private: + inline void moveToReady(dep_it_t &woken_inst); + private: /** List of instructions that have passed through rename, yet are still - * waiting on a memory dependence to resolve before they can issue. + * waiting on either a memory dependence to resolve or source registers to + * become available before they can issue. */ - std::set<InstSeqNum> renamedInsts; + std::set<Dependency, ltDependency> waitingInsts; /** List of instructions that have all their predicted memory dependences - * resolved. They are ready in terms of being free of memory - * dependences; however they may still have to wait on source registers. + * resolved and their source registers ready. */ std::set<InstSeqNum> readyInsts; - std::map<InstSeqNum, vector<InstSeqNum> > dependencies; + // Change this to hold a vector of iterators, which will point to the + // entry of the waiting instructions. + /** List of stores' sequence numbers, each of which has a vector of + * iterators. The iterators point to the appropriate node within + * waitingInsts that has the depenendent instruction. 
+ */ + std::map<InstSeqNum, vector<dep_it_t> > storeDependents; + + // For now will implement this as a map...hash table might not be too + // bad, or could move to something that mimics the current dependency + // graph. + std::map<InstSeqNum, DynInstPtr> memInsts; + + // Iterator pointer to the top instruction which has is ready. + // Is set by the top() call. + dyn_it_t topInst; /** The memory dependence predictor. It is accessed upon new * instructions being added to the IQ, and responds by telling @@ -65,6 +131,10 @@ class MemDepUnit { */ MemDepPred depPred; + Stats::Scalar<> insertedLoads; + Stats::Scalar<> insertedStores; + Stats::Scalar<> conflictingLoads; + Stats::Scalar<> conflictingStores; }; #endif diff --git a/cpu/beta_cpu/mem_dep_unit_impl.hh b/cpu/beta_cpu/mem_dep_unit_impl.hh index 4299acb7a..4161ac2a8 100644 --- a/cpu/beta_cpu/mem_dep_unit_impl.hh +++ b/cpu/beta_cpu/mem_dep_unit_impl.hh @@ -3,108 +3,301 @@ #include "cpu/beta_cpu/mem_dep_unit.hh" -// Hack: dependence predictor sizes are hardcoded. 
template <class MemDepPred, class Impl> MemDepUnit<MemDepPred, Impl>::MemDepUnit(Params ¶ms) - : depPred(4028, 128) + : depPred(params.SSITSize, params.LFSTSize) { DPRINTF(MemDepUnit, "MemDepUnit: Creating MemDepUnit object.\n"); } template <class MemDepPred, class Impl> void +MemDepUnit<MemDepPred, Impl>::regStats() +{ + insertedLoads + .name(name() + ".memDep.insertedLoads") + .desc("Number of loads inserted to the mem dependence unit."); + + insertedStores + .name(name() + ".memDep.insertedStores") + .desc("Number of stores inserted to the mem dependence unit."); + + conflictingLoads + .name(name() + ".memDep.conflictingLoads") + .desc("Number of conflicting loads."); + + conflictingStores + .name(name() + ".memDep.conflictingStores") + .desc("Number of conflicting stores."); +} + +template <class MemDepPred, class Impl> +void MemDepUnit<MemDepPred, Impl>::insert(DynInstPtr &inst) { InstSeqNum inst_seq_num = inst->seqNum; + Dependency unresolved_dependencies(inst_seq_num); InstSeqNum producing_store = depPred.checkInst(inst->readPC()); if (producing_store == 0 || - dependencies.find(producing_store) == dependencies.end()) { - readyInsts.insert(inst_seq_num); + storeDependents.find(producing_store) == storeDependents.end()) { + + DPRINTF(MemDepUnit, "MemDepUnit: No dependency for inst PC " + "%#x.\n", inst->readPC()); + + unresolved_dependencies.storeDep = storeDependents.end(); + + if (inst->readyToIssue()) { + readyInsts.insert(inst_seq_num); + } else { + unresolved_dependencies.memDepReady = true; + + waitingInsts.insert(unresolved_dependencies); + } } else { + DPRINTF(MemDepUnit, "MemDepUnit: Adding to dependency list; " + "inst PC %#x is dependent on seq num %i.\n", + inst->readPC(), producing_store); + + if (inst->readyToIssue()) { + unresolved_dependencies.regsReady = true; + } + + // Find the store that this instruction is dependent on. 
+ sd_it_t store_loc = storeDependents.find(producing_store); + + assert(store_loc != storeDependents.end()); + + // Record the location of the store that this instruction is + // dependent on. + unresolved_dependencies.storeDep = store_loc; + // If it's not already ready, then add it to the renamed // list and the dependencies. - renamedInsts.insert(inst_seq_num); + dep_it_t inst_loc = + (waitingInsts.insert(unresolved_dependencies)).first; + + // Add this instruction to the list of dependents. + (*store_loc).second.push_back(inst_loc); + + assert(!(*store_loc).second.empty()); - dependencies[producing_store].push_back(inst_seq_num); + if (inst->isLoad()) { + ++conflictingLoads; + } else { + ++conflictingStores; + } } if (inst->isStore()) { + DPRINTF(MemDepUnit, "MemDepUnit: Inserting store PC %#x.\n", + inst->readPC()); + depPred.insertStore(inst->readPC(), inst_seq_num); // Make sure this store isn't already in this list. - assert(dependencies.find(inst_seq_num) == dependencies.end()); + assert(storeDependents.find(inst_seq_num) == storeDependents.end()); // Put a dependency entry in at the store's sequence number. // Uh, not sure how this works...I want to create an entry but // I don't have anything to put into the value yet. - dependencies[inst_seq_num]; - } else if (!inst->isLoad()) { + storeDependents[inst_seq_num]; + + assert(storeDependents.size() != 0); + + ++insertedStores; + + } else if (inst->isLoad()) { + ++insertedLoads; + } else { + panic("MemDepUnit: Unknown type! (most likely a barrier)."); + } + + memInsts[inst_seq_num] = inst; +} + +template <class MemDepPred, class Impl> +void +MemDepUnit<MemDepPred, Impl>::insertNonSpec(DynInstPtr &inst) +{ + InstSeqNum inst_seq_num = inst->seqNum; + + Dependency non_spec_inst(inst_seq_num); + + non_spec_inst.storeDep = storeDependents.end(); + + waitingInsts.insert(non_spec_inst); + + // Might want to turn this part into an inline function or something. + // It's shared between both insert functions. 
+ if (inst->isStore()) { + DPRINTF(MemDepUnit, "MemDepUnit: Inserting store PC %#x.\n", + inst->readPC()); + + depPred.insertStore(inst->readPC(), inst_seq_num); + + // Make sure this store isn't already in this list. + assert(storeDependents.find(inst_seq_num) == storeDependents.end()); + + // Put a dependency entry in at the store's sequence number. + // Uh, not sure how this works...I want to create an entry but + // I don't have anything to put into the value yet. + storeDependents[inst_seq_num]; + + assert(storeDependents.size() != 0); + + ++insertedStores; + + } else if (inst->isLoad()) { + ++insertedLoads; + } else { panic("MemDepUnit: Unknown type! (most likely a barrier)."); } + + memInsts[inst_seq_num] = inst; +} + +template <class MemDepPred, class Impl> +typename Impl::DynInstPtr & +MemDepUnit<MemDepPred, Impl>::top() +{ + topInst = memInsts.find( (*readyInsts.begin()) ); + + DPRINTF(MemDepUnit, "MemDepUnit: Top instruction is PC %#x.\n", + (*topInst).second->readPC()); + + return (*topInst).second; } template <class MemDepPred, class Impl> -bool -MemDepUnit<MemDepPred, Impl>::readyToIssue(DynInstPtr &inst) +void +MemDepUnit<MemDepPred, Impl>::pop() { + DPRINTF(MemDepUnit, "MemDepUnit: Removing instruction PC %#x.\n", + (*topInst).second->readPC()); + + wakeDependents((*topInst).second); + + issue((*topInst).second); + + memInsts.erase(topInst); + + topInst = memInsts.end(); +} + +template <class MemDepPred, class Impl> +void +MemDepUnit<MemDepPred, Impl>::regsReady(DynInstPtr &inst) +{ + DPRINTF(MemDepUnit, "MemDepUnit: Marking registers as ready for " + "instruction PC %#x.\n", + inst->readPC()); + InstSeqNum inst_seq_num = inst->seqNum; - if (readyInsts.find(inst_seq_num) == readyInsts.end()) { - return false; + Dependency inst_to_find(inst_seq_num); + + dep_it_t waiting_inst = waitingInsts.find(inst_to_find); + + assert(waiting_inst != waitingInsts.end()); + + if ((*waiting_inst).memDepReady) { + DPRINTF(MemDepUnit, "MemDepUnit: Instruction has its 
memory " + "dependencies resolved, adding it to the ready list.\n"); + + moveToReady(waiting_inst); } else { - return true; + DPRINTF(MemDepUnit, "MemDepUnit: Instruction still waiting on " + "memory dependency.\n"); + + (*waiting_inst).regsReady = true; } } template <class MemDepPred, class Impl> void +MemDepUnit<MemDepPred, Impl>::nonSpecInstReady(DynInstPtr &inst) +{ + DPRINTF(MemDepUnit, "MemDepUnit: Marking non speculative " + "instruction PC %#x as ready.\n", + inst->readPC()); + + InstSeqNum inst_seq_num = inst->seqNum; + + Dependency inst_to_find(inst_seq_num); + + dep_it_t waiting_inst = waitingInsts.find(inst_to_find); + + assert(waiting_inst != waitingInsts.end()); + + moveToReady(waiting_inst); +} + +template <class MemDepPred, class Impl> +void MemDepUnit<MemDepPred, Impl>::issue(DynInstPtr &inst) { assert(readyInsts.find(inst->seqNum) != readyInsts.end()); + DPRINTF(MemDepUnit, "MemDepUnit: Issuing instruction PC %#x.\n", + inst->readPC()); + // Remove the instruction from the ready list. readyInsts.erase(inst->seqNum); + + depPred.issued(inst->readPC(), inst->seqNum, inst->isStore()); } template <class MemDepPred, class Impl> void MemDepUnit<MemDepPred, Impl>::wakeDependents(DynInstPtr &inst) { + // Only stores have dependents. + if (!inst->isStore()) { + return; + } + // Wake any dependencies. - dep_it_t dep_it = dependencies.find(inst); + sd_it_t sd_it = storeDependents.find(inst->seqNum); // If there's no entry, then return. Really there should only be // no entry if the instruction is a load. 
- if (dep_it == dependencies.end()) { + if (sd_it == storeDependents.end()) { + DPRINTF(MemDepUnit, "MemDepUnit: Instruction PC %#x, sequence " + "number %i has no dependents.\n", + inst->readPC(), inst->seqNum); + return; } - assert(inst->isStore()); - - for(int i = 0; i < (*dep_it).second.size(); ++i ) { - InstSeqNum woken_inst = (*dep_it).second[i]; + for (int i = 0; i < (*sd_it).second.size(); ++i ) { + dep_it_t woken_inst = (*sd_it).second[i]; + DPRINTF(MemDepUnit, "MemDepUnit: Waking up a dependent inst, " + "sequence number %i.\n", + (*woken_inst).seqNum); +#if 0 // Should we have reached instructions that are actually squashed, // there will be no more useful instructions in this dependency // list. Break out early. - if (renamedInsts.find(woken_inst) == renamedInsts.end()) { + if (waitingInsts.find(woken_inst) == waitingInsts.end()) { DPRINTF(MemDepUnit, "MemDepUnit: Dependents on inst PC %#x " "are squashed, starting at SN %i. Breaking early.\n", inst->readPC(), woken_inst); break; } +#endif - // Remove it from the renamed instructions. - renamedInsts.erase(woken_inst); - - // Add it to the ready list. - readyInsts.insert(woken_inst); + if ((*woken_inst).regsReady) { + moveToReady(woken_inst); + } else { + (*woken_inst).memDepReady = true; + } } - dependencies.erase(dep_it); + storeDependents.erase(sd_it); } template <class MemDepPred, class Impl> @@ -112,17 +305,30 @@ void MemDepUnit<MemDepPred, Impl>::squash(const InstSeqNum &squashed_num) { - if (!renamedInsts.empty()) { - sn_it_t renamed_it = renamedInsts.end(); + if (!waitingInsts.empty()) { + dep_it_t waiting_it = waitingInsts.end(); - --renamed_it; + --waiting_it; // Remove entries from the renamed list as long as we haven't reached // the end and the entries continue to be younger than the squashed. 
- while (!renamedInsts.empty() && - (*renamed_it) > squashed_num) + while (!waitingInsts.empty() && + (*waiting_it).seqNum > squashed_num) { - renamedInsts.erase(renamed_it--); + if (!(*waiting_it).memDepReady && + (*waiting_it).storeDep != storeDependents.end()) { + sd_it_t sd_it = (*waiting_it).storeDep; + + // Make sure the iterator that the store has pointing + // back is actually to this instruction. + assert((*sd_it).second.back() == waiting_it); + + // Now remove this from the store's list of dependent + // instructions. + (*sd_it).second.pop_back(); + } + + waitingInsts.erase(waiting_it--); } } @@ -139,16 +345,19 @@ MemDepUnit<MemDepPred, Impl>::squash(const InstSeqNum &squashed_num) } } - if (!dependencies.empty()) { - dep_it_t dep_it = dependencies.end(); + if (!storeDependents.empty()) { + sd_it_t dep_it = storeDependents.end(); --dep_it; // Same for the dependencies list. - while (!dependencies.empty() && + while (!storeDependents.empty() && (*dep_it).first > squashed_num) { - dependencies.erase(dep_it--); + // This store's list of dependent instructions should be empty. + assert((*dep_it).second.empty()); + + storeDependents.erase(dep_it--); } } @@ -161,6 +370,23 @@ void MemDepUnit<MemDepPred, Impl>::violation(DynInstPtr &store_inst, DynInstPtr &violating_load) { + DPRINTF(MemDepUnit, "MemDepUnit: Passing violating PCs to store sets," + " load: %#x, store: %#x\n", violating_load->readPC(), + store_inst->readPC()); // Tell the memory dependence unit of the violation. depPred.violation(violating_load->readPC(), store_inst->readPC()); } + +template <class MemDepPred, class Impl> +inline void +MemDepUnit<MemDepPred, Impl>::moveToReady(dep_it_t &woken_inst) +{ + DPRINTF(MemDepUnit, "MemDepUnit: Adding instruction sequence number %i " + "to the ready list.\n", (*woken_inst).seqNum); + + // Add it to the ready list. + readyInsts.insert((*woken_inst).seqNum); + + // Remove it from the waiting instructions. 
+ waitingInsts.erase(woken_inst); +} diff --git a/cpu/beta_cpu/ras.cc b/cpu/beta_cpu/ras.cc new file mode 100644 index 000000000..ca05f5a0d --- /dev/null +++ b/cpu/beta_cpu/ras.cc @@ -0,0 +1,42 @@ +#include "cpu/beta_cpu/ras.hh" + +ReturnAddrStack::ReturnAddrStack(unsigned _numEntries) + : numEntries(_numEntries), usedEntries(0), + tos(0) +{ + addrStack = new Addr[numEntries](0); +} + +void +ReturnAddrStack::push(const Addr &return_addr) +{ + incrTos(); + + addrStack[tos] = return_addr; + + if (usedEntries != numEntries) { + ++usedEntries; + } +} + +void +ReturnAddrStack::pop() +{ + // Not sure it's possible to really track usedEntries properly. +// assert(usedEntries > 0); + + if (usedEntries > 0) { + --usedEntries; + } + + decrTos(); +} + +void +ReturnAddrStack::restore(unsigned top_entry_idx, + const Addr &restored_target) +{ + tos = top_entry_idx; + + addrStack[tos] = restored_target; +} diff --git a/cpu/beta_cpu/ras.hh b/cpu/beta_cpu/ras.hh new file mode 100644 index 000000000..7666f825f --- /dev/null +++ b/cpu/beta_cpu/ras.hh @@ -0,0 +1,40 @@ +#ifndef __RAS_HH__ +#define __RAS_HH__ + +// For Addr type. +#include "arch/alpha/isa_traits.hh" + +class ReturnAddrStack +{ + public: + ReturnAddrStack(unsigned numEntries); + + Addr top() + { return addrStack[tos]; } + + unsigned topIdx() + { return tos; } + + void push(const Addr &return_addr); + + void pop(); + + void restore(unsigned top_entry_idx, const Addr &restored_target); + + private: + inline void incrTos() + { tos = (tos + 1) % numEntries; } + + inline void decrTos() + { tos = (tos == 0 ? numEntries - 1 : tos - 1); } + + Addr *addrStack; + + unsigned numEntries; + + unsigned usedEntries; + + unsigned tos; +}; + +#endif // __RAS_HH__ diff --git a/cpu/beta_cpu/regfile.hh b/cpu/beta_cpu/regfile.hh index aba897fdc..148d9408a 100644 --- a/cpu/beta_cpu/regfile.hh +++ b/cpu/beta_cpu/regfile.hh @@ -54,7 +54,7 @@ class PhysRegFile // Remove the base Float reg dependency. 
reg_idx = reg_idx - numPhysicalIntRegs; - assert(reg_idx < numPhysicalFloatRegs); + assert(reg_idx < numPhysicalFloatRegs + numPhysicalIntRegs); DPRINTF(IEW, "RegFile: Access to float register %i as single, has " "data %8.8f\n", int(reg_idx), (float)floatRegFile[reg_idx].d); @@ -67,7 +67,7 @@ class PhysRegFile // Remove the base Float reg dependency. reg_idx = reg_idx - numPhysicalIntRegs; - assert(reg_idx < numPhysicalFloatRegs); + assert(reg_idx < numPhysicalFloatRegs + numPhysicalIntRegs); DPRINTF(IEW, "RegFile: Access to float register %i as double, has " " data %8.8f\n", int(reg_idx), floatRegFile[reg_idx].d); @@ -80,7 +80,7 @@ class PhysRegFile // Remove the base Float reg dependency. reg_idx = reg_idx - numPhysicalIntRegs; - assert(reg_idx < numPhysicalFloatRegs); + assert(reg_idx < numPhysicalFloatRegs + numPhysicalIntRegs); DPRINTF(IEW, "RegFile: Access to float register %i as int, has data " "%lli\n", int(reg_idx), floatRegFile[reg_idx].q); @@ -103,7 +103,7 @@ class PhysRegFile // Remove the base Float reg dependency. reg_idx = reg_idx - numPhysicalIntRegs; - assert(reg_idx < numPhysicalFloatRegs); + assert(reg_idx < numPhysicalFloatRegs + numPhysicalIntRegs); DPRINTF(IEW, "RegFile: Setting float register %i to %8.8f\n", int(reg_idx), val); @@ -116,7 +116,7 @@ class PhysRegFile // Remove the base Float reg dependency. reg_idx = reg_idx - numPhysicalIntRegs; - assert(reg_idx < numPhysicalFloatRegs); + assert(reg_idx < numPhysicalFloatRegs + numPhysicalIntRegs); DPRINTF(IEW, "RegFile: Setting float register %i to %8.8f\n", int(reg_idx), val); @@ -129,7 +129,7 @@ class PhysRegFile // Remove the base Float reg dependency. 
reg_idx = reg_idx - numPhysicalIntRegs; - assert(reg_idx < numPhysicalFloatRegs); + assert(reg_idx < numPhysicalFloatRegs + numPhysicalIntRegs); DPRINTF(IEW, "RegFile: Setting float register %i to %lli\n", int(reg_idx), val); diff --git a/cpu/beta_cpu/rename.hh b/cpu/beta_cpu/rename.hh index 9f031012a..3e6b873ae 100644 --- a/cpu/beta_cpu/rename.hh +++ b/cpu/beta_cpu/rename.hh @@ -54,6 +54,8 @@ class SimpleRename public: SimpleRename(Params ¶ms); + void regStats(); + void setCPU(FullCPU *cpu_ptr); void setTimeBuffer(TimeBuffer<TimeStruct> *tb_ptr); @@ -182,6 +184,22 @@ class SimpleRename * group of instructions, it can restart at the proper instruction. */ unsigned numInst; + + Stats::Scalar<> renameSquashCycles; + Stats::Scalar<> renameIdleCycles; + Stats::Scalar<> renameBlockCycles; + Stats::Scalar<> renameUnblockCycles; + Stats::Scalar<> renameRenamedInsts; + Stats::Scalar<> renameSquashedInsts; + Stats::Scalar<> renameROBFullEvents; + Stats::Scalar<> renameIQFullEvents; + Stats::Scalar<> renameFullRegistersEvents; + Stats::Scalar<> renameRenamedOperands; + Stats::Scalar<> renameRenameLookups; + Stats::Scalar<> renameHBPlaceHolders; + Stats::Scalar<> renameCommittedMaps; + Stats::Scalar<> renameUndoneMaps; + Stats::Scalar<> renameValidUndoneMaps; }; #endif // __SIMPLE_RENAME_HH__ diff --git a/cpu/beta_cpu/rename_impl.hh b/cpu/beta_cpu/rename_impl.hh index 47464d961..5a8e499e9 100644 --- a/cpu/beta_cpu/rename_impl.hh +++ b/cpu/beta_cpu/rename_impl.hh @@ -16,6 +16,72 @@ SimpleRename<Impl>::SimpleRename(Params ¶ms) template <class Impl> void +SimpleRename<Impl>::regStats() +{ + renameSquashCycles + .name(name() + ".renameSquashCycles") + .desc("Number of cycles rename is squashing") + .prereq(renameSquashCycles); + renameIdleCycles + .name(name() + ".renameIdleCycles") + .desc("Number of cycles rename is idle") + .prereq(renameIdleCycles); + renameBlockCycles + .name(name() + ".renameBlockCycles") + .desc("Number of cycles rename is blocking") + 
.prereq(renameBlockCycles); + renameUnblockCycles + .name(name() + ".renameUnblockCycles") + .desc("Number of cycles rename is unblocking") + .prereq(renameUnblockCycles); + renameRenamedInsts + .name(name() + ".renameRenamedInsts") + .desc("Number of instructions processed by rename") + .prereq(renameRenamedInsts); + renameSquashedInsts + .name(name() + ".renameSquashedInsts") + .desc("Number of squashed instructions processed by rename") + .prereq(renameSquashedInsts); + renameROBFullEvents + .name(name() + ".renameROBFullEvents") + .desc("Number of times rename has considered the ROB 'full'") + .prereq(renameROBFullEvents); + renameIQFullEvents + .name(name() + ".renameIQFullEvents") + .desc("Number of times rename has considered the IQ 'full'") + .prereq(renameIQFullEvents); + renameFullRegistersEvents + .name(name() + ".renameFullRegisterEvents") + .desc("Number of times there has been no free registers") + .prereq(renameFullRegistersEvents); + renameRenamedOperands + .name(name() + ".renameRenamedOperands") + .desc("Number of destination operands rename has renamed") + .prereq(renameRenamedOperands); + renameRenameLookups + .name(name() + ".renameRenameLookups") + .desc("Number of register rename lookups that rename has made") + .prereq(renameRenameLookups); + renameHBPlaceHolders + .name(name() + ".renameHBPlaceHolders") + .desc("Number of place holders added to the history buffer") + .prereq(renameHBPlaceHolders); + renameCommittedMaps + .name(name() + ".renameCommittedMaps") + .desc("Number of HB maps that are committed") + .prereq(renameCommittedMaps); + renameUndoneMaps + .name(name() + ".renameUndoneMaps") + .desc("Number of HB maps that are undone due to squashing") + .prereq(renameUndoneMaps); + renameValidUndoneMaps + .name(name() + ".renameValidUndoneMaps") + .desc("Number of HB maps that are undone, and are not place holders") + .prereq(renameValidUndoneMaps); +} + +template <class Impl> +void SimpleRename<Impl>::setCPU(FullCPU *cpu_ptr) { 
DPRINTF(Rename, "Rename: Setting CPU pointer.\n"); @@ -59,7 +125,6 @@ SimpleRename<Impl>::setDecodeQueue(TimeBuffer<DecodeStruct> *dq_ptr) // Setup wire to get information from decode. fromDecode = decodeQueue->getWire(-decodeToRenameDelay); - } template <class Impl> @@ -124,7 +189,7 @@ SimpleRename<Impl>::unblock() // continue to tell previous stages to stall. They will be // able to restart once the skid buffer is empty. if (!skidBuffer.empty()) { - toDecode->renameInfo.stall = true; + toDecode->renameInfo.stall = true; } else { DPRINTF(Rename, "Rename: Done unblocking.\n"); _status = Running; @@ -136,7 +201,6 @@ void SimpleRename<Impl>::doSquash() { typename list<RenameHistory>::iterator hb_it = historyBuffer.begin(); -// typename list<RenameHistory>::iterator delete_it; InstSeqNum squashed_seq_num = fromCommit->commitInfo.doneSeqNum; @@ -154,6 +218,8 @@ SimpleRename<Impl>::doSquash() // they did and freeing up the registers. while ((*hb_it).instSeqNum > squashed_seq_num) { + assert(hb_it != historyBuffer.end()); + DPRINTF(Rename, "Rename: Removing history entry with sequence " "number %i.\n", (*hb_it).instSeqNum); @@ -165,15 +231,13 @@ SimpleRename<Impl>::doSquash() // Put the renamed physical register back on the free list. freeList->addReg(hb_it->newPhysReg); - } -// delete_it = hb_it; - -// hb_it++; + ++renameValidUndoneMaps; + } historyBuffer.erase(hb_it++); - assert(hb_it != historyBuffer.end()); + ++renameUndoneMaps; } } @@ -196,9 +260,6 @@ SimpleRename<Impl>::squash() doSquash(); } -// In the future, when a SmartPtr is used for DynInst, then this function -// itself can handle returning the instruction's physical registers to -// the free list. 
template<class Impl> void SimpleRename<Impl>::removeFromHistory(InstSeqNum inst_seq_num) @@ -233,19 +294,20 @@ SimpleRename<Impl>::removeFromHistory(InstSeqNum inst_seq_num) if (!(*hb_it).placeHolder) { freeList->addReg((*hb_it).prevPhysReg); + ++renameCommittedMaps; } historyBuffer.erase(hb_it--); } - // Finally free up the previous register of the squashed instruction + // Finally free up the previous register of the finished instruction // itself. if (!(*hb_it).placeHolder) { freeList->addReg(hb_it->prevPhysReg); + ++renameCommittedMaps; } historyBuffer.erase(hb_it); - } template <class Impl> @@ -263,7 +325,7 @@ SimpleRename<Impl>::renameSrcRegs(DynInstPtr &inst) // Look up the source registers to get the phys. register they've // been renamed to, and set the sources to those registers. - RegIndex renamed_reg = renameMap->lookup(src_reg); + PhysRegIndex renamed_reg = renameMap->lookup(src_reg); DPRINTF(Rename, "Rename: Looking up arch reg %i, got " "physical reg %i.\n", (int)src_reg, (int)renamed_reg); @@ -278,6 +340,8 @@ SimpleRename<Impl>::renameSrcRegs(DynInstPtr &inst) inst->markSrcRegReady(src_idx); } + + ++renameRenameLookups; } } @@ -289,40 +353,6 @@ SimpleRename<Impl>::renameDestRegs(DynInstPtr &inst) unsigned num_dest_regs = inst->numDestRegs(); - // Rename the destination registers. - for (int dest_idx = 0; dest_idx < num_dest_regs; dest_idx++) - { - RegIndex dest_reg = inst->destRegIdx(dest_idx); - - // Get the physical register that the destination will be - // renamed to. - rename_result = renameMap->rename(dest_reg); - - DPRINTF(Rename, "Rename: Renaming arch reg %i to physical " - "reg %i.\n", (int)dest_reg, - (int)rename_result.first); - - // Record the rename information so that a history can be kept. 
- RenameHistory hb_entry(inst->seqNum, dest_reg, - rename_result.first, - rename_result.second); - - historyBuffer.push_front(hb_entry); - - DPRINTF(Rename, "Rename: Adding instruction to history buffer, " - "sequence number %lli.\n", - (*historyBuffer.begin()).instSeqNum); - - // Tell the instruction to rename the appropriate destination - // register (dest_idx) to the new physical register - // (rename_result.first), and record the previous physical - // register that the same logical register was renamed to - // (rename_result.second). - inst->renameDestReg(dest_idx, - rename_result.first, - rename_result.second); - } - // If it's an instruction with no destination registers, then put // a placeholder within the history buffer. It might be better // to not put it in the history buffer at all (other than branches, @@ -337,6 +367,45 @@ SimpleRename<Impl>::renameDestRegs(DynInstPtr &inst) DPRINTF(Rename, "Rename: Adding placeholder instruction to " "history buffer, sequence number %lli.\n", inst->seqNum); + + ++renameHBPlaceHolders; + } else { + + // Rename the destination registers. + for (int dest_idx = 0; dest_idx < num_dest_regs; dest_idx++) + { + RegIndex dest_reg = inst->destRegIdx(dest_idx); + + // Get the physical register that the destination will be + // renamed to. + rename_result = renameMap->rename(dest_reg); + + DPRINTF(Rename, "Rename: Renaming arch reg %i to physical " + "reg %i.\n", (int)dest_reg, + (int)rename_result.first); + + // Record the rename information so that a history can be kept. 
+ RenameHistory hb_entry(inst->seqNum, dest_reg, + rename_result.first, + rename_result.second); + + historyBuffer.push_front(hb_entry); + + DPRINTF(Rename, "Rename: Adding instruction to history buffer, " + "sequence number %lli.\n", + (*historyBuffer.begin()).instSeqNum); + + // Tell the instruction to rename the appropriate destination + // register (dest_idx) to the new physical register + // (rename_result.first), and record the previous physical + // register that the same logical register was renamed to + // (rename_result.second). + inst->renameDestReg(dest_idx, + rename_result.first, + rename_result.second); + + ++renameRenamedOperands; + } } } @@ -379,6 +448,8 @@ SimpleRename<Impl>::tick() // buffer were used. Remove those instructions and handle // the rest of unblocking. if (_status == Unblocking) { + ++renameUnblockCycles; + if (fromDecode->size > 0) { // Add the current inputs onto the skid buffer, so they can be // reprocessed when this stage unblocks. @@ -388,6 +459,8 @@ SimpleRename<Impl>::tick() unblock(); } } else if (_status == Blocked) { + ++renameBlockCycles; + // If stage is blocked and still receiving valid instructions, // make sure to store them in the skid buffer. if (fromDecode->size > 0) { @@ -425,6 +498,8 @@ SimpleRename<Impl>::tick() return; } } else if (_status == Squashing) { + ++renameSquashCycles; + if (fromCommit->commitInfo.squash) { squash(); } else if (!fromCommit->commitInfo.squash && @@ -439,7 +514,13 @@ SimpleRename<Impl>::tick() // Ugly code, revamp all of the tick() functions eventually. if (fromCommit->commitInfo.doneSeqNum != 0 && _status != Squashing) { +#ifndef FULL_SYSTEM + if (!fromCommit->commitInfo.squash) { + removeFromHistory(fromCommit->commitInfo.doneSeqNum); + } +#else removeFromHistory(fromCommit->commitInfo.doneSeqNum); +#endif } // Perhaps put this outside of this function, since this will @@ -539,6 +620,12 @@ SimpleRename<Impl>::rename() // Tell previous stage to stall. 
toDecode->renameInfo.stall = true; + if (free_rob_entries <= 0) { + ++renameROBFullEvents; + } else { + ++renameIQFullEvents; + } + return; } else if (min_iq_rob < insts_available) { DPRINTF(Rename, "Rename: Will have to block this cycle. Only " @@ -548,6 +635,12 @@ SimpleRename<Impl>::rename() insts_available = min_iq_rob; block_this_cycle = true; + + if (free_rob_entries < free_iq_entries) { + ++renameROBFullEvents; + } else { + ++renameIQFullEvents; + } } while (insts_available > 0) { @@ -566,6 +659,8 @@ SimpleRename<Impl>::rename() // Go to the next instruction. ++numInst; + ++renameSquashedInsts; + // Decrement how many instructions are available. --insts_available; @@ -606,6 +701,8 @@ SimpleRename<Impl>::rename() block_this_cycle = true; + ++renameFullRegistersEvents; + break; } @@ -625,6 +722,8 @@ SimpleRename<Impl>::rename() ++to_iew_index; ++numInst; + ++renameRenamedInsts; + // Decrement how many instructions are available. --insts_available; } diff --git a/cpu/beta_cpu/rename_map.cc b/cpu/beta_cpu/rename_map.cc index cb9720d28..1301202f2 100644 --- a/cpu/beta_cpu/rename_map.cc +++ b/cpu/beta_cpu/rename_map.cc @@ -72,7 +72,7 @@ SimpleRenameMap::SimpleRenameMap(unsigned _numLogicalIntRegs, floatRenameMap[index].physical_reg = float_reg_idx++; } - for (RegIndex index = numPhysicalIntRegs; + for (PhysRegIndex index = numPhysicalIntRegs; index < numPhysicalIntRegs + numLogicalFloatRegs; ++index) { floatScoreboard[index] = 1; @@ -88,7 +88,7 @@ SimpleRenameMap::SimpleRenameMap(unsigned _numLogicalIntRegs, } // Initialize the entries in the misc register scoreboard to be ready. 
- for (RegIndex index = numPhysicalRegs; + for (PhysRegIndex index = numPhysicalRegs; index < numPhysicalRegs + numMiscRegs; ++index) { miscScoreboard[index] = 1; diff --git a/cpu/beta_cpu/rob_impl.hh b/cpu/beta_cpu/rob_impl.hh index 862008429..86c4e2db1 100644 --- a/cpu/beta_cpu/rob_impl.hh +++ b/cpu/beta_cpu/rob_impl.hh @@ -139,9 +139,7 @@ bool ROB<Impl>::isHeadReady() { if (numInstsInROB != 0) { - DynInstPtr head_inst = cpu->instList.front(); - - return head_inst->readyToCommit(); + return cpu->instList.front()->readyToCommit(); } return false; diff --git a/cpu/beta_cpu/store_set.cc b/cpu/beta_cpu/store_set.cc index 46d763d37..a5458685d 100644 --- a/cpu/beta_cpu/store_set.cc +++ b/cpu/beta_cpu/store_set.cc @@ -5,6 +5,8 @@ StoreSet::StoreSet(int _SSIT_size, int _LFST_size) : SSIT_size(_SSIT_size), LFST_size(_LFST_size) { DPRINTF(StoreSet, "StoreSet: Creating store set object.\n"); + DPRINTF(StoreSet, "StoreSet: SSIT size: %i, LFST size: %i.\n", + SSIT_size, LFST_size); SSIT = new SSID[SSIT_size]; @@ -31,11 +33,13 @@ StoreSet::StoreSet(int _SSIT_size, int _LFST_size) } void -StoreSet::violation(Addr load_PC, Addr store_PC) +StoreSet::violation(Addr store_PC, Addr load_PC) { int load_index = calcIndex(load_PC); int store_index = calcIndex(store_PC); + assert(load_index < SSIT_size && store_index < SSIT_size); + bool valid_load_SSID = validSSIT[load_index]; bool valid_store_SSID = validSSIT[store_index]; @@ -51,7 +55,14 @@ StoreSet::violation(Addr load_PC, Addr store_PC) SSIT[store_index] = new_set; + assert(new_set < LFST_size); + SSCounters[new_set]++; + + + DPRINTF(StoreSet, "StoreSet: Neither load nor store had a valid " + "storeset, creating a new one: %i for load %#x, store %#x\n", + new_set, load_PC, store_PC); } else if (valid_load_SSID && !valid_store_SSID) { SSID load_SSID = SSIT[load_index]; @@ -59,7 +70,13 @@ StoreSet::violation(Addr load_PC, Addr store_PC) SSIT[store_index] = load_SSID; + assert(load_SSID < LFST_size); + SSCounters[load_SSID]++; + + 
DPRINTF(StoreSet, "StoreSet: Load had a valid store set. Adding " + "store to that set: %i for load %#x, store %#x\n", + load_SSID, load_PC, store_PC); } else if (!valid_load_SSID && valid_store_SSID) { SSID store_SSID = SSIT[store_index]; @@ -69,10 +86,16 @@ StoreSet::violation(Addr load_PC, Addr store_PC) // Because we are having a load point to an already existing set, // the size of the store set is not incremented. + + DPRINTF(StoreSet, "StoreSet: Store had a valid store set: %i for " + "load %#x, store %#x\n", + store_SSID, load_PC, store_PC); } else { SSID load_SSID = SSIT[load_index]; SSID store_SSID = SSIT[store_index]; + assert(load_SSID < LFST_size && store_SSID < LFST_size); + int load_SS_size = SSCounters[load_SSID]; int store_SS_size = SSCounters[store_SSID]; @@ -83,11 +106,19 @@ StoreSet::violation(Addr load_PC, Addr store_PC) SSCounters[load_SSID]++; SSCounters[store_SSID]--; + + DPRINTF(StoreSet, "StoreSet: Load had bigger store set: %i; " + "for load %#x, store %#x\n", + load_SSID, load_PC, store_PC); } else { SSIT[load_index] = store_SSID; SSCounters[store_SSID]++; SSCounters[load_SSID]--; + + DPRINTF(StoreSet, "StoreSet: Store had bigger store set: %i; " + "for load %#x, store %#x\n", + store_SSID, load_PC, store_PC); } } } @@ -106,6 +137,8 @@ StoreSet::insertStore(Addr store_PC, InstSeqNum store_seq_num) int store_SSID; + assert(index < SSIT_size); + if (!validSSIT[index]) { // Do nothing if there's no valid entry. return; @@ -116,6 +149,11 @@ StoreSet::insertStore(Addr store_PC, InstSeqNum store_seq_num) // Update the last store that was fetched with the current one. 
LFST[store_SSID] = store_seq_num; + + validLFST[store_SSID] = 1; + + DPRINTF(StoreSet, "Store %#x updated the LFST, SSID: %i\n", + store_PC, store_SSID); } } @@ -126,7 +164,12 @@ StoreSet::checkInst(Addr PC) int inst_SSID; + assert(index < SSIT_size); + if (!validSSIT[index]) { + DPRINTF(StoreSet, "Inst %#x with index %i had no SSID\n", + PC, index); + // Return 0 if there's no valid entry. return 0; } else { @@ -135,8 +178,15 @@ StoreSet::checkInst(Addr PC) assert(inst_SSID < LFST_size); if (!validLFST[inst_SSID]) { + + DPRINTF(StoreSet, "Inst %#x with index %i and SSID %i had no " + "dependency\n", PC, index, inst_SSID); + return 0; } else { + DPRINTF(StoreSet, "Inst %#x with index %i and SSID %i had LFST " + "inum of %i\n", PC, index, inst_SSID, LFST[inst_SSID]); + return LFST[inst_SSID]; } } @@ -154,14 +204,21 @@ StoreSet::issued(Addr issued_PC, InstSeqNum issued_seq_num, bool is_store) int store_SSID; + assert(index < SSIT_size); + // Make sure the SSIT still has a valid entry for the issued store. - assert(validSSIT[index]); + if (!validSSIT[index]) { + return; + } store_SSID = SSIT[index]; + assert(store_SSID < LFST_size); + // If the last fetched store in the store set refers to the store that // was just issued, then invalidate the entry. if (validLFST[store_SSID] && LFST[store_SSID] == issued_seq_num) { + DPRINTF(StoreSet, "StoreSet: store invalidated itself in LFST.\n"); validLFST[store_SSID] = false; } } @@ -170,9 +227,14 @@ void StoreSet::squash(InstSeqNum squashed_num) { // Not really sure how to do this well. + // Generally this is small enough that it should be okay; short circuit + // evaluation should take care of invalid entries. 
+ + DPRINTF(StoreSet, "StoreSet: Squashing until inum %i\n", + squashed_num); for (int i = 0; i < LFST_size; ++i) { - if (LFST[i] < squashed_num) { + if (validLFST[i] && LFST[i] < squashed_num) { validLFST[i] = false; } } diff --git a/cpu/beta_cpu/store_set.hh b/cpu/beta_cpu/store_set.hh index 701c60a2d..b634a180d 100644 --- a/cpu/beta_cpu/store_set.hh +++ b/cpu/beta_cpu/store_set.hh @@ -14,7 +14,7 @@ class StoreSet public: StoreSet(int SSIT_size, int LFST_size); - void violation(Addr load_PC, Addr store_PC); + void violation(Addr store_PC, Addr load_PC); void insertLoad(Addr load_PC, InstSeqNum load_seq_num); diff --git a/cpu/beta_cpu/tournament_pred.cc b/cpu/beta_cpu/tournament_pred.cc new file mode 100644 index 000000000..53a11326a --- /dev/null +++ b/cpu/beta_cpu/tournament_pred.cc @@ -0,0 +1,243 @@ +#include "cpu/beta_cpu/tournament_pred.hh" + +TournamentBP::SatCounter::SatCounter(unsigned bits) + : maxVal((1 << bits) - 1), counter(0) +{ +} + +TournamentBP::SatCounter::SatCounter(unsigned bits, unsigned initial_val) + : maxVal((1 << bits) - 1), counter(initial_val) +{ + // Check to make sure initial value doesn't exceed the max counter value. 
+ if (initial_val > maxVal) { + panic("BP: Initial counter value exceeds max size."); + } +} + +void +TournamentBP::SatCounter::increment() +{ + if (counter < maxVal) { + ++counter; + } +} + +void +TournamentBP::SatCounter::decrement() +{ + if (counter > 0) { + --counter; + } +} + +TournamentBP::TournamentBP(unsigned _local_predictor_size, + unsigned _local_ctr_bits, + unsigned _local_history_table_size, + unsigned _local_history_bits, + unsigned _global_predictor_size, + unsigned _global_ctr_bits, + unsigned _global_history_bits, + unsigned _choice_predictor_size, + unsigned _choice_ctr_bits, + unsigned _instShiftAmt) + : local_predictor_size(_local_predictor_size), + local_ctr_bits(_local_ctr_bits), + local_history_table_size(_local_history_table_size), + local_history_bits(_local_history_bits), + global_predictor_size(_global_predictor_size), + global_ctr_bits(_global_ctr_bits), + global_history_bits(_global_history_bits), + choice_predictor_size(_global_predictor_size), + choice_ctr_bits(_choice_ctr_bits), + instShiftAmt(_instShiftAmt) +{ + //Should do checks here to make sure sizes are correct (powers of 2) + + //Setup the array of counters for the local predictor + local_ctrs = new SatCounter[local_predictor_size](local_ctr_bits); + //Setup the history table for the local table + local_history_table = new unsigned[local_history_table_size](0); + // Setup the local history mask + localHistoryMask = (1 << local_history_bits) - 1; + + //Setup the array of counters for the global predictor + global_ctrs = new SatCounter[global_predictor_size](global_ctr_bits); + //Clear the global history + global_history = 0; + // Setup the global history mask + globalHistoryMask = (1 << global_history_bits) - 1; + + //Setup the array of counters for the choice predictor + choice_ctrs = new SatCounter[choice_predictor_size](choice_ctr_bits); + + threshold = (1 << (local_ctr_bits - 1)) - 1; + threshold = threshold / 2; +} + +inline +unsigned +TournamentBP::calcLocHistIdx(Addr 
&branch_addr) +{ + return (branch_addr >> instShiftAmt) & (local_history_table_size - 1); +} + +inline +void +TournamentBP::updateHistoriesTaken(unsigned local_history_idx) +{ + global_history = (global_history << 1) | 1; + global_history = global_history & globalHistoryMask; + + local_history_table[local_history_idx] = + (local_history_table[local_history_idx] << 1) | 1; +} + +inline +void +TournamentBP::updateHistoriesNotTaken(unsigned local_history_idx) +{ + global_history = (global_history << 1); + global_history = global_history & globalHistoryMask; + + local_history_table[local_history_idx] = + (local_history_table[local_history_idx] << 1); +} + +bool +TournamentBP::lookup(Addr &branch_addr) +{ + uint8_t local_prediction; + unsigned local_history_idx; + unsigned local_predictor_idx; + + uint8_t global_prediction; + uint8_t choice_prediction; + + //Lookup in the local predictor to get its branch prediction + local_history_idx = calcLocHistIdx(branch_addr); + local_predictor_idx = local_history_table[local_history_idx] + & localHistoryMask; + local_prediction = local_ctrs[local_predictor_idx].read(); + + //Lookup in the global predictor to get its branch prediction + global_prediction = global_ctrs[global_history].read(); + + //Lookup in the choice predictor to see which one to use + choice_prediction = choice_ctrs[global_history].read(); + + //@todo Put a threshold value in for the three predictors that can + // be set through the constructor (so this isn't hard coded). + //Also should put some of this code into functions. 
+ if (choice_prediction > threshold) { + if (global_prediction > threshold) { + updateHistoriesTaken(local_history_idx); + + assert(global_history < global_predictor_size && + local_history_idx < local_predictor_size); + + global_ctrs[global_history].increment(); + local_ctrs[local_history_idx].increment(); + + return true; + } else { + updateHistoriesNotTaken(local_history_idx); + + assert(global_history < global_predictor_size && + local_history_idx < local_predictor_size); + + global_ctrs[global_history].decrement(); + local_ctrs[local_history_idx].decrement(); + + return false; + } + } else { + if (local_prediction > threshold) { + updateHistoriesTaken(local_history_idx); + + assert(global_history < global_predictor_size && + local_history_idx < local_predictor_size); + + global_ctrs[global_history].increment(); + local_ctrs[local_history_idx].increment(); + + return true; + } else { + updateHistoriesNotTaken(local_history_idx); + + assert(global_history < global_predictor_size && + local_history_idx < local_predictor_size); + + global_ctrs[global_history].decrement(); + local_ctrs[local_history_idx].decrement(); + + return false; + } + } +} + +// Update the branch predictor if it predicted a branch wrong. +void +TournamentBP::update(Addr &branch_addr, unsigned correct_gh, bool taken) +{ + + uint8_t local_prediction; + unsigned local_history_idx; + unsigned local_predictor_idx; + bool local_pred_taken; + + uint8_t global_prediction; + bool global_pred_taken; + + // Load the correct global history into the register. 
+    global_history = correct_gh;
+
+    // Get the local predictor's current prediction, remove the incorrect
+    // update, and update the local predictor
+    local_history_idx = calcLocHistIdx(branch_addr);
+    local_predictor_idx = local_history_table[local_history_idx];
+    local_predictor_idx = (local_predictor_idx >> 1) & localHistoryMask;
+
+    local_prediction = local_ctrs[local_predictor_idx].read();
+    local_pred_taken = local_prediction > threshold;
+
+    //Get the global predictor's current prediction, and update the
+    //global predictor
+    global_prediction = global_ctrs[global_history].read();
+    global_pred_taken = global_prediction > threshold;
+
+    //Update the choice predictor to tell it which one was correct
+    if (local_pred_taken != global_pred_taken) {
+        //If the local prediction matches the actual outcome, decrement
+        //the counter. Otherwise increment the counter.
+        if (local_pred_taken == taken) {
+            choice_ctrs[global_history].decrement();
+        } else {
+            choice_ctrs[global_history].increment();
+        }
+    }
+
+    if (taken) {
+        assert(global_history < global_predictor_size &&
+               local_predictor_idx < local_predictor_size);
+
+        local_ctrs[local_predictor_idx].increment();
+        global_ctrs[global_history].increment();
+
+        global_history = (global_history << 1) | 1;
+        global_history = global_history & globalHistoryMask;
+
+        local_history_table[local_history_idx] |= 1;
+    }
+    else {
+        assert(global_history < global_predictor_size &&
+               local_predictor_idx < local_predictor_size);
+
+        local_ctrs[local_predictor_idx].decrement();
+        global_ctrs[global_history].decrement();
+
+        global_history = (global_history << 1);
+        global_history = global_history & globalHistoryMask;
+
+        local_history_table[local_history_idx] &= ~1;
+    }
+}
diff --git a/cpu/beta_cpu/tournament_pred.hh b/cpu/beta_cpu/tournament_pred.hh
new file mode 100644
index 000000000..bf87d753b
--- /dev/null
+++ b/cpu/beta_cpu/tournament_pred.hh
@@ -0,0 +1,160 @@
+#ifndef __TOURNAMENT_PRED_HH__
+#define __TOURNAMENT_PRED_HH__
+
+// For Addr type.
+#include "arch/alpha/isa_traits.hh"
+
+class TournamentBP
+{
+  public:
+    /**
+     * Default branch predictor constructor.
+     * NOTE(review): parameter order here must match the definition in
+     * tournament_pred.cc, which takes global_ctr_bits BEFORE
+     * global_history_bits; both are unsigned, so a mismatch compiles
+     * silently and swaps the arguments for every caller.
+     */
+    TournamentBP(unsigned local_predictor_size,
+                 unsigned local_ctr_bits,
+                 unsigned local_history_table_size,
+                 unsigned local_history_bits,
+                 unsigned global_predictor_size,
+                 unsigned global_ctr_bits,
+                 unsigned global_history_bits,
+                 unsigned choice_predictor_size,
+                 unsigned choice_ctr_bits,
+                 unsigned instShiftAmt);
+
+    /**
+     * Looks up the given address in the branch predictor and returns
+     * a true/false value as to whether it is taken.
+     * @param branch_addr The address of the branch to look up.
+     * @return Whether or not the branch is taken.
+     */
+    bool lookup(Addr &branch_addr);
+
+    /**
+     * Updates the branch predictor with the actual result of a branch.
+     * @param branch_addr The address of the branch to update.
+     * @param taken Whether or not the branch was taken.
+     */
+    void update(Addr &branch_addr, unsigned global_history, bool taken);
+
+    inline unsigned readGlobalHist() { return global_history; }
+
+  private:
+
+    inline bool getPrediction(uint8_t &count);
+
+    inline unsigned calcLocHistIdx(Addr &branch_addr);
+
+    inline void updateHistoriesTaken(unsigned local_history_idx);
+
+    inline void updateHistoriesNotTaken(unsigned local_history_idx);
+
+    /**
+     * Private counter class for the internal saturating counters.
+     * Implements an n bit saturating counter and provides methods to
+     * increment, decrement, and read it.
+     * @todo Consider making this something that more closely mimics a
+     * built in class so you can use ++ or --.
+     */
+    class SatCounter
+    {
+      public:
+        /**
+         * Constructor for the counter.
+         * @param bits How many bits the counter will have.
+         */
+        SatCounter(unsigned bits);
+
+        /**
+         * Constructor for the counter.
+         * @param bits How many bits the counter will have.
+         * @param initial_val Starting value for each counter.
+ */ + SatCounter(unsigned bits, unsigned initial_val); + + /** + * Increments the counter's current value. + */ + void increment(); + + /** + * Decrements the counter's current value. + */ + void decrement(); + + /** + * Read the counter's value. + */ + uint8_t read() + { + return counter; + } + + private: + uint8_t maxVal; + uint8_t counter; + }; + + /** Local counters. */ + SatCounter *local_ctrs; + + /** Size of the local predictor. */ + unsigned local_predictor_size; + + /** Number of bits of the local predictor's counters. */ + unsigned local_ctr_bits; + + /** Array of local history table entries. */ + unsigned *local_history_table; + + /** Size of the local history table. */ + unsigned local_history_table_size; + + /** Number of bits for each entry of the local history table. + * @todo Doesn't this come from the size of the local predictor? + */ + unsigned local_history_bits; + + /** Mask to get the proper local history. */ + unsigned localHistoryMask; + + + /** Array of counters that make up the global predictor. */ + SatCounter *global_ctrs; + + /** Size of the global predictor. */ + unsigned global_predictor_size; + + /** Number of bits of the global predictor's counters. */ + unsigned global_ctr_bits; + + /** Global history register. */ + unsigned global_history; + + /** Number of bits for the global history. */ + unsigned global_history_bits; + + /** Mask to get the proper global history. */ + unsigned globalHistoryMask; + + + /** Array of counters that make up the choice predictor. */ + SatCounter *choice_ctrs; + + /** Size of the choice predictor (identical to the global predictor). */ + unsigned choice_predictor_size; + + /** Number of bits of the choice predictor's counters. */ + unsigned choice_ctr_bits; + + /** Number of bits to shift the instruction over to get rid of the word + * offset. + */ + unsigned instShiftAmt; + + /** Threshold for the counter value; above the threshold is taken, + * equal to or below the threshold is not taken. 
+ */ + unsigned threshold; +}; + +#endif // __TOURNAMENT_PRED_HH__ |