-rw-r--r--  configs/example/memtest.py  2
-rw-r--r--  configs/splash2/run.py  2
-rw-r--r--  src/SConscript  10
-rw-r--r--  src/arch/alpha/faults.hh  6
-rw-r--r--  src/arch/alpha/pagetable.hh  3
-rw-r--r--  src/arch/sparc/faults.cc  50
-rw-r--r--  src/arch/sparc/intregfile.cc  4
-rw-r--r--  src/arch/sparc/isa/base.isa  10
-rw-r--r--  src/arch/sparc/isa/decoder.isa  70
-rw-r--r--  src/arch/sparc/isa/formats/priv.isa  140
-rw-r--r--  src/arch/sparc/isa/operands.isa  1
-rw-r--r--  src/arch/sparc/isa_traits.hh  5
-rw-r--r--  src/arch/sparc/tlb.hh  10
-rw-r--r--  src/arch/sparc/utility.hh  1
-rw-r--r--  src/base/loader/raw_object.cc  9
-rw-r--r--  src/base/remote_gdb.cc  2
-rw-r--r--  src/base/trace.hh  8
-rw-r--r--  src/cpu/base.hh  4
-rw-r--r--  src/cpu/base_dyn_inst.hh  42
-rw-r--r--  src/cpu/base_dyn_inst_impl.hh  1
-rw-r--r--  src/cpu/exetrace.cc  32
-rw-r--r--  src/cpu/m5legion_interface.h  28
-rw-r--r--  src/cpu/o3/alpha/cpu_builder.cc  12
-rw-r--r--  src/cpu/o3/alpha/cpu_impl.hh  2
-rw-r--r--  src/cpu/o3/fetch_impl.hh  16
-rw-r--r--  src/cpu/o3/lsq_unit.hh  9
-rw-r--r--  src/cpu/o3/lsq_unit_impl.hh  40
-rw-r--r--  src/cpu/ozone/cpu_builder.cc  10
-rw-r--r--  src/cpu/simple/atomic.cc  10
-rw-r--r--  src/cpu/simple/timing.cc  10
-rw-r--r--  src/dev/ethertap.cc  14
-rw-r--r--  src/dev/ide_atareg.h  14
-rw-r--r--  src/dev/pcidev.cc  54
-rw-r--r--  src/mem/cache/base_cache.cc  92
-rw-r--r--  src/mem/cache/base_cache.hh  139
-rw-r--r--  src/mem/cache/cache_impl.hh  58
-rw-r--r--  src/mem/cache/miss/mshr_queue.cc  5
-rw-r--r--  src/mem/packet.cc  23
-rw-r--r--  src/mem/packet.hh  14
-rw-r--r--  src/mem/physical.cc  15
-rw-r--r--  src/mem/tport.cc  14
-rw-r--r--  src/python/SConscript  6
-rw-r--r--  src/python/m5/SimObject.py  9
-rw-r--r--  src/python/m5/__init__.py  28
-rw-r--r--  src/python/m5/main.py  6
-rw-r--r--  src/python/m5/objects/BaseCPU.py  6
-rw-r--r--  src/python/m5/params.py  7
-rw-r--r--  src/sim/main.cc  8
-rw-r--r--  src/sim/param.hh  2
-rw-r--r--  src/sim/pseudo_inst.cc  48
-rw-r--r--  tests/long/00.gzip/test.py  5
-rw-r--r--  tests/long/10.mcf/test.py  5
-rw-r--r--  tests/long/20.parser/test.py  6
-rw-r--r--  tests/long/30.eon/test.py  8
-rw-r--r--  tests/long/40.perlbmk/test.py  5
-rw-r--r--  tests/long/50.vortex/test.py  5
-rw-r--r--  tests/long/60.bzip2/test.py  5
-rw-r--r--  tests/long/70.twolf/test.py  5
-rw-r--r--  tests/quick/00.hello/ref/alpha/linux/o3-timing/m5stats.txt  476
-rw-r--r--  tests/quick/00.hello/ref/alpha/tru64/o3-timing/m5stats.txt  376
-rw-r--r--  tests/run.py  7
61 files changed, 1291 insertions, 723 deletions
diff --git a/configs/example/memtest.py b/configs/example/memtest.py
index 141ecfd8e..e42a92ba1 100644
--- a/configs/example/memtest.py
+++ b/configs/example/memtest.py
@@ -133,6 +133,6 @@ m5.instantiate(root)
if options.maxtick:
exit_event = m5.simulate(options.maxtick)
else:
- exit_event = m5.simulate()
+ exit_event = m5.simulate(10000000000000)
print 'Exiting @ tick', m5.curTick(), 'because', exit_event.getCause()
diff --git a/configs/splash2/run.py b/configs/splash2/run.py
index 93b166d77..7d56cb830 100644
--- a/configs/splash2/run.py
+++ b/configs/splash2/run.py
@@ -262,7 +262,7 @@ m5.instantiate(root)
if options.maxtick:
exit_event = m5.simulate(options.maxtick)
else:
- exit_event = m5.simulate()
+ exit_event = m5.simulate(1000000000000)
print 'Exiting @ tick', m5.curTick(), 'because', exit_event.getCause()
diff --git a/src/SConscript b/src/SConscript
index 44bcb5320..385047f7f 100644
--- a/src/SConscript
+++ b/src/SConscript
@@ -129,12 +129,13 @@ base_sources = Split('''
mem/cache/cache_builder.cc
+ python/swig/main_wrap.cc
+
sim/builder.cc
sim/debug.cc
sim/eventq.cc
sim/faults.cc
sim/main.cc
- python/swig/cc_main_wrap.cc
sim/param.cc
sim/root.cc
sim/serialize.cc
@@ -316,16 +317,17 @@ else:
makeEnv('debug', '.do',
CCFLAGS = Split('%s -O0' % debug_flag),
- CPPDEFINES = 'DEBUG')
+ CPPDEFINES = ['DEBUG', 'TRACING_ON=1'])
# Optimized binary
makeEnv('opt', '.o',
- CCFLAGS = Split('-g -O3'))
+ CCFLAGS = Split('-g -O3'),
+ CPPDEFINES = ['TRACING_ON=1'])
# "Fast" binary
makeEnv('fast', '.fo', strip = True,
CCFLAGS = Split('-O3'),
- CPPDEFINES = 'NDEBUG')
+ CPPDEFINES = ['NDEBUG', 'TRACING_ON=0'])
# Profiled binary
makeEnv('prof', '.po',
diff --git a/src/arch/alpha/faults.hh b/src/arch/alpha/faults.hh
index 3ef4d5521..e2c3441e9 100644
--- a/src/arch/alpha/faults.hh
+++ b/src/arch/alpha/faults.hh
@@ -32,9 +32,13 @@
#ifndef __ALPHA_FAULTS_HH__
#define __ALPHA_FAULTS_HH__
-#include "arch/alpha/pagetable.hh"
+#include "config/full_system.hh"
#include "sim/faults.hh"
+#if FULL_SYSTEM
+#include "arch/alpha/pagetable.hh"
+#endif
+
// The design of the "name" and "vect" functions is in sim/faults.hh
namespace AlphaISA
diff --git a/src/arch/alpha/pagetable.hh b/src/arch/alpha/pagetable.hh
index 3108c0a3e..7ec4a6a75 100644
--- a/src/arch/alpha/pagetable.hh
+++ b/src/arch/alpha/pagetable.hh
@@ -38,7 +38,6 @@
namespace AlphaISA {
-#if FULL_SYSTEM
struct VAddr
{
static const int ImplBits = 43;
@@ -106,7 +105,7 @@ namespace AlphaISA {
void serialize(std::ostream &os);
void unserialize(Checkpoint *cp, const std::string &section);
};
-#endif
+
};
#endif // __ARCH_ALPHA_PAGETABLE_H__
diff --git a/src/arch/sparc/faults.cc b/src/arch/sparc/faults.cc
index 57ee040f1..4cf411d3b 100644
--- a/src/arch/sparc/faults.cc
+++ b/src/arch/sparc/faults.cc
@@ -283,7 +283,7 @@ void enterREDState(ThreadContext *tc)
HPSTATE |= (1 << 5);
//HPSTATE.hpriv = 1
HPSTATE |= (1 << 2);
- tc->setMiscReg(MISCREG_HPSTATE, HPSTATE);
+ tc->setMiscRegWithEffect(MISCREG_HPSTATE, HPSTATE);
}
/**
@@ -491,11 +491,11 @@ void doNormalFault(ThreadContext *tc, TrapType tt, bool gotoHpriv)
}
}
-void getREDVector(Addr & PC, Addr & NPC)
+void getREDVector(MiscReg TT, Addr & PC, Addr & NPC)
{
//XXX The following constant might belong in a header file.
- const Addr RSTVAddr = 0xFFFFFFFFF0000000ULL;
- PC = RSTVAddr | 0xA0;
+ const Addr RSTVAddr = 0xFFF0000000ULL;
+ PC = RSTVAddr | ((TT << 5) & 0xFF);
NPC = PC + sizeof(MachInst);
}
@@ -519,6 +519,7 @@ void getPrivVector(ThreadContext * tc, Addr & PC, Addr & NPC, MiscReg TT, MiscRe
void SparcFaultBase::invoke(ThreadContext * tc)
{
+ panic("Invoking a second fault!\n");
FaultBase::invoke(tc);
countStat()++;
@@ -543,7 +544,7 @@ void SparcFaultBase::invoke(ThreadContext * tc)
if(HPSTATE & (1 << 5) || TL == MaxTL - 1)
{
- getREDVector(PC, NPC);
+ getREDVector(5, PC, NPC);
enterREDState(tc);
doREDFault(tc, TT);
}
@@ -583,28 +584,41 @@ void PowerOnReset::invoke(ThreadContext * tc)
//For SPARC, when a system is first started, there is a power
//on reset Trap which sets the processor into the following state.
//Bits that aren't set aren't defined on startup.
- /*
- tl = MaxTL;
- gl = MaxGL;
- tickFields.counter = 0; //The TICK register is unreadable bya
- tickFields.npt = 1; //The TICK register is unreadable by by !priv
+ tc->setMiscReg(MISCREG_TL, MaxTL);
+ tc->setMiscReg(MISCREG_TT, trapType());
+ tc->setMiscRegWithEffect(MISCREG_GL, MaxGL);
+
+ //Turn on pef, set everything else to 0
+ tc->setMiscReg(MISCREG_PSTATE, 1 << 4);
- softint = 0; // Clear all the soft interrupt bits
- tick_cmprFields.int_dis = 1; // disable timer compare interrupts
+ //Turn on red and hpriv, set everything else to 0
+ tc->setMiscReg(MISCREG_HPSTATE, (1 << 5) | (1 << 2));
+
+ //The tick register is unreadable by nonprivileged software
+ tc->setMiscReg(MISCREG_TICK, 1ULL << 63);
+
+ Addr PC, NPC;
+ getREDVector(trapType(), PC, NPC);
+ tc->setPC(PC);
+ tc->setNextPC(NPC);
+ tc->setNextNPC(NPC + sizeof(MachInst));
+
+ //These registers are specified as "undefined" after a POR, and they
+ //should have reasonable values after the miscregfile is reset
+ /*
+ // Clear all the soft interrupt bits
+ softint = 0;
+ // disable timer compare interrupts, reset tick_cmpr
+ tc->setMiscReg(MISCREG_
+ tick_cmprFields.int_dis = 1;
tick_cmprFields.tick_cmpr = 0; // Reset to 0 for pretty printing
stickFields.npt = 1; //The TICK register is unreadable by by !priv
stick_cmprFields.int_dis = 1; // disable timer compare interrupts
stick_cmprFields.tick_cmpr = 0; // Reset to 0 for pretty printing
tt[tl] = _trapType;
- pstate = 0; // fields 0 but pef
- pstateFields.pef = 1;
- hpstate = 0;
- hpstateFields.red = 1;
- hpstateFields.hpriv = 1;
- hpstateFields.tlz = 0; // this is a guess
hintp = 0; // no interrupts pending
hstick_cmprFields.int_dis = 1; // disable timer compare interrupts
hstick_cmprFields.tick_cmpr = 0; // Reset to 0 for pretty printing
diff --git a/src/arch/sparc/intregfile.cc b/src/arch/sparc/intregfile.cc
index 0e313dc94..358368e5f 100644
--- a/src/arch/sparc/intregfile.cc
+++ b/src/arch/sparc/intregfile.cc
@@ -83,7 +83,7 @@ IntReg IntRegFile::readReg(int intReg)
else if((intReg -= NumRegularIntRegs) < NumMicroIntRegs)
val = microRegs[intReg];
else
- panic("Tried to read non-existant integer register\n");
+ panic("Tried to read non-existant integer register %d, %d\n", NumRegularIntRegs + NumMicroIntRegs + intReg, intReg);
DPRINTF(Sparc, "Read register %d = 0x%x\n", intReg, val);
return val;
@@ -123,7 +123,7 @@ void IntRegFile::setCWP(int cwp)
void IntRegFile::setGlobals(int gl)
{
- DPRINTF(Sparc, "Now using %d globals", gl);
+ DPRINTF(Sparc, "Now using %d globals\n", gl);
regView[Globals] = regGlobals[gl];
offset[Globals] = RegGlobalOffset + gl * RegsPerFrame;
diff --git a/src/arch/sparc/isa/base.isa b/src/arch/sparc/isa/base.isa
index a4c022411..aa24c75be 100644
--- a/src/arch/sparc/isa/base.isa
+++ b/src/arch/sparc/isa/base.isa
@@ -189,6 +189,7 @@ output decoder {{
const int MaxOutput = 16;
const int MaxLocal = 24;
const int MaxInput = 32;
+ const int MaxMicroReg = 33;
if (reg == FramePointerReg)
ccprintf(os, "%%fp");
else if (reg == StackPointerReg)
@@ -201,6 +202,8 @@ output decoder {{
ccprintf(os, "%%l%d", reg - MaxOutput);
else if(reg < MaxInput)
ccprintf(os, "%%i%d", reg - MaxLocal);
+ else if(reg < MaxMicroReg)
+ ccprintf(os, "%%u%d", reg - MaxInput);
else {
ccprintf(os, "%%f%d", reg - FP_Base_DepTag);
}
@@ -241,7 +244,12 @@ output decoder {{
bool passesCondition(uint32_t codes, uint32_t condition)
{
CondCodes condCodes;
- condCodes.bits = codes;
+ condCodes.bits = 0;
+ condCodes.c = codes & 0x1 ? 1 : 0;
+ condCodes.v = codes & 0x2 ? 1 : 0;
+ condCodes.z = codes & 0x4 ? 1 : 0;
+ condCodes.n = codes & 0x8 ? 1 : 0;
+
switch(condition)
{
case Always:
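The passesCondition() hunk above fills in the condition-code structure one flag at a time instead of assigning the packed bits straight into condCodes.bits, so the result no longer depends on how the compiler lays out the bitfield. A standalone C++ sketch of the same unpacking, using an illustrative struct that mirrors the bit positions used in the hunk:

    #include <cstdint>
    #include <cstdio>

    // Illustrative flag layout: bit 0 = carry, 1 = overflow, 2 = zero, 3 = negative.
    struct CondCodes {
        uint8_t c : 1;
        uint8_t v : 1;
        uint8_t z : 1;
        uint8_t n : 1;
    };

    CondCodes unpack(uint32_t codes)
    {
        CondCodes cc;
        // Extract each flag explicitly rather than relying on the in-memory
        // layout of the bitfield matching the packed encoding.
        cc.c = (codes >> 0) & 1;
        cc.v = (codes >> 1) & 1;
        cc.z = (codes >> 2) & 1;
        cc.n = (codes >> 3) & 1;
        return cc;
    }

    int main()
    {
        CondCodes cc = unpack(0xA); // n and v set
        std::printf("n=%d z=%d v=%d c=%d\n", cc.n, cc.z, cc.v, cc.c);
        return 0;
    }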
diff --git a/src/arch/sparc/isa/decoder.isa b/src/arch/sparc/isa/decoder.isa
index 4f3ea7810..2c8e59a1d 100644
--- a/src/arch/sparc/isa/decoder.isa
+++ b/src/arch/sparc/isa/decoder.isa
@@ -41,15 +41,16 @@ decode OP default Unknown::unknown()
0x0: Trap::illtrap({{fault = new IllegalInstruction;}});
format BranchN
{
+ //bpcc
0x1: decode COND2
{
//Branch Always
0x8: decode A
{
- 0x0: b(19, {{
+ 0x0: bpa(19, {{
NNPC = xc->readPC() + disp;
}});
- 0x1: b(19, {{
+ 0x1: bpa(19, {{
NPC = xc->readPC() + disp;
NNPC = NPC + 4;
}}, ',a');
@@ -57,10 +58,10 @@ decode OP default Unknown::unknown()
//Branch Never
0x0: decode A
{
- 0x0: bn(19, {{
+ 0x0: bpn(19, {{
NNPC = NNPC;//Don't do anything
}});
- 0x1: bn(19, {{
+ 0x1: bpn(19, {{
NPC = xc->readNextPC() + 4;
NNPC = NPC + 4;
}}, ',a');
@@ -81,12 +82,38 @@ decode OP default Unknown::unknown()
}});
}
}
- 0x2: bicc(22, {{
- if(passesCondition(Ccr<3:0>, COND2))
- NNPC = xc->readPC() + disp;
- else
- handle_annul
- }});
+ //bicc
+ 0x2: decode COND2
+ {
+ //Branch Always
+ 0x8: decode A
+ {
+ 0x0: ba(22, {{
+ NNPC = xc->readPC() + disp;
+ }});
+ 0x1: ba(22, {{
+ NPC = xc->readPC() + disp;
+ NNPC = NPC + 4;
+ }}, ',a');
+ }
+ //Branch Never
+ 0x0: decode A
+ {
+ 0x0: bn(22, {{
+ NNPC = NNPC;//Don't do anything
+ }});
+ 0x1: bn(22, {{
+ NPC = xc->readNextPC() + 4;
+ NNPC = NPC + 4;
+ }}, ',a');
+ }
+ default: bicc(22, {{
+ if(passesCondition(Ccr<3:0>, COND2))
+ NNPC = xc->readPC() + disp;
+ else
+ handle_annul
+ }});
+ }
}
0x3: decode RCOND2
{
@@ -380,7 +407,15 @@ decode OP default Unknown::unknown()
0x17: Priv::rdtick_cmpr({{Rd = TickCmpr;}});
0x18: PrivCheck::rdstick({{Rd = Stick}}, {{Stick<63:>}});
0x19: Priv::rdstick_cmpr({{Rd = StickCmpr;}});
- //0x1A-0x1F should cause an illegal instruction exception
+ 0x1A: Priv::rdstrand_sts_reg({{
+ if(Pstate<2:> && !Hpstate<2:>)
+ Rd = StrandStsReg<0:>;
+ else
+ Rd = StrandStsReg;
+ }});
+ //0x1A is supposed to be reserved, but it reads the strand
+ //status register.
+ //0x1B-0x1F should cause an illegal instruction exception
}
0x29: decode RS1 {
0x00: HPriv::rdhprhpstate({{Rd = Hpstate;}});
@@ -515,7 +550,16 @@ decode OP default Unknown::unknown()
Stick = Rs1 ^ Rs2_or_imm13;
}});
0x19: Priv::wrstick_cmpr({{StickCmpr = Rs1 ^ Rs2_or_imm13;}});
- //0x1A-0x1F should cause an illegal instruction exception
+ 0x1A: Priv::wrstrand_sts_reg({{
+ if(Pstate<2:> && !Hpstate<2:>)
+ StrandStsReg = StrandStsReg<63:1> |
+ (Rs1 ^ Rs2_or_imm13)<0:>;
+ else
+ StrandStsReg = Rs1 ^ Rs2_or_imm13;
+ }});
+ //0x1A is supposed to be reserved, but it writes the strand
+ //status register.
+ //0x1B-0x1F should cause an illegal instruction exception
}
0x31: decode FCN {
0x0: Priv::saved({{
@@ -527,7 +571,7 @@ decode OP default Unknown::unknown()
else
Otherwin = Otherwin - 1;
}});
- 0x1: BasicOperate::restored({{
+ 0x1: Priv::restored({{
assert(Cansave || Otherwin);
assert(Canrestore < NWindows - 2);
Canrestore = Canrestore + 1;
diff --git a/src/arch/sparc/isa/formats/priv.isa b/src/arch/sparc/isa/formats/priv.isa
index 55bf968f4..94a68aebe 100644
--- a/src/arch/sparc/isa/formats/priv.isa
+++ b/src/arch/sparc/isa/formats/priv.isa
@@ -50,6 +50,42 @@ output header {{
const SymbolTable *symtab) const;
};
+ //This class is for instructions that explicitly read control
+ //registers. It provides a special generateDisassembly function.
+ class RdPriv : public Priv
+ {
+ protected:
+ //Constructor
+ RdPriv(const char *mnem, ExtMachInst _machInst,
+ OpClass __opClass, char const * _regName) :
+ Priv(mnem, _machInst, __opClass), regName(_regName)
+ {
+ }
+
+ std::string generateDisassembly(Addr pc,
+ const SymbolTable *symtab) const;
+
+ char const * regName;
+ };
+
+ //This class is for instructions that explicitly write control
+ //registers. It provides a special generateDisassembly function.
+ class WrPriv : public Priv
+ {
+ protected:
+ //Constructor
+ WrPriv(const char *mnem, ExtMachInst _machInst,
+ OpClass __opClass, char const * _regName) :
+ Priv(mnem, _machInst, __opClass), regName(_regName)
+ {
+ }
+
+ std::string generateDisassembly(Addr pc,
+ const SymbolTable *symtab) const;
+
+ char const * regName;
+ };
+
/**
* Base class for privelege mode operations with immediates.
*/
@@ -66,6 +102,23 @@ output header {{
int32_t imm;
};
+ //This class is for instructions that explicitly write control
+ //registers. It provides a special generateDisassembly function.
+ class WrPrivImm : public PrivImm
+ {
+ protected:
+ //Constructor
+ WrPrivImm(const char *mnem, ExtMachInst _machInst,
+ OpClass __opClass, char const * _regName) :
+ PrivImm(mnem, _machInst, __opClass), regName(_regName)
+ {
+ }
+
+ std::string generateDisassembly(Addr pc,
+ const SymbolTable *symtab) const;
+
+ char const * regName;
+ };
}};
output decoder {{
@@ -78,6 +131,58 @@ output decoder {{
return response.str();
}
+
+ std::string RdPriv::generateDisassembly(Addr pc,
+ const SymbolTable *symtab) const
+ {
+ std::stringstream response;
+
+ printMnemonic(response, mnemonic);
+
+ ccprintf(response, " %%%s, ", regName);
+ printDestReg(response, 0);
+
+ return response.str();
+ }
+
+ std::string WrPriv::generateDisassembly(Addr pc,
+ const SymbolTable *symtab) const
+ {
+ std::stringstream response;
+
+ printMnemonic(response, mnemonic);
+
+ ccprintf(response, " ");
+ printSrcReg(response, 0);
+ ccprintf(response, ", ");
+ printSrcReg(response, 1);
+ ccprintf(response, ", %%%s", regName);
+
+ return response.str();
+ }
+
+ std::string WrPrivImm::generateDisassembly(Addr pc,
+ const SymbolTable *symtab) const
+ {
+ std::stringstream response;
+
+ printMnemonic(response, mnemonic);
+
+ ccprintf(response, " ");
+ printSrcReg(response, 0);
+ ccprintf(response, ", 0x%x, %%%s", imm, regName);
+
+ return response.str();
+ }
+}};
+
+def template ControlRegConstructor {{
+ inline %(class_name)s::%(class_name)s(ExtMachInst machInst)
+ : %(base_class)s("%(mnemonic)s", machInst,
+ %(op_class)s, "%(reg_name)s")
+ {
+ %(constructor)s;
+ }
}};
def template PrivExecute {{
@@ -102,16 +207,39 @@ let {{
def doPrivFormat(code, checkCode, name, Name, opt_flags):
(usesImm, code, immCode,
rString, iString) = splitOutImm(code)
- iop = InstObjParams(name, Name, 'Priv', code,
- opt_flags, {"check": checkCode})
+ #If these are rd, rdpr, rdhpr, wr, wrpr, or wrhpr instructions,
+ #cut any other info out of the mnemonic. Also pick a different
+ #base class.
+ regBase = 'Priv'
+ regName = ''
+ for mnem in ["rdhpr", "rdpr", "rd"]:
+ if name.startswith(mnem):
+ regName = name[len(mnem):]
+ name = mnem
+ regBase = 'RdPriv'
+ break
+ for mnem in ["wrhpr", "wrpr", "wr"]:
+ if name.startswith(mnem):
+ regName = name[len(mnem):]
+ name = mnem
+ regBase = 'WrPriv'
+ break
+ iop = InstObjParams(name, Name, regBase, code,
+ opt_flags, {"check": checkCode, "reg_name": regName})
header_output = BasicDeclare.subst(iop)
- decoder_output = BasicConstructor.subst(iop)
+ if regName == '':
+ decoder_output = BasicConstructor.subst(iop)
+ else:
+ decoder_output = ControlRegConstructor.subst(iop)
exec_output = PrivExecute.subst(iop)
if usesImm:
- imm_iop = InstObjParams(name, Name + 'Imm', 'PrivImm',
- immCode, opt_flags, {"check": checkCode})
+ imm_iop = InstObjParams(name, Name + 'Imm', regBase + 'Imm',
+ immCode, opt_flags, {"check": checkCode, "reg_name": regName})
header_output += BasicDeclare.subst(imm_iop)
- decoder_output += BasicConstructor.subst(imm_iop)
+ if regName == '':
+ decoder_output += BasicConstructor.subst(imm_iop)
+ else:
+ decoder_output += ControlRegConstructor.subst(imm_iop)
exec_output += PrivExecute.subst(imm_iop)
decode_block = ROrImmDecode.subst(iop)
else:
diff --git a/src/arch/sparc/isa/operands.isa b/src/arch/sparc/isa/operands.isa
index caee20b0c..2d200f568 100644
--- a/src/arch/sparc/isa/operands.isa
+++ b/src/arch/sparc/isa/operands.isa
@@ -123,6 +123,7 @@ def operands {{
'Htba': ('ControlReg', 'udw', 'MISCREG_HTBA', None, 72),
'HstickCmpr': ('ControlReg', 'udw', 'MISCREG_HSTICK_CMPR', None, 73),
'Hver': ('ControlReg', 'udw', 'MISCREG_HVER', None, 74),
+ 'StrandStsReg': ('ControlReg', 'udw', 'MISCREG_STRAND_STS_REG', None, 75),
'Fsr': ('ControlReg', 'udw', 'MISCREG_FSR', None, 80),
# Mem gets a large number so it's always last
diff --git a/src/arch/sparc/isa_traits.hh b/src/arch/sparc/isa_traits.hh
index 46a0ebbfb..1433ba3f8 100644
--- a/src/arch/sparc/isa_traits.hh
+++ b/src/arch/sparc/isa_traits.hh
@@ -76,10 +76,7 @@ namespace SparcISA
// 0..31 are the integer regs 0..31
// 32..95 are the FP regs 0..31, i.e. use (reg + FP_Base_DepTag)
FP_Base_DepTag = NumIntRegs,
- Ctrl_Base_DepTag = NumIntRegs + NumFloatRegs,
- //XXX These are here solely to get compilation and won't work
- Fpcr_DepTag = 0,
- Uniq_DepTag = 0
+ Ctrl_Base_DepTag = NumIntRegs + NumMicroIntRegs + NumFloatRegs,
};
diff --git a/src/arch/sparc/tlb.hh b/src/arch/sparc/tlb.hh
index 0fdba6baf..136103f44 100644
--- a/src/arch/sparc/tlb.hh
+++ b/src/arch/sparc/tlb.hh
@@ -31,6 +31,7 @@
#ifndef __ARCH_SPARC_TLB_HH__
#define __ARCH_SPARC_TLB_HH__
+#include "base/misc.hh"
#include "mem/request.hh"
#include "sim/faults.hh"
#include "sim/sim_object.hh"
@@ -39,6 +40,9 @@ class ThreadContext;
namespace SparcISA
{
+ const int PAddrImplBits = 40;
+ const Addr PAddrImplMask = (ULL(1) << PAddrImplBits) - 1;
+
class TLB : public SimObject
{
public:
@@ -56,6 +60,9 @@ namespace SparcISA
Fault translate(RequestPtr &req, ThreadContext *tc) const
{
+ //For now, always assume the address is already physical.
+ //Also assume that there are 40 bits of physical address space.
+ req->setPaddr(req->getVaddr() & PAddrImplMask);
return NoFault;
}
};
@@ -69,6 +76,9 @@ namespace SparcISA
Fault translate(RequestPtr &req, ThreadContext *tc, bool write) const
{
+ //For now, always assume the address is already physical.
+ //Also assume that there are 40 bits of physical address space.
+ req->setPaddr(req->getVaddr() & ((1ULL << 40) - 1));
return NoFault;
}
};
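Both translate() stubs above treat the virtual address as already physical and clamp it to a 40-bit physical address space. A minimal sketch of that masking, assuming only the constants shown in the hunk:

    #include <cstdint>
    #include <cstdio>

    typedef uint64_t Addr;

    const int PAddrImplBits = 40;
    const Addr PAddrImplMask = (1ULL << PAddrImplBits) - 1;

    // Pass-through "translation": keep the low 40 bits, drop everything above.
    Addr translateIdentity(Addr vaddr)
    {
        return vaddr & PAddrImplMask;
    }

    int main()
    {
        Addr va = 0xFFFFF123456789ABULL;
        std::printf("vaddr %#llx -> paddr %#llx\n",
                    (unsigned long long)va,
                    (unsigned long long)translateIdentity(va));
        return 0;
    }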
diff --git a/src/arch/sparc/utility.hh b/src/arch/sparc/utility.hh
index e51677cdf..5c7fe343d 100644
--- a/src/arch/sparc/utility.hh
+++ b/src/arch/sparc/utility.hh
@@ -33,6 +33,7 @@
#include "arch/sparc/faults.hh"
#include "arch/sparc/isa_traits.hh"
+#include "arch/sparc/tlb.hh"
#include "base/misc.hh"
#include "base/bitfield.hh"
#include "cpu/thread_context.hh"
diff --git a/src/base/loader/raw_object.cc b/src/base/loader/raw_object.cc
index 79ddb81fe..1faf33426 100644
--- a/src/base/loader/raw_object.cc
+++ b/src/base/loader/raw_object.cc
@@ -29,6 +29,7 @@
*/
#include "base/loader/raw_object.hh"
+#include "base/loader/symtab.hh"
#include "base/trace.hh"
ObjectFile *
@@ -62,11 +63,19 @@ RawObject::RawObject(const std::string &_filename, int _fd, size_t _len,
bool
RawObject::loadGlobalSymbols(SymbolTable *symtab)
{
+ int fnameStart = filename.rfind('/',filename.size()) + 1;
+ int extStart = filename.rfind('.',filename.size());
+ symtab->insert(text.baseAddr, filename.substr(fnameStart,
+ extStart-fnameStart) + "_start");
return true;
}
bool
RawObject::loadLocalSymbols(SymbolTable *symtab)
{
+ int fnameStart = filename.rfind('/',filename.size()) + 1;
+ int extStart = filename.rfind('.',filename.size());
+ symtab->insert(text.baseAddr, filename.substr(fnameStart,
+ extStart-fnameStart) + "_start");
return true;
}
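loadGlobalSymbols() and loadLocalSymbols() above derive a symbol name for the load address by stripping the directory prefix and extension from the image filename and appending "_start". A standalone sketch of that string handling (the function name here is hypothetical):

    #include <cstdio>
    #include <string>

    // Turn "/path/to/openboot.bin" into "openboot_start".
    std::string startSymbolFor(const std::string &filename)
    {
        // rfind('/') returns npos when there is no slash; npos + 1 wraps to 0,
        // so the whole string is used in that case, just as in the hunk above.
        std::string::size_type fnameStart = filename.rfind('/') + 1;
        std::string::size_type extStart = filename.rfind('.');
        return filename.substr(fnameStart, extStart - fnameStart) + "_start";
    }

    int main()
    {
        std::printf("%s\n", startSymbolFor("/path/to/openboot.bin").c_str());
        return 0;
    }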
diff --git a/src/base/remote_gdb.cc b/src/base/remote_gdb.cc
index 55fb97ce9..59a9b87d5 100644
--- a/src/base/remote_gdb.cc
+++ b/src/base/remote_gdb.cc
@@ -421,7 +421,7 @@ BaseRemoteGDB::recv(char *bp, int maxlen)
putbyte(bp[0]);
putbyte(bp[1]);
len -= 3;
- bcopy(bp + 3, bp, len);
+ memcpy(bp, bp+3, len);
}
break;
}
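The recv() hunk above drops the three escape bytes and shifts the remaining bytes down in place. A minimal sketch of that buffer compaction with a hypothetical buffer; note that memmove, unlike memcpy, is the variant specified for overlapping source and destination ranges:

    #include <cstdio>
    #include <cstring>

    int main()
    {
        char buf[] = "XYZhello";           // three escape bytes, then payload
        std::size_t len = std::strlen(buf) - 3;

        // Source (buf + 3) and destination (buf) overlap, so use memmove.
        std::memmove(buf, buf + 3, len);
        buf[len] = '\0';

        std::printf("%s\n", buf);          // prints "hello"
        return 0;
    }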
diff --git a/src/base/trace.hh b/src/base/trace.hh
index 8df5dd893..9b053990c 100644
--- a/src/base/trace.hh
+++ b/src/base/trace.hh
@@ -39,14 +39,6 @@
#include "sim/host.hh"
#include "sim/root.hh"
-#ifndef TRACING_ON
-#ifndef NDEBUG
-#define TRACING_ON 1
-#else
-#define TRACING_ON 0
-#endif
-#endif
-
#include "base/traceflags.hh"
namespace Trace {
diff --git a/src/cpu/base.hh b/src/cpu/base.hh
index 9257778ef..788f77e3a 100644
--- a/src/cpu/base.hh
+++ b/src/cpu/base.hh
@@ -155,6 +155,10 @@ class BaseCPU : public MemObject
int cpu_id;
#if FULL_SYSTEM
Tick profile;
+
+ bool do_statistics_insts;
+ bool do_checkpoint_insts;
+ bool do_quiesce;
#endif
Tick progress_interval;
BaseCPU *checker;
diff --git a/src/cpu/base_dyn_inst.hh b/src/cpu/base_dyn_inst.hh
index c68810954..4a4555566 100644
--- a/src/cpu/base_dyn_inst.hh
+++ b/src/cpu/base_dyn_inst.hh
@@ -206,6 +206,9 @@ class BaseDynInst : public FastAlloc, public RefCounted
*/
Result instResult;
+ /** Records changes to result? */
+ bool recordResult;
+
/** PC of this instruction. */
Addr PC;
@@ -263,6 +266,9 @@ class BaseDynInst : public FastAlloc, public RefCounted
/** Dumps out contents of this BaseDynInst into given string. */
void dump(std::string &outstring);
+ /** Read this CPU's ID. */
+ int readCpuId() { return cpu->readCpuId(); }
+
/** Returns the fault type. */
Fault getFault() { return fault; }
@@ -402,37 +408,42 @@ class BaseDynInst : public FastAlloc, public RefCounted
/** Records an integer register being set to a value. */
void setIntReg(const StaticInst *si, int idx, uint64_t val)
{
- instResult.integer = val;
+ if (recordResult)
+ instResult.integer = val;
}
/** Records an fp register being set to a value. */
void setFloatReg(const StaticInst *si, int idx, FloatReg val, int width)
{
- if (width == 32)
- instResult.dbl = (double)val;
- else if (width == 64)
- instResult.dbl = val;
- else
- panic("Unsupported width!");
+ if (recordResult) {
+ if (width == 32)
+ instResult.dbl = (double)val;
+ else if (width == 64)
+ instResult.dbl = val;
+ else
+ panic("Unsupported width!");
+ }
}
/** Records an fp register being set to a value. */
void setFloatReg(const StaticInst *si, int idx, FloatReg val)
{
-// instResult.fp = val;
- instResult.dbl = (double)val;
+ if (recordResult)
+ instResult.dbl = (double)val;
}
/** Records an fp register being set to an integer value. */
void setFloatRegBits(const StaticInst *si, int idx, uint64_t val, int width)
{
- instResult.integer = val;
+ if (recordResult)
+ instResult.integer = val;
}
/** Records an fp register being set to an integer value. */
void setFloatRegBits(const StaticInst *si, int idx, uint64_t val)
{
- instResult.integer = val;
+ if (recordResult)
+ instResult.integer = val;
}
/** Records that one of the source registers is ready. */
@@ -624,6 +635,15 @@ class BaseDynInst : public FastAlloc, public RefCounted
/** Sets iterator for this instruction in the list of all insts. */
void setInstListIt(ListIt _instListIt) { instListIt = _instListIt; }
+
+ public:
+ /** Returns the number of consecutive store conditional failures. */
+ unsigned readStCondFailures()
+ { return thread->storeCondFailures; }
+
+ /** Sets the number of consecutive store conditional failures. */
+ void setStCondFailures(unsigned sc_failures)
+ { thread->storeCondFailures = sc_failures; }
};
template<class Impl>
diff --git a/src/cpu/base_dyn_inst_impl.hh b/src/cpu/base_dyn_inst_impl.hh
index d6cdff5c5..2f6859de2 100644
--- a/src/cpu/base_dyn_inst_impl.hh
+++ b/src/cpu/base_dyn_inst_impl.hh
@@ -97,6 +97,7 @@ BaseDynInst<Impl>::initVars()
readyRegs = 0;
instResult.integer = 0;
+ recordResult = true;
status.reset();
diff --git a/src/cpu/exetrace.cc b/src/cpu/exetrace.cc
index ef06e0699..113f0fe74 100644
--- a/src/cpu/exetrace.cc
+++ b/src/cpu/exetrace.cc
@@ -39,12 +39,17 @@
#include "arch/regfile.hh"
#include "arch/utility.hh"
#include "base/loader/symtab.hh"
+#include "config/full_system.hh"
#include "cpu/base.hh"
#include "cpu/exetrace.hh"
#include "cpu/static_inst.hh"
#include "sim/param.hh"
#include "sim/system.hh"
+#if FULL_SYSTEM
+#include "arch/tlb.hh"
+#endif
+
//XXX This is temporary
#include "arch/isa_specific.hh"
#include "cpu/m5legion_interface.h"
@@ -232,17 +237,22 @@ Trace::InstRecord::dump(ostream &outs)
bool diffPC = false;
bool diffInst = false;
bool diffRegs = false;
+ Addr m5Pc, lgnPc;
+
if(!staticInst->isMicroOp() || staticInst->isLastMicroOp()) {
while (!compared) {
+ m5Pc = PC & TheISA::PAddrImplMask;
+ lgnPc = shared_data->pc & TheISA::PAddrImplMask;
if (shared_data->flags == OWN_M5) {
- if (shared_data->pc != PC)
+ if (lgnPc != m5Pc)
diffPC = true;
if (shared_data->instruction != staticInst->machInst)
diffInst = true;
- for (int i = 0; i < TheISA::NumIntRegs; i++) {
- if (thread->readIntReg(i) != shared_data->intregs[i])
+ for (int i = 0; i < TheISA::NumRegularIntRegs; i++) {
+ if (thread->readIntReg(i) != shared_data->intregs[i]) {
diffRegs = true;
+ }
}
if (diffPC || diffInst || diffRegs ) {
@@ -253,19 +263,19 @@ Trace::InstRecord::dump(ostream &outs)
outs << " [Instruction]";
if (diffRegs)
outs << " [IntRegs]";
- outs << endl << endl;;
+ outs << endl << endl;
- outs << setfill(' ') << setw(15)
+ outs << right << setfill(' ') << setw(15)
<< "M5 PC: " << "0x"<< setw(16) << setfill('0')
- << hex << PC << endl;
+ << hex << m5Pc << endl;
outs << setfill(' ') << setw(15)
<< "Legion PC: " << "0x"<< setw(16) << setfill('0') << hex
- << shared_data->pc << endl << endl;
+ << lgnPc << endl << endl;
outs << setfill(' ') << setw(15)
<< "M5 Inst: " << "0x"<< setw(8)
<< setfill('0') << hex << staticInst->machInst
- << staticInst->disassemble(PC, debugSymbolTable)
+ << staticInst->disassemble(m5Pc, debugSymbolTable)
<< endl;
StaticInstPtr legionInst = StaticInst::decode(makeExtMI(shared_data->instruction, thread));
@@ -273,7 +283,7 @@ Trace::InstRecord::dump(ostream &outs)
<< " Legion Inst: "
<< "0x" << setw(8) << setfill('0') << hex
<< shared_data->instruction
- << legionInst->disassemble(shared_data->pc, debugSymbolTable)
+ << legionInst->disassemble(lgnPc, debugSymbolTable)
<< endl;
outs << endl;
@@ -386,7 +396,7 @@ Trace::InstRecord::setParams()
// If were going to be in lockstep with Legion
// Setup shared memory, and get otherwise ready
if (flags[LEGION_LOCKSTEP]) {
- int shmfd = shmget(getuid(), sizeof(SharedData), 0777);
+ int shmfd = shmget('M' << 24 | getuid(), sizeof(SharedData), 0777);
if (shmfd < 0)
fatal("Couldn't get shared memory fd. Is Legion running?");
@@ -401,6 +411,8 @@ Trace::InstRecord::setParams()
fatal("Shared Data is wrong version! M5: %d Legion: %d", VERSION,
shared_data->version);
+ // step legion forward one cycle so we can get register values
+ shared_data->flags = OWN_LEGION;
}
}
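The setParams() hunk keys the shared-memory segment with ('M' << 24 | getuid()) so the simulator and Legion can compute the same key independently. A small sketch of attaching to an existing System V segment with that style of key (the struct is a placeholder, not the real SharedData layout):

    #include <sys/ipc.h>
    #include <sys/shm.h>
    #include <unistd.h>
    #include <cstdio>
    #include <cstdlib>

    struct SharedData;   // placeholder; the real layout is defined elsewhere

    int main()
    {
        // Derive a key both processes can compute independently.
        key_t key = 'M' << 24 | getuid();

        // Size 0 and no IPC_CREAT: attach to a segment that already exists.
        int shmid = shmget(key, 0, 0777);
        if (shmid < 0) {
            std::perror("shmget");
            return EXIT_FAILURE;
        }

        void *mem = shmat(shmid, NULL, 0);
        if (mem == (void *)-1) {
            std::perror("shmat");
            return EXIT_FAILURE;
        }

        SharedData *shared = static_cast<SharedData *>(mem);
        (void)shared;   // the real interface reads/writes fields here
        return EXIT_SUCCESS;
    }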
diff --git a/src/cpu/m5legion_interface.h b/src/cpu/m5legion_interface.h
index 9338d9ca0..373fbeb11 100644
--- a/src/cpu/m5legion_interface.h
+++ b/src/cpu/m5legion_interface.h
@@ -30,7 +30,7 @@
#include <unistd.h>
-#define VERSION 0xA1000002
+#define VERSION 0xA1000005
#define OWN_M5 0x000000AA
#define OWN_LEGION 0x00000055
@@ -41,9 +41,35 @@ typedef struct {
uint32_t version;
uint64_t pc;
+ uint64_t new_pc;
uint32_t instruction;
+ uint32_t new_instruction;
uint64_t intregs[32];
+ uint64_t tpc[8];
+ uint64_t tnpc[8];
+ uint64_t tstate[8];
+ uint16_t tt[8];
+ uint64_t tba;
+
+ uint64_t hpstate;
+ uint64_t htstate[8];
+ uint64_t htba;
+ uint16_t pstate;
+
+ uint64_t y;
+ uint8_t ccr;
+ uint8_t tl;
+ uint8_t gl;
+ uint8_t asi;
+ uint8_t pil;
+
+ uint8_t cwp;
+ uint8_t cansave;
+ uint8_t canrestore;
+ uint8_t otherwin;
+ uint8_t cleanwin;
+
} SharedData;
/** !!! ^^^ Increment VERSION on change ^^^ !!! **/
diff --git a/src/cpu/o3/alpha/cpu_builder.cc b/src/cpu/o3/alpha/cpu_builder.cc
index be8ad8de6..09ccc7f65 100644
--- a/src/cpu/o3/alpha/cpu_builder.cc
+++ b/src/cpu/o3/alpha/cpu_builder.cc
@@ -57,6 +57,10 @@ Param<int> cpu_id;
SimObjectParam<AlphaISA::ITB *> itb;
SimObjectParam<AlphaISA::DTB *> dtb;
Param<Tick> profile;
+
+Param<bool> do_quiesce;
+Param<bool> do_checkpoint_insts;
+Param<bool> do_statistics_insts;
#else
SimObjectVectorParam<Process *> workload;
#endif // FULL_SYSTEM
@@ -163,6 +167,10 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivO3CPU)
INIT_PARAM(itb, "Instruction translation buffer"),
INIT_PARAM(dtb, "Data translation buffer"),
INIT_PARAM(profile, ""),
+
+ INIT_PARAM(do_quiesce, ""),
+ INIT_PARAM(do_checkpoint_insts, ""),
+ INIT_PARAM(do_statistics_insts, ""),
#else
INIT_PARAM(workload, "Processes to run"),
#endif // FULL_SYSTEM
@@ -306,6 +314,10 @@ CREATE_SIM_OBJECT(DerivO3CPU)
params->itb = itb;
params->dtb = dtb;
params->profile = profile;
+
+ params->do_quiesce = do_quiesce;
+ params->do_checkpoint_insts = do_checkpoint_insts;
+ params->do_statistics_insts = do_statistics_insts;
#else
params->workload = workload;
#endif // FULL_SYSTEM
diff --git a/src/cpu/o3/alpha/cpu_impl.hh b/src/cpu/o3/alpha/cpu_impl.hh
index f5c394826..b2ef78360 100644
--- a/src/cpu/o3/alpha/cpu_impl.hh
+++ b/src/cpu/o3/alpha/cpu_impl.hh
@@ -231,7 +231,7 @@ Fault
AlphaO3CPU<Impl>::hwrei(unsigned tid)
{
// Need to clear the lock flag upon returning from an interrupt.
- this->lockFlag = false;
+ this->setMiscReg(AlphaISA::MISCREG_LOCKFLAG, false, tid);
this->thread[tid]->kernelStats->hwrei();
diff --git a/src/cpu/o3/fetch_impl.hh b/src/cpu/o3/fetch_impl.hh
index b1fae8cf0..a5478d4f8 100644
--- a/src/cpu/o3/fetch_impl.hh
+++ b/src/cpu/o3/fetch_impl.hh
@@ -62,7 +62,8 @@ template<class Impl>
void
DefaultFetch<Impl>::IcachePort::recvFunctional(PacketPtr pkt)
{
- warn("Default fetch doesn't update it's state from a functional call.");
+ DPRINTF(Fetch, "DefaultFetch doesn't update its state from a "
+ "functional call.");
}
template<class Impl>
@@ -79,6 +80,7 @@ template<class Impl>
bool
DefaultFetch<Impl>::IcachePort::recvTiming(PacketPtr pkt)
{
+ DPRINTF(Fetch, "Received timing\n");
if (pkt->isResponse()) {
fetch->processCacheCompletion(pkt);
}
@@ -1171,8 +1173,8 @@ DefaultFetch<Impl>::fetch(bool &status_change)
fetch_PC = next_PC;
if (instruction->isQuiesce()) {
-// warn("%lli: Quiesce instruction encountered, halting fetch!",
-// curTick);
+ DPRINTF(Fetch, "Quiesce instruction encountered, halting fetch!",
+ curTick);
fetchStatus[tid] = QuiescePending;
++numInst;
status_change = true;
@@ -1286,11 +1288,13 @@ DefaultFetch<Impl>::fetch(bool &status_change)
fetchStatus[tid] = TrapPending;
status_change = true;
-
-// warn("%lli fault (%d) detected @ PC %08p", curTick, fault, PC[tid]);
#else // !FULL_SYSTEM
- warn("cycle %lli: fault (%s) detected @ PC %08p", curTick, fault->name(), PC[tid]);
+ fetchStatus[tid] = TrapPending;
+ status_change = true;
+
#endif // FULL_SYSTEM
+ DPRINTF(Fetch, "[tid:%i]: fault (%s) detected @ PC %08p",
+ tid, fault->name(), PC[tid]);
}
}
diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh
index 1b207fdbc..a2e11173e 100644
--- a/src/cpu/o3/lsq_unit.hh
+++ b/src/cpu/o3/lsq_unit.hh
@@ -37,6 +37,7 @@
#include <queue>
#include "arch/faults.hh"
+#include "arch/locked_mem.hh"
#include "config/full_system.hh"
#include "base/hashmap.hh"
#include "cpu/inst_seq.hh"
@@ -510,8 +511,12 @@ LSQUnit<Impl>::read(Request *req, T &data, int load_idx)
#if FULL_SYSTEM
if (req->isLocked()) {
- cpu->lockAddr = req->getPaddr();
- cpu->lockFlag = true;
+ // Disable recording the result temporarily. Writing to misc
+ // regs normally updates the result, but this is not the
+ // desired behavior when handling store conditionals.
+ load_inst->recordResult = false;
+ TheISA::handleLockedRead(load_inst.get(), req);
+ load_inst->recordResult = true;
}
#endif
diff --git a/src/cpu/o3/lsq_unit_impl.hh b/src/cpu/o3/lsq_unit_impl.hh
index 9a0e48819..4facea9f9 100644
--- a/src/cpu/o3/lsq_unit_impl.hh
+++ b/src/cpu/o3/lsq_unit_impl.hh
@@ -29,6 +29,7 @@
* Korey Sewell
*/
+#include "arch/locked_mem.hh"
#include "config/use_checker.hh"
#include "cpu/o3/lsq.hh"
@@ -615,27 +616,24 @@ LSQUnit<Impl>::writebackStores()
// @todo: Remove this SC hack once the memory system handles it.
if (req->isLocked()) {
- if (req->isUncacheable()) {
- req->setScResult(2);
- } else {
- if (cpu->lockFlag) {
- req->setScResult(1);
- DPRINTF(LSQUnit, "Store conditional [sn:%lli] succeeded.",
- inst->seqNum);
- } else {
- req->setScResult(0);
- // Hack: Instantly complete this store.
-// completeDataAccess(data_pkt);
- DPRINTF(LSQUnit, "Store conditional [sn:%lli] failed. "
- "Instantly completing it.\n",
- inst->seqNum);
- WritebackEvent *wb = new WritebackEvent(inst, data_pkt, this);
- wb->schedule(curTick + 1);
- delete state;
- completeStore(storeWBIdx);
- incrStIdx(storeWBIdx);
- continue;
- }
+ // Disable recording the result temporarily. Writing to
+ // misc regs normally updates the result, but this is not
+ // the desired behavior when handling store conditionals.
+ inst->recordResult = false;
+ bool success = TheISA::handleLockedWrite(inst.get(), req);
+ inst->recordResult = true;
+
+ if (!success) {
+ // Instantly complete this store.
+ DPRINTF(LSQUnit, "Store conditional [sn:%lli] failed. "
+ "Instantly completing it.\n",
+ inst->seqNum);
+ WritebackEvent *wb = new WritebackEvent(inst, data_pkt, this);
+ wb->schedule(curTick + 1);
+ delete state;
+ completeStore(storeWBIdx);
+ incrStIdx(storeWBIdx);
+ continue;
}
} else {
// Non-store conditionals do not need a writeback.
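The LSQ changes above route load-locked/store-conditional handling through TheISA::handleLockedRead and handleLockedWrite instead of touching the CPU's lockAddr/lockFlag members directly, with recordResult toggled off so the lock-state misc-reg writes do not clobber the recorded instruction result. A generic sketch of the bookkeeping such helpers typically perform; the type and member names here are illustrative, not the m5 interface:

    #include <cassert>
    #include <cstdint>

    typedef uint64_t Addr;

    // Hypothetical per-thread context carrying the locked-access state.
    struct LockState {
        Addr lockAddr;
        bool lockFlag;
    };

    // Load-locked: remember the address and arm the reservation flag.
    void handleLockedRead(LockState &xc, Addr paddr)
    {
        xc.lockAddr = paddr;
        xc.lockFlag = true;
    }

    // Store-conditional: succeed only if the reservation is still armed for
    // this address; an intervening store to the line would have cleared it.
    bool handleLockedWrite(LockState &xc, Addr paddr)
    {
        bool success = xc.lockFlag && xc.lockAddr == paddr;
        xc.lockFlag = false;   // the reservation is consumed either way
        return success;
    }

    int main()
    {
        LockState xc = { 0, false };
        handleLockedRead(xc, 0x1000);
        assert(handleLockedWrite(xc, 0x1000));    // paired SC succeeds
        assert(!handleLockedWrite(xc, 0x1000));   // reservation already used
        return 0;
    }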
diff --git a/src/cpu/ozone/cpu_builder.cc b/src/cpu/ozone/cpu_builder.cc
index 39337dbff..155f0ce09 100644
--- a/src/cpu/ozone/cpu_builder.cc
+++ b/src/cpu/ozone/cpu_builder.cc
@@ -64,6 +64,10 @@ Param<int> cpu_id;
SimObjectParam<TheISA::ITB *> itb;
SimObjectParam<TheISA::DTB *> dtb;
Param<Tick> profile;
+
+Param<bool> do_quiesce;
+Param<bool> do_checkpoint_insts;
+Param<bool> do_statistics_insts;
#else
SimObjectVectorParam<Process *> workload;
//SimObjectParam<PageTable *> page_table;
@@ -184,6 +188,9 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivOzoneCPU)
INIT_PARAM(itb, "Instruction translation buffer"),
INIT_PARAM(dtb, "Data translation buffer"),
INIT_PARAM(profile, ""),
+ INIT_PARAM(do_quiesce, ""),
+ INIT_PARAM(do_checkpoint_insts, ""),
+ INIT_PARAM(do_statistics_insts, ""),
#else
INIT_PARAM(workload, "Processes to run"),
// INIT_PARAM(page_table, "Page table"),
@@ -341,6 +348,9 @@ CREATE_SIM_OBJECT(DerivOzoneCPU)
params->itb = itb;
params->dtb = dtb;
params->profile = profile;
+ params->do_quiesce = do_quiesce;
+ params->do_checkpoint_insts = do_checkpoint_insts;
+ params->do_statistics_insts = do_statistics_insts;
#else
params->workload = workload;
// params->pTable = page_table;
diff --git a/src/cpu/simple/atomic.cc b/src/cpu/simple/atomic.cc
index f94ea0917..58dc1fe5f 100644
--- a/src/cpu/simple/atomic.cc
+++ b/src/cpu/simple/atomic.cc
@@ -503,6 +503,10 @@ BEGIN_DECLARE_SIM_OBJECT_PARAMS(AtomicSimpleCPU)
SimObjectParam<TheISA::ITB *> itb;
SimObjectParam<TheISA::DTB *> dtb;
Param<Tick> profile;
+
+ Param<bool> do_quiesce;
+ Param<bool> do_checkpoint_insts;
+ Param<bool> do_statistics_insts;
#else
SimObjectParam<Process *> workload;
#endif // FULL_SYSTEM
@@ -535,6 +539,9 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(AtomicSimpleCPU)
INIT_PARAM(itb, "Instruction TLB"),
INIT_PARAM(dtb, "Data TLB"),
INIT_PARAM(profile, ""),
+ INIT_PARAM(do_quiesce, ""),
+ INIT_PARAM(do_checkpoint_insts, ""),
+ INIT_PARAM(do_statistics_insts, ""),
#else
INIT_PARAM(workload, "processes to run"),
#endif // FULL_SYSTEM
@@ -572,6 +579,9 @@ CREATE_SIM_OBJECT(AtomicSimpleCPU)
params->itb = itb;
params->dtb = dtb;
params->profile = profile;
+ params->do_quiesce = do_quiesce;
+ params->do_checkpoint_insts = do_checkpoint_insts;
+ params->do_statistics_insts = do_statistics_insts;
#else
params->process = workload;
#endif
diff --git a/src/cpu/simple/timing.cc b/src/cpu/simple/timing.cc
index abf316095..db2c940c0 100644
--- a/src/cpu/simple/timing.cc
+++ b/src/cpu/simple/timing.cc
@@ -665,6 +665,10 @@ BEGIN_DECLARE_SIM_OBJECT_PARAMS(TimingSimpleCPU)
SimObjectParam<TheISA::ITB *> itb;
SimObjectParam<TheISA::DTB *> dtb;
Param<Tick> profile;
+
+ Param<bool> do_quiesce;
+ Param<bool> do_checkpoint_insts;
+ Param<bool> do_statistics_insts;
#else
SimObjectParam<Process *> workload;
#endif // FULL_SYSTEM
@@ -697,6 +701,9 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(TimingSimpleCPU)
INIT_PARAM(itb, "Instruction TLB"),
INIT_PARAM(dtb, "Data TLB"),
INIT_PARAM(profile, ""),
+ INIT_PARAM(do_quiesce, ""),
+ INIT_PARAM(do_checkpoint_insts, ""),
+ INIT_PARAM(do_statistics_insts, ""),
#else
INIT_PARAM(workload, "processes to run"),
#endif // FULL_SYSTEM
@@ -732,6 +739,9 @@ CREATE_SIM_OBJECT(TimingSimpleCPU)
params->itb = itb;
params->dtb = dtb;
params->profile = profile;
+ params->do_quiesce = do_quiesce;
+ params->do_checkpoint_insts = do_checkpoint_insts;
+ params->do_statistics_insts = do_statistics_insts;
#else
params->process = workload;
#endif
diff --git a/src/dev/ethertap.cc b/src/dev/ethertap.cc
index 2d72383c5..65089a8b2 100644
--- a/src/dev/ethertap.cc
+++ b/src/dev/ethertap.cc
@@ -178,7 +178,7 @@ EtherTap::recvPacket(EthPacketPtr packet)
DPRINTF(Ethernet, "EtherTap output len=%d\n", packet->length);
DDUMP(EthernetData, packet->data, packet->length);
- u_int32_t len = htonl(packet->length);
+ uint32_t len = htonl(packet->length);
write(socket, &len, sizeof(len));
write(socket, packet->data, packet->length);
@@ -199,11 +199,11 @@ EtherTap::process(int revent)
return;
}
- char *data = buffer + sizeof(u_int32_t);
+ char *data = buffer + sizeof(uint32_t);
if (!(revent & POLLIN))
return;
- if (buffer_offset < data_len + sizeof(u_int32_t)) {
+ if (buffer_offset < data_len + sizeof(uint32_t)) {
int len = read(socket, buffer + buffer_offset, buflen - buffer_offset);
if (len == 0) {
detach();
@@ -213,23 +213,23 @@ EtherTap::process(int revent)
buffer_offset += len;
if (data_len == 0)
- data_len = ntohl(*(u_int32_t *)buffer);
+ data_len = ntohl(*(uint32_t *)buffer);
DPRINTF(Ethernet, "Received data from peer: len=%d buffer_offset=%d "
"data_len=%d\n", len, buffer_offset, data_len);
}
- while (data_len != 0 && buffer_offset >= data_len + sizeof(u_int32_t)) {
+ while (data_len != 0 && buffer_offset >= data_len + sizeof(uint32_t)) {
EthPacketPtr packet;
packet = new EthPacketData(data_len);
packet->length = data_len;
memcpy(packet->data, data, data_len);
- buffer_offset -= data_len + sizeof(u_int32_t);
+ buffer_offset -= data_len + sizeof(uint32_t);
assert(buffer_offset >= 0);
if (buffer_offset > 0) {
memmove(buffer, data + data_len, buffer_offset);
- data_len = ntohl(*(u_int32_t *)buffer);
+ data_len = ntohl(*(uint32_t *)buffer);
} else
data_len = 0;
diff --git a/src/dev/ide_atareg.h b/src/dev/ide_atareg.h
index 5320529c8..df16d09d5 100644
--- a/src/dev/ide_atareg.h
+++ b/src/dev/ide_atareg.h
@@ -35,11 +35,25 @@
#if defined(linux)
#include <endian.h>
+#elif defined(__sun__)
+#include <sys/isa_defs.h>
#else
#include <machine/endian.h>
#endif
+#ifdef LITTLE_ENDIAN
#define ATA_BYTE_ORDER LITTLE_ENDIAN
+#elif defined(BIG_ENDIAN)
+#define ATA_BYTE_ORDER BIG_ENDIAN
+#elif defined(_LITTLE_ENDIAN)
+#define ATA_BYTE_ORDER 1
+#define LITTLE_ENDIAN 1
+#elif defined(_BIG_ENDIAN)
+#define ATA_BYTE_ORDER 0
+#define LITTLE_ENDIAN 1
+#else
+#error "No endianess defined"
+#endif
/*
* Drive parameter structure for ATA/ATAPI.
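The block above reconciles the Linux, BSD, and Solaris spellings of the byte-order macros into ATA_BYTE_ORDER at preprocessor time. For comparison, a small standalone C++ check of host byte order at run time, independent of those system headers:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    bool hostIsLittleEndian()
    {
        const uint32_t probe = 1;
        uint8_t firstByte;
        std::memcpy(&firstByte, &probe, 1);
        // On a little-endian host the least significant byte comes first.
        return firstByte == 1;
    }

    int main()
    {
        std::printf("host is %s-endian\n",
                    hostIsLittleEndian() ? "little" : "big");
        return 0;
    }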
diff --git a/src/dev/pcidev.cc b/src/dev/pcidev.cc
index 383fc494f..1c2465dd1 100644
--- a/src/dev/pcidev.cc
+++ b/src/dev/pcidev.cc
@@ -387,33 +387,33 @@ CREATE_SIM_OBJECT(PciConfigData)
{
PciConfigData *data = new PciConfigData(getInstanceName());
- data->config.vendor = htole(VendorID);
- data->config.device = htole(DeviceID);
- data->config.command = htole(Command);
- data->config.status = htole(Status);
- data->config.revision = htole(Revision);
- data->config.progIF = htole(ProgIF);
- data->config.subClassCode = htole(SubClassCode);
- data->config.classCode = htole(ClassCode);
- data->config.cacheLineSize = htole(CacheLineSize);
- data->config.latencyTimer = htole(LatencyTimer);
- data->config.headerType = htole(HeaderType);
- data->config.bist = htole(BIST);
-
- data->config.baseAddr[0] = htole(BAR0);
- data->config.baseAddr[1] = htole(BAR1);
- data->config.baseAddr[2] = htole(BAR2);
- data->config.baseAddr[3] = htole(BAR3);
- data->config.baseAddr[4] = htole(BAR4);
- data->config.baseAddr[5] = htole(BAR5);
- data->config.cardbusCIS = htole(CardbusCIS);
- data->config.subsystemVendorID = htole(SubsystemVendorID);
- data->config.subsystemID = htole(SubsystemID);
- data->config.expansionROM = htole(ExpansionROM);
- data->config.interruptLine = htole(InterruptLine);
- data->config.interruptPin = htole(InterruptPin);
- data->config.minimumGrant = htole(MinimumGrant);
- data->config.maximumLatency = htole(MaximumLatency);
+ data->config.vendor = htole(VendorID.returnValue());
+ data->config.device = htole(DeviceID.returnValue());
+ data->config.command = htole(Command.returnValue());
+ data->config.status = htole(Status.returnValue());
+ data->config.revision = htole(Revision.returnValue());
+ data->config.progIF = htole(ProgIF.returnValue());
+ data->config.subClassCode = htole(SubClassCode.returnValue());
+ data->config.classCode = htole(ClassCode.returnValue());
+ data->config.cacheLineSize = htole(CacheLineSize.returnValue());
+ data->config.latencyTimer = htole(LatencyTimer.returnValue());
+ data->config.headerType = htole(HeaderType.returnValue());
+ data->config.bist = htole(BIST.returnValue());
+
+ data->config.baseAddr[0] = htole(BAR0.returnValue());
+ data->config.baseAddr[1] = htole(BAR1.returnValue());
+ data->config.baseAddr[2] = htole(BAR2.returnValue());
+ data->config.baseAddr[3] = htole(BAR3.returnValue());
+ data->config.baseAddr[4] = htole(BAR4.returnValue());
+ data->config.baseAddr[5] = htole(BAR5.returnValue());
+ data->config.cardbusCIS = htole(CardbusCIS.returnValue());
+ data->config.subsystemVendorID = htole(SubsystemVendorID.returnValue());
+ data->config.subsystemID = htole(SubsystemID.returnValue());
+ data->config.expansionROM = htole(ExpansionROM.returnValue());
+ data->config.interruptLine = htole(InterruptLine.returnValue());
+ data->config.interruptPin = htole(InterruptPin.returnValue());
+ data->config.minimumGrant = htole(MinimumGrant.returnValue());
+ data->config.maximumLatency = htole(MaximumLatency.returnValue());
data->BARSize[0] = BAR0Size;
data->BARSize[1] = BAR1Size;
diff --git a/src/mem/cache/base_cache.cc b/src/mem/cache/base_cache.cc
index c26d7782b..c16cb6945 100644
--- a/src/mem/cache/base_cache.cc
+++ b/src/mem/cache/base_cache.cc
@@ -102,21 +102,56 @@ BaseCache::CachePort::recvAtomic(PacketPtr pkt)
return cache->doAtomicAccess(pkt, isCpuSide);
}
-void
-BaseCache::CachePort::recvFunctional(PacketPtr pkt)
+bool
+BaseCache::CachePort::checkFunctional(PacketPtr pkt)
{
//Check storage here first
list<PacketPtr>::iterator i = drainList.begin();
- list<PacketPtr>::iterator end = drainList.end();
- for (; i != end; ++i) {
+ list<PacketPtr>::iterator iend = drainList.end();
+ bool notDone = true;
+ while (i != iend && notDone) {
PacketPtr target = *i;
// If the target contains data, and it overlaps the
// probed request, need to update data
if (target->intersect(pkt)) {
- fixPacket(pkt, target);
+ DPRINTF(Cache, "Functional %s access to blk_addr %x intersects a drain\n",
+ pkt->cmdString(), pkt->getAddr() & ~(cache->getBlockSize() - 1));
+ notDone = fixPacket(pkt, target);
}
+ i++;
}
- cache->doFunctionalAccess(pkt, isCpuSide);
+ //Also check the response not yet ready to be on the list
+ std::list<std::pair<Tick,PacketPtr> >::iterator j = transmitList.begin();
+ std::list<std::pair<Tick,PacketPtr> >::iterator jend = transmitList.end();
+
+ while (j != jend && notDone) {
+ PacketPtr target = j->second;
+ // If the target contains data, and it overlaps the
+ // probed request, need to update data
+ if (target->intersect(pkt)) {
+ DPRINTF(Cache, "Functional %s access to blk_addr %x intersects a response\n",
+ pkt->cmdString(), pkt->getAddr() & ~(cache->getBlockSize() - 1));
+ notDone = fixDelayedResponsePacket(pkt, target);
+ }
+ j++;
+ }
+ return notDone;
+}
+
+void
+BaseCache::CachePort::recvFunctional(PacketPtr pkt)
+{
+ bool notDone = checkFunctional(pkt);
+ if (notDone)
+ cache->doFunctionalAccess(pkt, isCpuSide);
+}
+
+void
+BaseCache::CachePort::checkAndSendFunctional(PacketPtr pkt)
+{
+ bool notDone = checkFunctional(pkt);
+ if (notDone)
+ sendFunctional(pkt);
}
void
@@ -135,7 +170,7 @@ BaseCache::CachePort::recvRetry()
isCpuSide && cache->doSlaveRequest()) {
DPRINTF(CachePort, "%s has more responses/requests\n", name());
- BaseCache::CacheEvent * reqCpu = new BaseCache::CacheEvent(this);
+ BaseCache::CacheEvent * reqCpu = new BaseCache::CacheEvent(this, false);
reqCpu->schedule(curTick + 1);
}
waitingOnRetry = false;
@@ -176,7 +211,7 @@ BaseCache::CachePort::recvRetry()
{
DPRINTF(CachePort, "%s has more requests\n", name());
//Still more to issue, rerequest in 1 cycle
- BaseCache::CacheEvent * reqCpu = new BaseCache::CacheEvent(this);
+ BaseCache::CacheEvent * reqCpu = new BaseCache::CacheEvent(this, false);
reqCpu->schedule(curTick + 1);
}
}
@@ -194,7 +229,7 @@ BaseCache::CachePort::recvRetry()
{
DPRINTF(CachePort, "%s has more requests\n", name());
//Still more to issue, rerequest in 1 cycle
- BaseCache::CacheEvent * reqCpu = new BaseCache::CacheEvent(this);
+ BaseCache::CacheEvent * reqCpu = new BaseCache::CacheEvent(this, false);
reqCpu->schedule(curTick + 1);
}
}
@@ -226,23 +261,19 @@ BaseCache::CachePort::clearBlocked()
}
}
-BaseCache::CacheEvent::CacheEvent(CachePort *_cachePort)
- : Event(&mainEventQueue, CPU_Tick_Pri), cachePort(_cachePort)
+BaseCache::CacheEvent::CacheEvent(CachePort *_cachePort, bool _newResponse)
+ : Event(&mainEventQueue, CPU_Tick_Pri), cachePort(_cachePort),
+ newResponse(_newResponse)
{
- this->setFlags(AutoDelete);
+ if (!newResponse)
+ this->setFlags(AutoDelete);
pkt = NULL;
}
-BaseCache::CacheEvent::CacheEvent(CachePort *_cachePort, PacketPtr _pkt)
- : Event(&mainEventQueue, CPU_Tick_Pri), cachePort(_cachePort), pkt(_pkt)
-{
- this->setFlags(AutoDelete);
-}
-
void
BaseCache::CacheEvent::process()
{
- if (!pkt)
+ if (!newResponse)
{
if (cachePort->waitingOnRetry) return;
//We have some responses to drain first
@@ -322,8 +353,16 @@ BaseCache::CacheEvent::process()
}
return;
}
- //Response
- //Know the packet to send
+ //Else it's a response
+ assert(cachePort->transmitList.size());
+ assert(cachePort->transmitList.front().first <= curTick);
+ pkt = cachePort->transmitList.front().second;
+ cachePort->transmitList.pop_front();
+ if (!cachePort->transmitList.empty()) {
+ Tick time = cachePort->transmitList.front().first;
+ schedule(time <= curTick ? curTick+1 : time);
+ }
+
if (pkt->flags & NACKED_LINE)
pkt->result = Packet::Nacked;
else
@@ -343,7 +382,7 @@ BaseCache::CacheEvent::process()
}
// Check if we're done draining once this list is empty
- if (cachePort->drainList.empty())
+ if (cachePort->drainList.empty() && cachePort->transmitList.empty())
cachePort->cache->checkDrain();
}
@@ -358,8 +397,10 @@ BaseCache::getPort(const std::string &if_name, int idx)
{
if (if_name == "")
{
- if(cpuSidePort == NULL)
+ if(cpuSidePort == NULL) {
cpuSidePort = new CachePort(name() + "-cpu_side_port", this, true);
+ sendEvent = new CacheEvent(cpuSidePort, true);
+ }
return cpuSidePort;
}
else if (if_name == "functional")
@@ -368,8 +409,10 @@ BaseCache::getPort(const std::string &if_name, int idx)
}
else if (if_name == "cpu_side")
{
- if(cpuSidePort == NULL)
+ if(cpuSidePort == NULL) {
cpuSidePort = new CachePort(name() + "-cpu_side_port", this, true);
+ sendEvent = new CacheEvent(cpuSidePort, true);
+ }
return cpuSidePort;
}
else if (if_name == "mem_side")
@@ -377,6 +420,7 @@ BaseCache::getPort(const std::string &if_name, int idx)
if (memSidePort != NULL)
panic("Already have a mem side for this cache\n");
memSidePort = new CachePort(name() + "-mem_side_port", this, false);
+ memSendEvent = new CacheEvent(memSidePort, true);
return memSidePort;
}
else panic("Port name %s unrecognized\n", if_name);
diff --git a/src/mem/cache/base_cache.hh b/src/mem/cache/base_cache.hh
index ea7544fbb..584c2d5df 100644
--- a/src/mem/cache/base_cache.hh
+++ b/src/mem/cache/base_cache.hh
@@ -105,7 +105,11 @@ class BaseCache : public MemObject
void clearBlocked();
- bool canDrain() { return drainList.empty(); }
+ bool checkFunctional(PacketPtr pkt);
+
+ void checkAndSendFunctional(PacketPtr pkt);
+
+ bool canDrain() { return drainList.empty() && transmitList.empty(); }
bool blocked;
@@ -117,15 +121,16 @@ class BaseCache : public MemObject
std::list<PacketPtr> drainList;
+ std::list<std::pair<Tick,PacketPtr> > transmitList;
};
struct CacheEvent : public Event
{
CachePort *cachePort;
PacketPtr pkt;
+ bool newResponse;
- CacheEvent(CachePort *_cachePort);
- CacheEvent(CachePort *_cachePort, PacketPtr _pkt);
+ CacheEvent(CachePort *_cachePort, bool response);
void process();
const char *description();
};
@@ -133,6 +138,9 @@ class BaseCache : public MemObject
public: //Made public so coherence can get at it.
CachePort *cpuSidePort;
+ CacheEvent *sendEvent;
+ CacheEvent *memSendEvent;
+
protected:
CachePort *memSidePort;
@@ -353,6 +361,12 @@ class BaseCache : public MemObject
snoopRangesSent = false;
}
+ ~BaseCache()
+ {
+ delete sendEvent;
+ delete memSendEvent;
+ }
+
virtual void init();
/**
@@ -467,7 +481,8 @@ class BaseCache : public MemObject
{
if (!doMasterRequest() && !memSidePort->waitingOnRetry)
{
- BaseCache::CacheEvent * reqCpu = new BaseCache::CacheEvent(memSidePort);
+ BaseCache::CacheEvent * reqCpu =
+ new BaseCache::CacheEvent(memSidePort, false);
reqCpu->schedule(time);
}
uint8_t flag = 1<<cause;
@@ -503,7 +518,8 @@ class BaseCache : public MemObject
{
if (!doSlaveRequest() && !cpuSidePort->waitingOnRetry)
{
- BaseCache::CacheEvent * reqCpu = new BaseCache::CacheEvent(cpuSidePort);
+ BaseCache::CacheEvent * reqCpu =
+ new BaseCache::CacheEvent(cpuSidePort, false);
reqCpu->schedule(time);
}
uint8_t flag = 1<<cause;
@@ -528,9 +544,44 @@ class BaseCache : public MemObject
*/
void respond(PacketPtr pkt, Tick time)
{
+ assert(time >= curTick);
if (pkt->needsResponse()) {
- CacheEvent *reqCpu = new CacheEvent(cpuSidePort, pkt);
+/* CacheEvent *reqCpu = new CacheEvent(cpuSidePort, pkt);
reqCpu->schedule(time);
+*/
+ if (cpuSidePort->transmitList.empty()) {
+ assert(!sendEvent->scheduled());
+ sendEvent->schedule(time);
+ cpuSidePort->transmitList.push_back(std::pair<Tick,PacketPtr>
+ (time,pkt));
+ return;
+ }
+
+ // something is on the list and this belongs at the end
+ if (time >= cpuSidePort->transmitList.back().first) {
+ cpuSidePort->transmitList.push_back(std::pair<Tick,PacketPtr>
+ (time,pkt));
+ return;
+ }
+ // Something is on the list and this belongs somewhere else
+ std::list<std::pair<Tick,PacketPtr> >::iterator i =
+ cpuSidePort->transmitList.begin();
+ std::list<std::pair<Tick,PacketPtr> >::iterator end =
+ cpuSidePort->transmitList.end();
+ bool done = false;
+
+ while (i != end && !done) {
+ if (time < i->first) {
+ if (i == cpuSidePort->transmitList.begin()) {
+ //Inserting at begining, reschedule
+ sendEvent->reschedule(time);
+ }
+ cpuSidePort->transmitList.insert(i,std::pair<Tick,PacketPtr>
+ (time,pkt));
+ done = true;
+ }
+ i++;
+ }
}
else {
if (pkt->cmd != Packet::UpgradeReq)
@@ -548,12 +599,48 @@ class BaseCache : public MemObject
*/
void respondToMiss(PacketPtr pkt, Tick time)
{
+ assert(time >= curTick);
if (!pkt->req->isUncacheable()) {
- missLatency[pkt->cmdToIndex()][0/*pkt->req->getThreadNum()*/] += time - pkt->time;
+ missLatency[pkt->cmdToIndex()][0/*pkt->req->getThreadNum()*/] +=
+ time - pkt->time;
}
if (pkt->needsResponse()) {
- CacheEvent *reqCpu = new CacheEvent(cpuSidePort, pkt);
+/* CacheEvent *reqCpu = new CacheEvent(cpuSidePort, pkt);
reqCpu->schedule(time);
+*/
+ if (cpuSidePort->transmitList.empty()) {
+ assert(!sendEvent->scheduled());
+ sendEvent->schedule(time);
+ cpuSidePort->transmitList.push_back(std::pair<Tick,PacketPtr>
+ (time,pkt));
+ return;
+ }
+
+ // something is on the list and this belongs at the end
+ if (time >= cpuSidePort->transmitList.back().first) {
+ cpuSidePort->transmitList.push_back(std::pair<Tick,PacketPtr>
+ (time,pkt));
+ return;
+ }
+ // Something is on the list and this belongs somewhere else
+ std::list<std::pair<Tick,PacketPtr> >::iterator i =
+ cpuSidePort->transmitList.begin();
+ std::list<std::pair<Tick,PacketPtr> >::iterator end =
+ cpuSidePort->transmitList.end();
+ bool done = false;
+
+ while (i != end && !done) {
+ if (time < i->first) {
+ if (i == cpuSidePort->transmitList.begin()) {
+ //Inserting at begining, reschedule
+ sendEvent->reschedule(time);
+ }
+ cpuSidePort->transmitList.insert(i,std::pair<Tick,PacketPtr>
+ (time,pkt));
+ done = true;
+ }
+ i++;
+ }
}
else {
if (pkt->cmd != Packet::UpgradeReq)
@@ -570,9 +657,43 @@ class BaseCache : public MemObject
*/
void respondToSnoop(PacketPtr pkt, Tick time)
{
+ assert(time >= curTick);
assert (pkt->needsResponse());
- CacheEvent *reqMem = new CacheEvent(memSidePort, pkt);
+/* CacheEvent *reqMem = new CacheEvent(memSidePort, pkt);
reqMem->schedule(time);
+*/
+ if (memSidePort->transmitList.empty()) {
+ assert(!memSendEvent->scheduled());
+ memSendEvent->schedule(time);
+ memSidePort->transmitList.push_back(std::pair<Tick,PacketPtr>
+ (time,pkt));
+ return;
+ }
+
+ // something is on the list and this belongs at the end
+ if (time >= memSidePort->transmitList.back().first) {
+ memSidePort->transmitList.push_back(std::pair<Tick,PacketPtr>
+ (time,pkt));
+ return;
+ }
+ // Something is on the list and this belongs somewhere else
+ std::list<std::pair<Tick,PacketPtr> >::iterator i =
+ memSidePort->transmitList.begin();
+ std::list<std::pair<Tick,PacketPtr> >::iterator end =
+ memSidePort->transmitList.end();
+ bool done = false;
+
+ while (i != end && !done) {
+ if (time < i->first) {
+ if (i == memSidePort->transmitList.begin()) {
+ //Inserting at begining, reschedule
+ memSendEvent->reschedule(time);
+ }
+ memSidePort->transmitList.insert(i,std::pair<Tick,PacketPtr>(time,pkt));
+ done = true;
+ }
+ i++;
+ }
}
/**
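respond(), respondToMiss(), and respondToSnoop() above all queue the packet on a per-port transmitList kept ordered by send tick, scheduling (or rescheduling) the port's single send event for the earliest entry. A condensed sketch of that ordered insertion, with a placeholder packet type and a stub event in place of the real CacheEvent:

    #include <cstdio>
    #include <list>
    #include <utility>

    typedef long long Tick;
    struct Packet {};             // placeholder for PacketPtr

    // Stand-in for the port's single send event.
    struct SendEvent {
        Tick when;
        bool active;
        void schedule(Tick t)   { when = t; active = true; }
        void reschedule(Tick t) { when = t; }
    };

    typedef std::list<std::pair<Tick, Packet *> > TransmitList;

    void queueSend(TransmitList &xmit, SendEvent &ev, Tick time, Packet *pkt)
    {
        if (xmit.empty() || time >= xmit.back().first) {
            // Empty list, or this packet belongs at the end: append.
            if (xmit.empty())
                ev.schedule(time);
            xmit.push_back(std::make_pair(time, pkt));
            return;
        }
        // Otherwise walk to the first later entry and insert in front of it.
        for (TransmitList::iterator i = xmit.begin(); i != xmit.end(); ++i) {
            if (time < i->first) {
                if (i == xmit.begin())
                    ev.reschedule(time);   // new earliest entry
                xmit.insert(i, std::make_pair(time, pkt));
                return;
            }
        }
    }

    int main()
    {
        TransmitList xmit;
        SendEvent ev = { 0, false };
        Packet a, b, c;
        queueSend(xmit, ev, 100, &a);
        queueSend(xmit, ev, 50, &b);   // becomes the new head; event rescheduled
        queueSend(xmit, ev, 200, &c);
        std::printf("head tick %lld, event at %lld\n",
                    xmit.front().first, ev.when);
        return 0;
    }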
diff --git a/src/mem/cache/cache_impl.hh b/src/mem/cache/cache_impl.hh
index 9bb72e85c..df59b0a4f 100644
--- a/src/mem/cache/cache_impl.hh
+++ b/src/mem/cache/cache_impl.hh
@@ -53,6 +53,8 @@
#include "sim/sim_exit.hh" // for SimExitEvent
+bool SIGNAL_NACK_HACK;
+
template<class TagStore, class Buffering, class Coherence>
bool
Cache<TagStore,Buffering,Coherence>::
@@ -242,6 +244,11 @@ Cache<TagStore,Buffering,Coherence>::access(PacketPtr &pkt)
missQueue->handleMiss(pkt, size, curTick + hitLatency);
}
+ if (pkt->cmd == Packet::Writeback) {
+ //Need to clean up the packet on a writeback miss, but leave the request
+ delete pkt;
+ }
+
return true;
}
@@ -265,6 +272,7 @@ Cache<TagStore,Buffering,Coherence>::getPacket()
assert(!doMasterRequest() || missQueue->havePending());
assert(!pkt || pkt->time <= curTick);
+ SIGNAL_NACK_HACK = false;
return pkt;
}
@@ -273,16 +281,15 @@ void
Cache<TagStore,Buffering,Coherence>::sendResult(PacketPtr &pkt, MSHR* mshr,
bool success)
{
- if (success && !(pkt && (pkt->flags & NACKED_LINE))) {
- if (!mshr->pkt->needsResponse()
- && !(mshr->pkt->cmd == Packet::UpgradeReq)
- && (pkt && (pkt->flags & SATISFIED))) {
- //Writeback, clean up the non copy version of the packet
- delete pkt;
- }
+ if (success && !(SIGNAL_NACK_HACK)) {
+        //Remember whether it was an upgrade, because writeback MSHRs are
+        //removed in markInService()
+ bool upgrade = (mshr->pkt && mshr->pkt->cmd == Packet::UpgradeReq);
+
missQueue->markInService(mshr->pkt, mshr);
+
//Temp Hack for UPGRADES
- if (mshr->pkt && mshr->pkt->cmd == Packet::UpgradeReq) {
+ if (upgrade) {
assert(pkt); //Upgrades need to be fixed
pkt->flags &= ~CACHE_LINE_FILL;
BlkType *blk = tags->findBlock(pkt);
@@ -300,6 +307,7 @@ Cache<TagStore,Buffering,Coherence>::sendResult(PacketPtr &pkt, MSHR* mshr,
}
} else if (pkt && !pkt->req->isUncacheable()) {
pkt->flags &= ~NACKED_LINE;
+ SIGNAL_NACK_HACK = false;
pkt->flags &= ~SATISFIED;
pkt->flags &= ~SNOOP_COMMIT;
@@ -333,6 +341,8 @@ Cache<TagStore,Buffering,Coherence>::handleResponse(PacketPtr &pkt)
DPRINTF(Cache, "Handling reponse to %x\n", pkt->getAddr());
if (pkt->isCacheFill() && !pkt->isNoAllocate()) {
+ DPRINTF(Cache, "Block for addr %x being updated in Cache\n",
+ pkt->getAddr());
blk = tags->findBlock(pkt);
CacheBlk::State old_state = (blk) ? blk->status : 0;
PacketList writebacks;
@@ -402,6 +412,7 @@ Cache<TagStore,Buffering,Coherence>::snoop(PacketPtr &pkt)
assert(!(pkt->flags & SATISFIED));
pkt->flags |= SATISFIED;
pkt->flags |= NACKED_LINE;
+ SIGNAL_NACK_HACK = true;
///@todo NACK's from other levels
//warn("NACKs from devices not connected to the same bus "
//"not implemented\n");
@@ -474,6 +485,13 @@ Cache<TagStore,Buffering,Coherence>::snoop(PacketPtr &pkt)
}
CacheBlk::State new_state;
bool satisfy = coherence->handleBusRequest(pkt,blk,mshr, new_state);
+
+ if (blk && mshr && !mshr->inService && new_state == 0) {
+        //There was an outstanding write to a shared block; it now needs a
+        //ReadEx and an allocation rather than just an update, so clear the
+        //NO_ALLOCATE flag in the MSHR
+ mshr->pkt->flags &= ~NO_ALLOCATE;
+ }
+
if (satisfy) {
DPRINTF(Cache, "Cache snooped a %s request for addr %x and "
"now supplying data, new state is %i\n",
@@ -486,6 +504,7 @@ Cache<TagStore,Buffering,Coherence>::snoop(PacketPtr &pkt)
if (blk)
DPRINTF(Cache, "Cache snooped a %s request for addr %x, "
"new state is %i\n", pkt->cmdString(), blk_addr, new_state);
+
tags->handleSnoop(blk, new_state);
}
@@ -534,9 +553,9 @@ Cache<TagStore,Buffering,Coherence>::probe(PacketPtr &pkt, bool update,
}
}
- if (!update && (pkt->isWrite() || (otherSidePort == cpuSidePort))) {
+ if (!update && (otherSidePort == cpuSidePort)) {
// Still need to change data in all locations.
- otherSidePort->sendFunctional(pkt);
+ otherSidePort->checkAndSendFunctional(pkt);
if (pkt->isRead() && pkt->result == Packet::Success)
return 0;
}
@@ -560,30 +579,33 @@ Cache<TagStore,Buffering,Coherence>::probe(PacketPtr &pkt, bool update,
missQueue->findWrites(blk_addr, writes);
if (!update) {
+        bool notDone = !(pkt->flags & SATISFIED); //Already done if the access hit a block above
// Check for data in MSHR and writebuffer.
if (mshr) {
MSHR::TargetList *targets = mshr->getTargetList();
MSHR::TargetList::iterator i = targets->begin();
MSHR::TargetList::iterator end = targets->end();
- for (; i != end; ++i) {
+ for (; i != end && notDone; ++i) {
PacketPtr target = *i;
// If the target contains data, and it overlaps the
// probed request, need to update data
if (target->intersect(pkt)) {
- fixPacket(pkt, target);
+ DPRINTF(Cache, "Functional %s access to blk_addr %x intersects a MSHR\n",
+ pkt->cmdString(), blk_addr);
+ notDone = fixPacket(pkt, target);
}
}
}
- for (int i = 0; i < writes.size(); ++i) {
+ for (int i = 0; i < writes.size() && notDone; ++i) {
PacketPtr write = writes[i]->pkt;
if (write->intersect(pkt)) {
- fixPacket(pkt, write);
+ DPRINTF(Cache, "Functional %s access to blk_addr %x intersects a writeback\n",
+ pkt->cmdString(), blk_addr);
+ notDone = fixPacket(pkt, write);
}
}
- if (pkt->isRead()
- && pkt->result != Packet::Success
- && otherSidePort == memSidePort) {
- otherSidePort->sendFunctional(pkt);
+ if (notDone && otherSidePort == memSidePort) {
+ otherSidePort->checkAndSendFunctional(pkt);
assert(pkt->result == Packet::Success);
}
return 0;
diff --git a/src/mem/cache/miss/mshr_queue.cc b/src/mem/cache/miss/mshr_queue.cc
index d3a7a7933..6cb62429d 100644
--- a/src/mem/cache/miss/mshr_queue.cc
+++ b/src/mem/cache/miss/mshr_queue.cc
@@ -198,11 +198,6 @@ MSHRQueue::markInService(MSHR* mshr)
//assert(mshr == pendingList.front());
if (!mshr->pkt->needsResponse() && !(mshr->pkt->cmd == Packet::UpgradeReq)) {
assert(mshr->getNumTargets() == 0);
- if ((mshr->pkt->flags & SATISFIED) && (mshr->pkt->cmd == Packet::Writeback)) {
- //Writeback hit, so delete it
- //otherwise the consumer will delete it
- delete mshr->pkt->req;
- }
deallocate(mshr);
return;
}
diff --git a/src/mem/packet.cc b/src/mem/packet.cc
index a342af634..e2faf4527 100644
--- a/src/mem/packet.cc
+++ b/src/mem/packet.cc
@@ -144,6 +144,24 @@ Packet::intersect(PacketPtr p)
}
bool
+fixDelayedResponsePacket(PacketPtr func, PacketPtr timing)
+{
+ bool result;
+
+ if (timing->isRead() || timing->isWrite()) {
+ timing->toggleData();
+ result = fixPacket(func, timing);
+ timing->toggleData();
+ }
+ else {
+ //Don't toggle if it isn't a read/write response
+ result = fixPacket(func, timing);
+ }
+
+ return result;
+}
+
+bool
fixPacket(PacketPtr func, PacketPtr timing)
{
Addr funcStart = func->getAddr();
@@ -168,6 +186,7 @@ fixPacket(PacketPtr func, PacketPtr timing)
memcpy(func->getPtr<uint8_t>(), timing->getPtr<uint8_t>() +
funcStart - timingStart, func->getSize());
func->result = Packet::Success;
+ func->flags |= SATISFIED;
return false;
} else {
// In this case the timing packet only partially satisfies the
@@ -182,11 +201,11 @@ fixPacket(PacketPtr func, PacketPtr timing)
if (funcStart >= timingStart) {
memcpy(timing->getPtr<uint8_t>() + (funcStart - timingStart),
func->getPtr<uint8_t>(),
- std::min(funcEnd, timingEnd) - funcStart);
+ (std::min(funcEnd, timingEnd) - funcStart) + 1);
} else { // timingStart > funcStart
memcpy(timing->getPtr<uint8_t>(),
func->getPtr<uint8_t>() + (timingStart - funcStart),
- std::min(funcEnd, timingEnd) - timingStart);
+ (std::min(funcEnd, timingEnd) - timingStart) + 1);
}
// we always want to keep going with a write
return true;
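
Editor's note: the +1 added to both memcpy lengths above follows from funcEnd and timingEnd being inclusive last-byte addresses (start + size - 1), so an overlap of N bytes spans min(funcEnd, timingEnd) - max(funcStart, timingStart) + 1 addresses. A standalone check of that arithmetic, using made-up addresses purely for illustration:

#include <algorithm>
#include <cassert>
#include <cstdint>

typedef uint64_t Addr;

// Overlap length between two byte ranges whose *End values are inclusive
// last-byte addresses, mirroring funcEnd = funcStart + func->getSize() - 1
// in fixPacket().
Addr
overlapBytes(Addr funcStart, Addr funcEnd, Addr timingStart, Addr timingEnd)
{
    Addr start = std::max(funcStart, timingStart);
    Addr end = std::min(funcEnd, timingEnd);
    return end - start + 1;   // inclusive ends, hence the +1
}

int
main()
{
    // Hypothetical addresses: func covers [0x100,0x107] (8 bytes), the
    // queued timing packet covers [0x104,0x10b].  They share the 4 bytes
    // 0x104..0x107; without the +1 the old lengths copied only 3 of them.
    assert(overlapBytes(0x100, 0x107, 0x104, 0x10b) == 4);
    return 0;
}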
diff --git a/src/mem/packet.hh b/src/mem/packet.hh
index cb97dd036..2bc51bf12 100644
--- a/src/mem/packet.hh
+++ b/src/mem/packet.hh
@@ -344,6 +344,13 @@ class Packet
srcValid = false;
}
+
+ void toggleData() {
+ int icmd = (int)cmd;
+ icmd ^= HasData;
+ cmd = (Command)icmd;
+ }
+
/**
* Take a request packet and modify it in place to be suitable for
* returning as a response to that request.
@@ -448,7 +455,6 @@ class Packet
bool intersect(PacketPtr p);
};
-
/** This function given a functional packet and a timing packet either satisfies
* the timing packet, or updates the timing packet to reflect the updated state
* in the timing packet. It returns if the functional packet should continue to
@@ -456,6 +462,12 @@ class Packet
*/
bool fixPacket(PacketPtr func, PacketPtr timing);
+/** This function is a wrapper around fixPacket() that toggles the hasData bit.
+ * It is used when a response is waiting in the caches but hasn't been marked as
+ * a response yet, so fixPacket() sees the correct value of hasData.
+ */
+bool fixDelayedResponsePacket(PacketPtr func, PacketPtr timing);
+
std::ostream & operator<<(std::ostream &o, const Packet &p);
#endif //__MEM_PACKET_HH
diff --git a/src/mem/physical.cc b/src/mem/physical.cc
index 39eb63108..94f60ad80 100644
--- a/src/mem/physical.cc
+++ b/src/mem/physical.cc
@@ -288,6 +288,21 @@ PhysicalMemory::MemoryPort::recvAtomic(PacketPtr pkt)
void
PhysicalMemory::MemoryPort::recvFunctional(PacketPtr pkt)
{
+    //Since we are overriding the function, make sure the check for
+    //functional accesses against the transmit list is implemented here.
+ std::list<std::pair<Tick,PacketPtr> >::iterator i = transmitList.begin();
+ std::list<std::pair<Tick,PacketPtr> >::iterator end = transmitList.end();
+ bool notDone = true;
+
+ while (i != end && notDone) {
+ PacketPtr target = i->second;
+ // If the target contains data, and it overlaps the
+ // probed request, need to update data
+ if (target->intersect(pkt))
+ notDone = fixPacket(pkt, target);
+ i++;
+ }
+
// Default implementation of SimpleTimingPort::recvFunctional()
// calls recvAtomic() and throws away the latency; we can save a
// little here by just not calculating the latency.
diff --git a/src/mem/tport.cc b/src/mem/tport.cc
index 086d91279..c43c9aac0 100644
--- a/src/mem/tport.cc
+++ b/src/mem/tport.cc
@@ -35,14 +35,14 @@ SimpleTimingPort::recvFunctional(PacketPtr pkt)
{
std::list<std::pair<Tick,PacketPtr> >::iterator i = transmitList.begin();
std::list<std::pair<Tick,PacketPtr> >::iterator end = transmitList.end();
- bool done = false;
+ bool notDone = true;
- while (i != end && !done) {
+ while (i != end && notDone) {
PacketPtr target = i->second;
// If the target contains data, and it overlaps the
// probed request, need to update data
if (target->intersect(pkt))
- done = fixPacket(pkt, target);
+ notDone = fixPacket(pkt, target);
i++;
}
@@ -118,8 +118,14 @@ SimpleTimingPort::sendTiming(PacketPtr pkt, Tick time)
bool done = false;
while (i != end && !done) {
- if (time+curTick < i->first)
+ if (time+curTick < i->first) {
+ if (i == transmitList.begin()) {
+            //Inserting at beginning, reschedule
+ sendEvent.reschedule(time+curTick);
+ }
transmitList.insert(i,std::pair<Tick,PacketPtr>(time+curTick,pkt));
+ done = true;
+ }
i++;
}
}
diff --git a/src/python/SConscript b/src/python/SConscript
index c9e713199..5c351c32a 100644
--- a/src/python/SConscript
+++ b/src/python/SConscript
@@ -98,12 +98,12 @@ pyzip_files.append('m5/defines.py')
pyzip_files.append('m5/info.py')
pyzip_files.append(join(env['ROOT'], 'util/pbs/jobfile.py'))
-env.Command(['swig/cc_main_wrap.cc', 'm5/cc_main.py'],
- 'swig/cc_main.i',
+env.Command(['swig/main_wrap.cc', 'm5/internal/main.py'],
+ 'swig/main.i',
'$SWIG $SWIGFLAGS -outdir ${TARGETS[1].dir} '
'-o ${TARGETS[0]} $SOURCES')
-pyzip_dep_files.append('m5/cc_main.py')
+pyzip_dep_files.append('m5/internal/main.py')
# Action function to build the zip archive. Uses the PyZipFile module
# included in the standard Python library.
diff --git a/src/python/m5/SimObject.py b/src/python/m5/SimObject.py
index 18b3fff55..934358298 100644
--- a/src/python/m5/SimObject.py
+++ b/src/python/m5/SimObject.py
@@ -695,7 +695,7 @@ class SimObject(object):
def getCCObject(self):
if not self._ccObject:
self._ccObject = -1 # flag to catch cycles in recursion
- self._ccObject = cc_main.createSimObject(self.path())
+ self._ccObject = internal.main.createSimObject(self.path())
elif self._ccObject == -1:
raise RuntimeError, "%s: recursive call to getCCObject()" \
% self.path()
@@ -730,13 +730,13 @@ class SimObject(object):
# i don't know if there's a better way to do this - calling
# setMemoryMode directly from self._ccObject results in calling
# SimObject::setMemoryMode, not the System::setMemoryMode
- system_ptr = cc_main.convertToSystemPtr(self._ccObject)
+ system_ptr = internal.main.convertToSystemPtr(self._ccObject)
system_ptr.setMemoryMode(mode)
for child in self._children.itervalues():
child.changeTiming(mode)
def takeOverFrom(self, old_cpu):
- cpu_ptr = cc_main.convertToBaseCPUPtr(old_cpu._ccObject)
+ cpu_ptr = internal.main.convertToBaseCPUPtr(old_cpu._ccObject)
self._ccObject.takeOverFrom(cpu_ptr)
# generate output file for 'dot' to display as a pretty graph.
@@ -795,8 +795,7 @@ def resolveSimObject(name):
# short to avoid polluting other namespaces.
__all__ = ['SimObject', 'ParamContext']
-
# see comment on imports at end of __init__.py.
import proxy
-import cc_main
+import internal
import m5
diff --git a/src/python/m5/__init__.py b/src/python/m5/__init__.py
index 579562b38..f39cc670a 100644
--- a/src/python/m5/__init__.py
+++ b/src/python/m5/__init__.py
@@ -30,11 +30,11 @@
import atexit, os, sys
# import the SWIG-wrapped main C++ functions
-import cc_main
+import internal
# import a few SWIG-wrapped items (those that are likely to be used
# directly by user scripts) completely into this module for
# convenience
-from cc_main import simulate, SimLoopExitEvent
+from internal.main import simulate, SimLoopExitEvent
# import the m5 compile options
import defines
@@ -85,10 +85,10 @@ def instantiate(root):
root.print_ini()
sys.stdout.close() # close config.ini
sys.stdout = sys.__stdout__ # restore to original
- cc_main.loadIniFile(resolveSimObject) # load config.ini into C++
+ internal.main.loadIniFile(resolveSimObject) # load config.ini into C++
root.createCCObject()
root.connectPorts()
- cc_main.finalInit()
+ internal.main.finalInit()
noDot = True # temporary until we fix dot
if not noDot:
dot = pydot.Dot()
@@ -102,10 +102,10 @@ def instantiate(root):
# Export curTick to user script.
def curTick():
- return cc_main.cvar.curTick
+ return internal.main.cvar.curTick
# register our C++ exit callback function with Python
-atexit.register(cc_main.doExitCleanup)
+atexit.register(internal.main.doExitCleanup)
# This loops until all objects have been fully drained.
def doDrain(root):
@@ -119,7 +119,7 @@ def doDrain(root):
# be drained.
def drain(root):
all_drained = False
- drain_event = cc_main.createCountedDrain()
+ drain_event = internal.main.createCountedDrain()
unready_objects = root.startDrain(drain_event, True)
# If we've got some objects that can't drain immediately, then simulate
if unready_objects > 0:
@@ -127,7 +127,7 @@ def drain(root):
simulate()
else:
all_drained = True
- cc_main.cleanupCountedDrain(drain_event)
+ internal.main.cleanupCountedDrain(drain_event)
return all_drained
def resume(root):
@@ -138,12 +138,12 @@ def checkpoint(root, dir):
raise TypeError, "Object is not a root object. Checkpoint must be called on a root object."
doDrain(root)
print "Writing checkpoint"
- cc_main.serializeAll(dir)
+ internal.main.serializeAll(dir)
resume(root)
def restoreCheckpoint(root, dir):
print "Restoring from checkpoint"
- cc_main.unserializeAll(dir)
+ internal.main.unserializeAll(dir)
resume(root)
def changeToAtomic(system):
@@ -152,7 +152,7 @@ def changeToAtomic(system):
"called on a root object."
doDrain(system)
print "Changing memory mode to atomic"
- system.changeTiming(cc_main.SimObject.Atomic)
+ system.changeTiming(internal.main.SimObject.Atomic)
def changeToTiming(system):
if not isinstance(system, objects.Root) and not isinstance(system, objects.System):
@@ -160,7 +160,7 @@ def changeToTiming(system):
"called on a root object."
doDrain(system)
print "Changing memory mode to timing"
- system.changeTiming(cc_main.SimObject.Timing)
+ system.changeTiming(internal.main.SimObject.Timing)
def switchCpus(cpuList):
print "switching cpus"
@@ -180,7 +180,7 @@ def switchCpus(cpuList):
raise TypeError, "%s is not of type BaseCPU" % cpu
# Drain all of the individual CPUs
- drain_event = cc_main.createCountedDrain()
+ drain_event = internal.main.createCountedDrain()
unready_cpus = 0
for old_cpu in old_cpus:
unready_cpus += old_cpu.startDrain(drain_event, False)
@@ -188,7 +188,7 @@ def switchCpus(cpuList):
if unready_cpus > 0:
drain_event.setCount(unready_cpus)
simulate()
- cc_main.cleanupCountedDrain(drain_event)
+ internal.main.cleanupCountedDrain(drain_event)
# Now all of the CPUs are ready to be switched out
for old_cpu in old_cpus:
old_cpu._ccObject.switchOut()
diff --git a/src/python/m5/main.py b/src/python/m5/main.py
index ef37f62ac..1e224c0cf 100644
--- a/src/python/m5/main.py
+++ b/src/python/m5/main.py
@@ -211,7 +211,7 @@ def parse_args():
return opts,args
def main():
- import cc_main
+ import internal
parse_args()
@@ -249,7 +249,7 @@ def main():
print "M5 Simulator System"
print brief_copyright
print
- print "M5 compiled %s" % cc_main.cvar.compileDate;
+ print "M5 compiled %s" % internal.main.cvar.compileDate;
print "M5 started %s" % datetime.now().ctime()
print "M5 executing on %s" % socket.gethostname()
print "command line:",
@@ -264,7 +264,7 @@ def main():
usage(2)
# tell C++ about output directory
- cc_main.setOutputDir(options.outdir)
+ internal.main.setOutputDir(options.outdir)
# update the system path with elements from the -p option
sys.path[0:0] = options.path
diff --git a/src/python/m5/objects/BaseCPU.py b/src/python/m5/objects/BaseCPU.py
index b6e05627d..2f702a4bf 100644
--- a/src/python/m5/objects/BaseCPU.py
+++ b/src/python/m5/objects/BaseCPU.py
@@ -15,6 +15,12 @@ class BaseCPU(SimObject):
cpu_id = Param.Int("CPU identifier")
if build_env['FULL_SYSTEM']:
+ do_quiesce = Param.Bool(True, "enable quiesce instructions")
+ do_checkpoint_insts = Param.Bool(True,
+ "enable checkpoint pseudo instructions")
+ do_statistics_insts = Param.Bool(True,
+ "enable statistics pseudo instructions")
+
if build_env['TARGET_ISA'] == 'sparc':
dtb = Param.SparcDTB(SparcDTB(), "Data TLB")
itb = Param.SparcITB(SparcITB(), "Instruction TLB")
diff --git a/src/python/m5/params.py b/src/python/m5/params.py
index 4b5953bcb..9e5f985c3 100644
--- a/src/python/m5/params.py
+++ b/src/python/m5/params.py
@@ -830,8 +830,9 @@ class PortRef(object):
if self.ccConnected: # already done this
return
peer = self.peer
- cc_main.connectPorts(self.simobj.getCCObject(), self.name, self.index,
- peer.simobj.getCCObject(), peer.name, peer.index)
+ internal.main.connectPorts(self.simobj.getCCObject(), self.name,
+ self.index, peer.simobj.getCCObject(),
+ peer.name, peer.index)
self.ccConnected = True
peer.ccConnected = True
@@ -970,4 +971,4 @@ __all__ = ['Param', 'VectorParam',
from SimObject import isSimObject, isSimObjectSequence, isSimObjectClass
import proxy
import objects
-import cc_main
+import internal
diff --git a/src/sim/main.cc b/src/sim/main.cc
index 5b44102a8..6037283a4 100644
--- a/src/sim/main.cc
+++ b/src/sim/main.cc
@@ -117,7 +117,9 @@ abortHandler(int sigtype)
#endif
}
-extern "C" { void init_cc_main(); }
+extern "C" {
+void init_main();
+}
int
main(int argc, char **argv)
@@ -155,8 +157,8 @@ main(int argc, char **argv)
Py_Initialize();
PySys_SetArgv(argc, argv);
- // initialize SWIG 'cc_main' module
- init_cc_main();
+ // initialize SWIG 'm5.internal.main' module
+ init_main();
PyRun_SimpleString("import m5.main");
PyRun_SimpleString("m5.main.main()");
diff --git a/src/sim/param.hh b/src/sim/param.hh
index 1bc55c125..2aa0456da 100644
--- a/src/sim/param.hh
+++ b/src/sim/param.hh
@@ -242,6 +242,8 @@ class Param : public BaseParam
return value;
}
+ T returnValue() const { return value; }
+
// display value to stream
virtual void showValue(std::ostream &os) const;
diff --git a/src/sim/pseudo_inst.cc b/src/sim/pseudo_inst.cc
index 66036def1..4a8c0eb66 100644
--- a/src/sim/pseudo_inst.cc
+++ b/src/sim/pseudo_inst.cc
@@ -40,7 +40,6 @@
#include "cpu/thread_context.hh"
#include "cpu/quiesce_event.hh"
#include "arch/kernel_stats.hh"
-#include "sim/param.hh"
#include "sim/pseudo_inst.hh"
#include "sim/serialize.hh"
#include "sim/sim_exit.hh"
@@ -57,10 +56,6 @@ using namespace TheISA;
namespace AlphaPseudo
{
- bool doStatisticsInsts;
- bool doCheckpointInsts;
- bool doQuiesce;
-
void
arm(ThreadContext *tc)
{
@@ -71,7 +66,7 @@ namespace AlphaPseudo
void
quiesce(ThreadContext *tc)
{
- if (!doQuiesce)
+ if (!tc->getCpuPtr()->params->do_quiesce)
return;
DPRINTF(Quiesce, "%s: quiesce()\n", tc->getCpuPtr()->name());
@@ -84,7 +79,7 @@ namespace AlphaPseudo
void
quiesceNs(ThreadContext *tc, uint64_t ns)
{
- if (!doQuiesce || ns == 0)
+ if (!tc->getCpuPtr()->params->do_quiesce || ns == 0)
return;
EndQuiesceEvent *quiesceEvent = tc->getQuiesceEvent();
@@ -107,7 +102,7 @@ namespace AlphaPseudo
void
quiesceCycles(ThreadContext *tc, uint64_t cycles)
{
- if (!doQuiesce || cycles == 0)
+ if (!tc->getCpuPtr()->params->do_quiesce || cycles == 0)
return;
EndQuiesceEvent *quiesceEvent = tc->getQuiesceEvent();
@@ -197,7 +192,7 @@ namespace AlphaPseudo
void
resetstats(ThreadContext *tc, Tick delay, Tick period)
{
- if (!doStatisticsInsts)
+ if (!tc->getCpuPtr()->params->do_statistics_insts)
return;
@@ -211,7 +206,7 @@ namespace AlphaPseudo
void
dumpstats(ThreadContext *tc, Tick delay, Tick period)
{
- if (!doStatisticsInsts)
+ if (!tc->getCpuPtr()->params->do_statistics_insts)
return;
@@ -252,7 +247,7 @@ namespace AlphaPseudo
void
dumpresetstats(ThreadContext *tc, Tick delay, Tick period)
{
- if (!doStatisticsInsts)
+ if (!tc->getCpuPtr()->params->do_statistics_insts)
return;
@@ -266,7 +261,7 @@ namespace AlphaPseudo
void
m5checkpoint(ThreadContext *tc, Tick delay, Tick period)
{
- if (!doCheckpointInsts)
+ if (!tc->getCpuPtr()->params->do_checkpoint_insts)
return;
Tick when = curTick + delay * Clock::Int::ns;
@@ -278,7 +273,7 @@ namespace AlphaPseudo
uint64_t
readfile(ThreadContext *tc, Addr vaddr, uint64_t len, uint64_t offset)
{
- const string &file = tc->getCpuPtr()->system->params()->readfile;
+ const string &file = tc->getSystemPtr()->params()->readfile;
if (file.empty()) {
return ULL(0);
}
@@ -310,33 +305,6 @@ namespace AlphaPseudo
return result;
}
- class Context : public ParamContext
- {
- public:
- Context(const string &section) : ParamContext(section) {}
- void checkParams();
- };
-
- Context context("pseudo_inst");
-
- Param<bool> __quiesce(&context, "quiesce",
- "enable quiesce instructions",
- true);
- Param<bool> __statistics(&context, "statistics",
- "enable statistics pseudo instructions",
- true);
- Param<bool> __checkpoint(&context, "checkpoint",
- "enable checkpoint pseudo instructions",
- true);
-
- void
- Context::checkParams()
- {
- doQuiesce = __quiesce;
- doStatisticsInsts = __statistics;
- doCheckpointInsts = __checkpoint;
- }
-
void debugbreak(ThreadContext *tc)
{
debug_break();
diff --git a/tests/long/00.gzip/test.py b/tests/long/00.gzip/test.py
index 7a74a0b0a..5c33376bd 100644
--- a/tests/long/00.gzip/test.py
+++ b/tests/long/00.gzip/test.py
@@ -26,5 +26,6 @@
#
# Authors: Korey Sewell
-root.system.cpu.workload = LiveProcess(cmd = 'gzip smred.log 1',
- executable = binpath('gzip'))
+process = LiveProcess(executable = binpath('gzip'))
+process.cmd = 'gzip ' + inputpath('gzip', 'smred.log') + ' 1'
+root.system.cpu.workload = process
diff --git a/tests/long/10.mcf/test.py b/tests/long/10.mcf/test.py
index af2536c7e..36d077c96 100644
--- a/tests/long/10.mcf/test.py
+++ b/tests/long/10.mcf/test.py
@@ -26,5 +26,6 @@
#
# Authors: Korey Sewell
-root.system.cpu.workload = LiveProcess(cmd = 'mcf lgred.in',
- executable = binpath('mcf'))
+process = LiveProcess(executable = binpath('mcf'))
+process.cmd = 'mcf ' + inputpath('mcf', 'lgred.in')
+root.system.cpu.workload = process
diff --git a/tests/long/20.parser/test.py b/tests/long/20.parser/test.py
index 0b142db25..760908722 100644
--- a/tests/long/20.parser/test.py
+++ b/tests/long/20.parser/test.py
@@ -26,5 +26,7 @@
#
# Authors: Korey Sewell
-root.system.cpu.workload = LiveProcess(cmd = 'parser 2.1.dict -batch < lgred.in',
- executable = binpath('parser'))
+process = LiveProcess(executable = binpath('parser'))
+process.cmd = 'parser 2.1.dict -batch'
+process.input = inputpath('parser', 'lgred.in')
+root.system.cpu.workload = process
diff --git a/tests/long/30.eon/test.py b/tests/long/30.eon/test.py
index b9f0c2b51..d6bf3bb76 100644
--- a/tests/long/30.eon/test.py
+++ b/tests/long/30.eon/test.py
@@ -26,4 +26,10 @@
#
# Authors: Korey Sewell
-root.system.cpu.workload = LiveProcess(cmd = 'eon chair.control.cook chair.camera chair.surfaces chair.cook.ppm ppm pixels_out.cook',executable = binpath('eon'))
+process = LiveProcess(executable = binpath('eon'))
+process.cmd = 'eon ' + inputpath('eon', 'chair.control.cook') + ' ' + \
+              inputpath('eon', 'chair.camera') + ' ' + \
+              inputpath('eon', 'chair.surfaces') + ' ' + \
+              inputpath('eon', 'chair.cook.ppm') + \
+              ' ppm pixels_out.cook'
+root.system.cpu.workload = process
diff --git a/tests/long/40.perlbmk/test.py b/tests/long/40.perlbmk/test.py
index b5cd17251..81c36bab3 100644
--- a/tests/long/40.perlbmk/test.py
+++ b/tests/long/40.perlbmk/test.py
@@ -26,5 +26,6 @@
#
# Authors: Korey Sewell
-root.system.cpu.workload = LiveProcess(cmd = 'perlbmk -I./lib lgred.makerand.pl',
- executable = binpath('perlbmk'))
+process = LiveProcess(executable = binpath('perlbmk'))
+process.cmd = 'perlbmk -I./lib ' + inputpath('perlbmk', 'lgred.makerand.pl')
+root.system.cpu.workload = process
diff --git a/tests/long/50.vortex/test.py b/tests/long/50.vortex/test.py
index f531b8ac8..f6d1e03df 100644
--- a/tests/long/50.vortex/test.py
+++ b/tests/long/50.vortex/test.py
@@ -26,5 +26,6 @@
#
# Authors: Korey Sewell
-root.system.cpu.workload = LiveProcess(cmd = 'vortex smred.raw',
- executable = binpath('vortex'))
+process = LiveProcess(executable = binpath('vortex'))
+process.cmd = 'vortex ' + inputpath('smred.raw')
+root.system.cpu.workload = process
diff --git a/tests/long/60.bzip2/test.py b/tests/long/60.bzip2/test.py
index 3f16efa09..e96d64656 100644
--- a/tests/long/60.bzip2/test.py
+++ b/tests/long/60.bzip2/test.py
@@ -26,5 +26,6 @@
#
# Authors: Korey Sewell
-root.system.cpu.workload = LiveProcess(cmd = 'bzip2 lgred.source',
- executable = binpath('bzip2'))
+process = LiveProcess(executable = binpath('bzip2'))
+process.cmd = 'bzip2 ' + inputpath('bzip2', 'lgred.source')
+root.system.cpu.workload = process
diff --git a/tests/long/70.twolf/test.py b/tests/long/70.twolf/test.py
index 4ec7a3d03..be7a04f97 100644
--- a/tests/long/70.twolf/test.py
+++ b/tests/long/70.twolf/test.py
@@ -26,5 +26,6 @@
#
# Authors: Korey Sewell
-root.system.cpu.workload = LiveProcess(cmd = 'twolf smred/smred',
- executable = binpath('twolf'))
+process = LiveProcess(executable = binpath('twolf'))
+process.cmd = 'twolf ' + inputpath('twolf', 'smred/smred')
+root.system.cpu.workload = process
diff --git a/tests/quick/00.hello/ref/alpha/linux/o3-timing/m5stats.txt b/tests/quick/00.hello/ref/alpha/linux/o3-timing/m5stats.txt
index 0426166d9..d34c19255 100644
--- a/tests/quick/00.hello/ref/alpha/linux/o3-timing/m5stats.txt
+++ b/tests/quick/00.hello/ref/alpha/linux/o3-timing/m5stats.txt
@@ -1,40 +1,40 @@
---------- Begin Simulation Statistics ----------
global.BPredUnit.BTBCorrect 0 # Number of correct BTB predictions (this stat may not work properly.
-global.BPredUnit.BTBHits 682 # Number of BTB hits
-global.BPredUnit.BTBLookups 2437 # Number of BTB lookups
+global.BPredUnit.BTBHits 675 # Number of BTB hits
+global.BPredUnit.BTBLookups 2343 # Number of BTB lookups
global.BPredUnit.RASInCorrect 76 # Number of incorrect RAS predictions.
-global.BPredUnit.condIncorrect 443 # Number of conditional branches incorrect
-global.BPredUnit.condPredicted 1570 # Number of conditional branches predicted
-global.BPredUnit.lookups 5322 # Number of BP lookups
-global.BPredUnit.usedRAS 2820 # Number of times the RAS was used to get a target.
-host_inst_rate 9098 # Simulator instruction rate (inst/s)
-host_mem_usage 180112 # Number of bytes of host memory used
-host_seconds 0.62 # Real time elapsed on the host
-host_tick_rate 2277354 # Simulator tick rate (ticks/s)
-memdepunit.memDep.conflictingLoads 27 # Number of conflicting loads.
-memdepunit.memDep.conflictingStores 144 # Number of conflicting stores.
-memdepunit.memDep.insertedLoads 3819 # Number of loads inserted to the mem dependence unit.
-memdepunit.memDep.insertedStores 3727 # Number of stores inserted to the mem dependence unit.
+global.BPredUnit.condIncorrect 437 # Number of conditional branches incorrect
+global.BPredUnit.condPredicted 1563 # Number of conditional branches predicted
+global.BPredUnit.lookups 5229 # Number of BP lookups
+global.BPredUnit.usedRAS 2821 # Number of times the RAS was used to get a target.
+host_inst_rate 15039 # Simulator instruction rate (inst/s)
+host_mem_usage 180156 # Number of bytes of host memory used
+host_seconds 0.37 # Real time elapsed on the host
+host_tick_rate 3741816 # Simulator tick rate (ticks/s)
+memdepunit.memDep.conflictingLoads 23 # Number of conflicting loads.
+memdepunit.memDep.conflictingStores 117 # Number of conflicting stores.
+memdepunit.memDep.insertedLoads 3775 # Number of loads inserted to the mem dependence unit.
+memdepunit.memDep.insertedStores 3734 # Number of stores inserted to the mem dependence unit.
sim_freq 1000000000000 # Frequency of simulated ticks
sim_insts 5623 # Number of instructions simulated
sim_seconds 0.000001 # Number of seconds simulated
-sim_ticks 1408131 # Number of ticks simulated
+sim_ticks 1400135 # Number of ticks simulated
system.cpu.commit.COM:branches 862 # Number of branches committed
-system.cpu.commit.COM:bw_lim_events 94 # number cycles where commit BW limit reached
+system.cpu.commit.COM:bw_lim_events 97 # number cycles where commit BW limit reached
system.cpu.commit.COM:bw_limited 0 # number of insts not committed due to BW limits
system.cpu.commit.COM:committed_per_cycle.start_dist # Number of insts commited each cycle
-system.cpu.commit.COM:committed_per_cycle.samples 58722
+system.cpu.commit.COM:committed_per_cycle.samples 51243
system.cpu.commit.COM:committed_per_cycle.min_value 0
- 0 56096 9552.81%
- 1 1495 254.59%
- 2 457 77.82%
- 3 225 38.32%
- 4 133 22.65%
- 5 92 15.67%
- 6 98 16.69%
- 7 32 5.45%
- 8 94 16.01%
+ 0 48519 9468.42%
+ 1 1590 310.29%
+ 2 483 94.26%
+ 3 227 44.30%
+ 4 131 25.56%
+ 5 104 20.30%
+ 6 61 11.90%
+ 7 31 6.05%
+ 8 97 18.93%
system.cpu.commit.COM:committed_per_cycle.max_value 8
system.cpu.commit.COM:committed_per_cycle.end_dist
@@ -43,69 +43,69 @@ system.cpu.commit.COM:loads 979 # Nu
system.cpu.commit.COM:membars 0 # Number of memory barriers committed
system.cpu.commit.COM:refs 1791 # Number of memory references committed
system.cpu.commit.COM:swp_count 0 # Number of s/w prefetches committed
-system.cpu.commit.branchMispredicts 374 # The number of times a branch was mispredicted
+system.cpu.commit.branchMispredicts 368 # The number of times a branch was mispredicted
system.cpu.commit.commitCommittedInsts 5640 # The number of committed instructions
system.cpu.commit.commitNonSpecStalls 17 # The number of times commit has been forced to stall to communicate backwards
-system.cpu.commit.commitSquashedInsts 13826 # The number of squashed insts skipped by commit
+system.cpu.commit.commitSquashedInsts 13830 # The number of squashed insts skipped by commit
system.cpu.committedInsts 5623 # Number of Instructions Simulated
system.cpu.committedInsts_total 5623 # Number of Instructions Simulated
-system.cpu.cpi 250.423439 # CPI: Cycles Per Instruction
-system.cpu.cpi_total 250.423439 # CPI: Total CPI of All Threads
-system.cpu.dcache.ReadReq_accesses 1597 # number of ReadReq accesses(hits+misses)
-system.cpu.dcache.ReadReq_avg_miss_latency 6940.988166 # average ReadReq miss latency
-system.cpu.dcache.ReadReq_avg_mshr_miss_latency 6843.030303 # average ReadReq mshr miss latency
-system.cpu.dcache.ReadReq_hits 1428 # number of ReadReq hits
-system.cpu.dcache.ReadReq_miss_latency 1173027 # number of ReadReq miss cycles
-system.cpu.dcache.ReadReq_miss_rate 0.105823 # miss rate for ReadReq accesses
-system.cpu.dcache.ReadReq_misses 169 # number of ReadReq misses
-system.cpu.dcache.ReadReq_mshr_hits 70 # number of ReadReq MSHR hits
-system.cpu.dcache.ReadReq_mshr_miss_latency 677460 # number of ReadReq MSHR miss cycles
-system.cpu.dcache.ReadReq_mshr_miss_rate 0.061991 # mshr miss rate for ReadReq accesses
+system.cpu.cpi 249.001423 # CPI: Cycles Per Instruction
+system.cpu.cpi_total 249.001423 # CPI: Total CPI of All Threads
+system.cpu.dcache.ReadReq_accesses 1600 # number of ReadReq accesses(hits+misses)
+system.cpu.dcache.ReadReq_avg_miss_latency 6986.684848 # average ReadReq miss latency
+system.cpu.dcache.ReadReq_avg_mshr_miss_latency 6882.626263 # average ReadReq mshr miss latency
+system.cpu.dcache.ReadReq_hits 1435 # number of ReadReq hits
+system.cpu.dcache.ReadReq_miss_latency 1152803 # number of ReadReq miss cycles
+system.cpu.dcache.ReadReq_miss_rate 0.103125 # miss rate for ReadReq accesses
+system.cpu.dcache.ReadReq_misses 165 # number of ReadReq misses
+system.cpu.dcache.ReadReq_mshr_hits 66 # number of ReadReq MSHR hits
+system.cpu.dcache.ReadReq_mshr_miss_latency 681380 # number of ReadReq MSHR miss cycles
+system.cpu.dcache.ReadReq_mshr_miss_rate 0.061875 # mshr miss rate for ReadReq accesses
system.cpu.dcache.ReadReq_mshr_misses 99 # number of ReadReq MSHR misses
system.cpu.dcache.WriteReq_accesses 812 # number of WriteReq accesses(hits+misses)
-system.cpu.dcache.WriteReq_avg_miss_latency 5305.074803 # average WriteReq miss latency
-system.cpu.dcache.WriteReq_avg_mshr_miss_latency 5141.328767 # average WriteReq mshr miss latency
+system.cpu.dcache.WriteReq_avg_miss_latency 5293.047244 # average WriteReq miss latency
+system.cpu.dcache.WriteReq_avg_mshr_miss_latency 5141.082192 # average WriteReq mshr miss latency
system.cpu.dcache.WriteReq_hits 558 # number of WriteReq hits
-system.cpu.dcache.WriteReq_miss_latency 1347489 # number of WriteReq miss cycles
+system.cpu.dcache.WriteReq_miss_latency 1344434 # number of WriteReq miss cycles
system.cpu.dcache.WriteReq_miss_rate 0.312808 # miss rate for WriteReq accesses
system.cpu.dcache.WriteReq_misses 254 # number of WriteReq misses
system.cpu.dcache.WriteReq_mshr_hits 181 # number of WriteReq MSHR hits
-system.cpu.dcache.WriteReq_mshr_miss_latency 375317 # number of WriteReq MSHR miss cycles
+system.cpu.dcache.WriteReq_mshr_miss_latency 375299 # number of WriteReq MSHR miss cycles
system.cpu.dcache.WriteReq_mshr_miss_rate 0.089901 # mshr miss rate for WriteReq accesses
system.cpu.dcache.WriteReq_mshr_misses 73 # number of WriteReq MSHR misses
system.cpu.dcache.avg_blocked_cycles_no_mshrs <err: div-0> # average number of cycles each access was blocked
-system.cpu.dcache.avg_blocked_cycles_no_targets 3389.604651 # average number of cycles each access was blocked
-system.cpu.dcache.avg_refs 11.546512 # Average number of references to valid blocks.
+system.cpu.dcache.avg_blocked_cycles_no_targets 3366.651163 # average number of cycles each access was blocked
+system.cpu.dcache.avg_refs 11.587209 # Average number of references to valid blocks.
system.cpu.dcache.blocked_no_mshrs 0 # number of cycles access was blocked
system.cpu.dcache.blocked_no_targets 43 # number of cycles access was blocked
system.cpu.dcache.blocked_cycles_no_mshrs 0 # number of cycles access was blocked
-system.cpu.dcache.blocked_cycles_no_targets 145753 # number of cycles access was blocked
+system.cpu.dcache.blocked_cycles_no_targets 144766 # number of cycles access was blocked
system.cpu.dcache.cache_copies 0 # number of cache copies performed
-system.cpu.dcache.demand_accesses 2409 # number of demand (read+write) accesses
-system.cpu.dcache.demand_avg_miss_latency 5958.666667 # average overall miss latency
-system.cpu.dcache.demand_avg_mshr_miss_latency 6120.796512 # average overall mshr miss latency
-system.cpu.dcache.demand_hits 1986 # number of demand (read+write) hits
-system.cpu.dcache.demand_miss_latency 2520516 # number of demand (read+write) miss cycles
-system.cpu.dcache.demand_miss_rate 0.175592 # miss rate for demand accesses
-system.cpu.dcache.demand_misses 423 # number of demand (read+write) misses
-system.cpu.dcache.demand_mshr_hits 251 # number of demand (read+write) MSHR hits
-system.cpu.dcache.demand_mshr_miss_latency 1052777 # number of demand (read+write) MSHR miss cycles
-system.cpu.dcache.demand_mshr_miss_rate 0.071399 # mshr miss rate for demand accesses
+system.cpu.dcache.demand_accesses 2412 # number of demand (read+write) accesses
+system.cpu.dcache.demand_avg_miss_latency 5959.992840 # average overall miss latency
+system.cpu.dcache.demand_avg_mshr_miss_latency 6143.482558 # average overall mshr miss latency
+system.cpu.dcache.demand_hits 1993 # number of demand (read+write) hits
+system.cpu.dcache.demand_miss_latency 2497237 # number of demand (read+write) miss cycles
+system.cpu.dcache.demand_miss_rate 0.173715 # miss rate for demand accesses
+system.cpu.dcache.demand_misses 419 # number of demand (read+write) misses
+system.cpu.dcache.demand_mshr_hits 247 # number of demand (read+write) MSHR hits
+system.cpu.dcache.demand_mshr_miss_latency 1056679 # number of demand (read+write) MSHR miss cycles
+system.cpu.dcache.demand_mshr_miss_rate 0.071310 # mshr miss rate for demand accesses
system.cpu.dcache.demand_mshr_misses 172 # number of demand (read+write) MSHR misses
system.cpu.dcache.fast_writes 0 # number of fast writes performed
system.cpu.dcache.mshr_cap_events 0 # number of times MSHR cap was activated
system.cpu.dcache.no_allocate_misses 0 # Number of misses that were no-allocate
-system.cpu.dcache.overall_accesses 2409 # number of overall (read+write) accesses
-system.cpu.dcache.overall_avg_miss_latency 5958.666667 # average overall miss latency
-system.cpu.dcache.overall_avg_mshr_miss_latency 6120.796512 # average overall mshr miss latency
+system.cpu.dcache.overall_accesses 2412 # number of overall (read+write) accesses
+system.cpu.dcache.overall_avg_miss_latency 5959.992840 # average overall miss latency
+system.cpu.dcache.overall_avg_mshr_miss_latency 6143.482558 # average overall mshr miss latency
system.cpu.dcache.overall_avg_mshr_uncacheable_latency <err: div-0> # average overall mshr uncacheable latency
-system.cpu.dcache.overall_hits 1986 # number of overall hits
-system.cpu.dcache.overall_miss_latency 2520516 # number of overall miss cycles
-system.cpu.dcache.overall_miss_rate 0.175592 # miss rate for overall accesses
-system.cpu.dcache.overall_misses 423 # number of overall misses
-system.cpu.dcache.overall_mshr_hits 251 # number of overall MSHR hits
-system.cpu.dcache.overall_mshr_miss_latency 1052777 # number of overall MSHR miss cycles
-system.cpu.dcache.overall_mshr_miss_rate 0.071399 # mshr miss rate for overall accesses
+system.cpu.dcache.overall_hits 1993 # number of overall hits
+system.cpu.dcache.overall_miss_latency 2497237 # number of overall miss cycles
+system.cpu.dcache.overall_miss_rate 0.173715 # miss rate for overall accesses
+system.cpu.dcache.overall_misses 419 # number of overall misses
+system.cpu.dcache.overall_mshr_hits 247 # number of overall MSHR hits
+system.cpu.dcache.overall_mshr_miss_latency 1056679 # number of overall MSHR miss cycles
+system.cpu.dcache.overall_mshr_miss_rate 0.071310 # mshr miss rate for overall accesses
system.cpu.dcache.overall_mshr_misses 172 # number of overall MSHR misses
system.cpu.dcache.overall_mshr_uncacheable_latency 0 # number of overall MSHR uncacheable cycles
system.cpu.dcache.overall_mshr_uncacheable_misses 0 # number of overall MSHR uncacheable misses
@@ -121,89 +121,89 @@ system.cpu.dcache.prefetcher.num_hwpf_squashed_from_miss 0
system.cpu.dcache.replacements 0 # number of replacements
system.cpu.dcache.sampled_refs 172 # Sample count of references to valid blocks.
system.cpu.dcache.soft_prefetch_mshr_full 0 # number of mshr full events for SW prefetching instrutions
-system.cpu.dcache.tagsinuse 101.103948 # Cycle average of tags in use
-system.cpu.dcache.total_refs 1986 # Total number of references to valid blocks.
+system.cpu.dcache.tagsinuse 101.349720 # Cycle average of tags in use
+system.cpu.dcache.total_refs 1993 # Total number of references to valid blocks.
system.cpu.dcache.warmup_cycle 0 # Cycle when the warmup percentage was hit.
system.cpu.dcache.writebacks 0 # number of writebacks
-system.cpu.decode.DECODE:BlockedCycles 16535 # Number of cycles decode is blocked
+system.cpu.decode.DECODE:BlockedCycles 17501 # Number of cycles decode is blocked
system.cpu.decode.DECODE:BranchMispred 70 # Number of times decode detected a branch misprediction
-system.cpu.decode.DECODE:BranchResolved 167 # Number of times decode resolved a branch
-system.cpu.decode.DECODE:DecodedInsts 29787 # Number of instructions handled by decode
-system.cpu.decode.DECODE:IdleCycles 36497 # Number of cycles decode is idle
-system.cpu.decode.DECODE:RunCycles 5653 # Number of cycles decode is running
-system.cpu.decode.DECODE:SquashCycles 2641 # Number of cycles decode is squashing
+system.cpu.decode.DECODE:BranchResolved 168 # Number of times decode resolved a branch
+system.cpu.decode.DECODE:DecodedInsts 29666 # Number of instructions handled by decode
+system.cpu.decode.DECODE:IdleCycles 28130 # Number of cycles decode is idle
+system.cpu.decode.DECODE:RunCycles 5553 # Number of cycles decode is running
+system.cpu.decode.DECODE:SquashCycles 2529 # Number of cycles decode is squashing
system.cpu.decode.DECODE:SquashedInsts 200 # Number of squashed instructions handled by decode
-system.cpu.decode.DECODE:UnblockCycles 38 # Number of cycles decode is unblocking
-system.cpu.fetch.Branches 5322 # Number of branches that fetch encountered
-system.cpu.fetch.CacheLines 6542 # Number of cache lines fetched
-system.cpu.fetch.Cycles 21461 # Number of cycles fetch has run and was not squashing or blocked
-system.cpu.fetch.IcacheSquashes 388 # Number of outstanding Icache misses that were squashed
-system.cpu.fetch.Insts 35708 # Number of instructions fetch has processed
-system.cpu.fetch.SquashCycles 2149 # Number of cycles fetch has spent squashing
-system.cpu.fetch.branchRate 0.086728 # Number of branch fetches per cycle
-system.cpu.fetch.icacheStallCycles 6542 # Number of cycles fetch is stalled on an Icache miss
-system.cpu.fetch.predictedBranches 3502 # Number of branches that fetch has predicted taken
-system.cpu.fetch.rate 0.581905 # Number of inst fetches per cycle
+system.cpu.decode.DECODE:UnblockCycles 60 # Number of cycles decode is unblocking
+system.cpu.fetch.Branches 5229 # Number of branches that fetch encountered
+system.cpu.fetch.CacheLines 6371 # Number of cache lines fetched
+system.cpu.fetch.Cycles 13322 # Number of cycles fetch has run and was not squashing or blocked
+system.cpu.fetch.IcacheSquashes 296 # Number of outstanding Icache misses that were squashed
+system.cpu.fetch.Insts 35572 # Number of instructions fetch has processed
+system.cpu.fetch.SquashCycles 2057 # Number of cycles fetch has spent squashing
+system.cpu.fetch.branchRate 0.097242 # Number of branch fetches per cycle
+system.cpu.fetch.icacheStallCycles 6371 # Number of cycles fetch is stalled on an Icache miss
+system.cpu.fetch.predictedBranches 3496 # Number of branches that fetch has predicted taken
+system.cpu.fetch.rate 0.661522 # Number of inst fetches per cycle
system.cpu.fetch.rateDist.start_dist # Number of instructions fetched each cycle (Total)
-system.cpu.fetch.rateDist.samples 61364
+system.cpu.fetch.rateDist.samples 53773
system.cpu.fetch.rateDist.min_value 0
- 0 54337 8854.87%
- 1 197 32.10%
- 2 585 95.33%
- 3 1433 233.52%
- 4 1461 238.09%
- 5 241 39.27%
- 6 330 53.78%
- 7 1227 199.95%
- 8 1553 253.08%
+ 0 46825 8707.90%
+ 1 199 37.01%
+ 2 504 93.73%
+ 3 1429 265.75%
+ 4 1462 271.88%
+ 5 245 45.56%
+ 6 322 59.88%
+ 7 1223 227.44%
+ 8 1564 290.85%
system.cpu.fetch.rateDist.max_value 8
system.cpu.fetch.rateDist.end_dist
-system.cpu.icache.ReadReq_accesses 6541 # number of ReadReq accesses(hits+misses)
-system.cpu.icache.ReadReq_avg_miss_latency 5110.042601 # average ReadReq miss latency
-system.cpu.icache.ReadReq_avg_mshr_miss_latency 4297.762058 # average ReadReq mshr miss latency
-system.cpu.icache.ReadReq_hits 6095 # number of ReadReq hits
-system.cpu.icache.ReadReq_miss_latency 2279079 # number of ReadReq miss cycles
-system.cpu.icache.ReadReq_miss_rate 0.068185 # miss rate for ReadReq accesses
+system.cpu.icache.ReadReq_accesses 6370 # number of ReadReq accesses(hits+misses)
+system.cpu.icache.ReadReq_avg_miss_latency 5088.614350 # average ReadReq miss latency
+system.cpu.icache.ReadReq_avg_mshr_miss_latency 4278.032258 # average ReadReq mshr miss latency
+system.cpu.icache.ReadReq_hits 5924 # number of ReadReq hits
+system.cpu.icache.ReadReq_miss_latency 2269522 # number of ReadReq miss cycles
+system.cpu.icache.ReadReq_miss_rate 0.070016 # miss rate for ReadReq accesses
system.cpu.icache.ReadReq_misses 446 # number of ReadReq misses
-system.cpu.icache.ReadReq_mshr_hits 135 # number of ReadReq MSHR hits
-system.cpu.icache.ReadReq_mshr_miss_latency 1336604 # number of ReadReq MSHR miss cycles
-system.cpu.icache.ReadReq_mshr_miss_rate 0.047546 # mshr miss rate for ReadReq accesses
-system.cpu.icache.ReadReq_mshr_misses 311 # number of ReadReq MSHR misses
+system.cpu.icache.ReadReq_mshr_hits 136 # number of ReadReq MSHR hits
+system.cpu.icache.ReadReq_mshr_miss_latency 1326190 # number of ReadReq MSHR miss cycles
+system.cpu.icache.ReadReq_mshr_miss_rate 0.048666 # mshr miss rate for ReadReq accesses
+system.cpu.icache.ReadReq_mshr_misses 310 # number of ReadReq MSHR misses
system.cpu.icache.avg_blocked_cycles_no_mshrs <err: div-0> # average number of cycles each access was blocked
-system.cpu.icache.avg_blocked_cycles_no_targets 3658.571429 # average number of cycles each access was blocked
-system.cpu.icache.avg_refs 19.598071 # Average number of references to valid blocks.
+system.cpu.icache.avg_blocked_cycles_no_targets 3444.375000 # average number of cycles each access was blocked
+system.cpu.icache.avg_refs 19.109677 # Average number of references to valid blocks.
system.cpu.icache.blocked_no_mshrs 0 # number of cycles access was blocked
-system.cpu.icache.blocked_no_targets 7 # number of cycles access was blocked
+system.cpu.icache.blocked_no_targets 8 # number of cycles access was blocked
system.cpu.icache.blocked_cycles_no_mshrs 0 # number of cycles access was blocked
-system.cpu.icache.blocked_cycles_no_targets 25610 # number of cycles access was blocked
+system.cpu.icache.blocked_cycles_no_targets 27555 # number of cycles access was blocked
system.cpu.icache.cache_copies 0 # number of cache copies performed
-system.cpu.icache.demand_accesses 6541 # number of demand (read+write) accesses
-system.cpu.icache.demand_avg_miss_latency 5110.042601 # average overall miss latency
-system.cpu.icache.demand_avg_mshr_miss_latency 4297.762058 # average overall mshr miss latency
-system.cpu.icache.demand_hits 6095 # number of demand (read+write) hits
-system.cpu.icache.demand_miss_latency 2279079 # number of demand (read+write) miss cycles
-system.cpu.icache.demand_miss_rate 0.068185 # miss rate for demand accesses
+system.cpu.icache.demand_accesses 6370 # number of demand (read+write) accesses
+system.cpu.icache.demand_avg_miss_latency 5088.614350 # average overall miss latency
+system.cpu.icache.demand_avg_mshr_miss_latency 4278.032258 # average overall mshr miss latency
+system.cpu.icache.demand_hits 5924 # number of demand (read+write) hits
+system.cpu.icache.demand_miss_latency 2269522 # number of demand (read+write) miss cycles
+system.cpu.icache.demand_miss_rate 0.070016 # miss rate for demand accesses
system.cpu.icache.demand_misses 446 # number of demand (read+write) misses
-system.cpu.icache.demand_mshr_hits 135 # number of demand (read+write) MSHR hits
-system.cpu.icache.demand_mshr_miss_latency 1336604 # number of demand (read+write) MSHR miss cycles
-system.cpu.icache.demand_mshr_miss_rate 0.047546 # mshr miss rate for demand accesses
-system.cpu.icache.demand_mshr_misses 311 # number of demand (read+write) MSHR misses
+system.cpu.icache.demand_mshr_hits 136 # number of demand (read+write) MSHR hits
+system.cpu.icache.demand_mshr_miss_latency 1326190 # number of demand (read+write) MSHR miss cycles
+system.cpu.icache.demand_mshr_miss_rate 0.048666 # mshr miss rate for demand accesses
+system.cpu.icache.demand_mshr_misses 310 # number of demand (read+write) MSHR misses
system.cpu.icache.fast_writes 0 # number of fast writes performed
system.cpu.icache.mshr_cap_events 0 # number of times MSHR cap was activated
system.cpu.icache.no_allocate_misses 0 # Number of misses that were no-allocate
-system.cpu.icache.overall_accesses 6541 # number of overall (read+write) accesses
-system.cpu.icache.overall_avg_miss_latency 5110.042601 # average overall miss latency
-system.cpu.icache.overall_avg_mshr_miss_latency 4297.762058 # average overall mshr miss latency
+system.cpu.icache.overall_accesses 6370 # number of overall (read+write) accesses
+system.cpu.icache.overall_avg_miss_latency 5088.614350 # average overall miss latency
+system.cpu.icache.overall_avg_mshr_miss_latency 4278.032258 # average overall mshr miss latency
system.cpu.icache.overall_avg_mshr_uncacheable_latency <err: div-0> # average overall mshr uncacheable latency
-system.cpu.icache.overall_hits 6095 # number of overall hits
-system.cpu.icache.overall_miss_latency 2279079 # number of overall miss cycles
-system.cpu.icache.overall_miss_rate 0.068185 # miss rate for overall accesses
+system.cpu.icache.overall_hits 5924 # number of overall hits
+system.cpu.icache.overall_miss_latency 2269522 # number of overall miss cycles
+system.cpu.icache.overall_miss_rate 0.070016 # miss rate for overall accesses
system.cpu.icache.overall_misses 446 # number of overall misses
-system.cpu.icache.overall_mshr_hits 135 # number of overall MSHR hits
-system.cpu.icache.overall_mshr_miss_latency 1336604 # number of overall MSHR miss cycles
-system.cpu.icache.overall_mshr_miss_rate 0.047546 # mshr miss rate for overall accesses
-system.cpu.icache.overall_mshr_misses 311 # number of overall MSHR misses
+system.cpu.icache.overall_mshr_hits 136 # number of overall MSHR hits
+system.cpu.icache.overall_mshr_miss_latency 1326190 # number of overall MSHR miss cycles
+system.cpu.icache.overall_mshr_miss_rate 0.048666 # mshr miss rate for overall accesses
+system.cpu.icache.overall_mshr_misses 310 # number of overall MSHR misses
system.cpu.icache.overall_mshr_uncacheable_latency 0 # number of overall MSHR uncacheable cycles
system.cpu.icache.overall_mshr_uncacheable_misses 0 # number of overall MSHR uncacheable misses
system.cpu.icache.prefetcher.num_hwpf_already_in_cache 0 # number of hwpf that were already in the cache
@@ -216,61 +216,61 @@ system.cpu.icache.prefetcher.num_hwpf_removed_MSHR_hit 0
system.cpu.icache.prefetcher.num_hwpf_span_page 0 # number of hwpf spanning a virtual page
system.cpu.icache.prefetcher.num_hwpf_squashed_from_miss 0 # number of hwpf that got squashed due to a miss aborting calculation time
system.cpu.icache.replacements 0 # number of replacements
-system.cpu.icache.sampled_refs 311 # Sample count of references to valid blocks.
+system.cpu.icache.sampled_refs 310 # Sample count of references to valid blocks.
system.cpu.icache.soft_prefetch_mshr_full 0 # number of mshr full events for SW prefetching instrutions
-system.cpu.icache.tagsinuse 147.733346 # Cycle average of tags in use
-system.cpu.icache.total_refs 6095 # Total number of references to valid blocks.
+system.cpu.icache.tagsinuse 147.070827 # Cycle average of tags in use
+system.cpu.icache.total_refs 5924 # Total number of references to valid blocks.
system.cpu.icache.warmup_cycle 0 # Cycle when the warmup percentage was hit.
system.cpu.icache.writebacks 0 # number of writebacks
-system.cpu.idleCycles 1346768 # Total number of cycles that the CPU has spent unscheduled due to idling
-system.cpu.iew.EXEC:branches 2391 # Number of branches executed
-system.cpu.iew.EXEC:nop 45 # number of nop insts executed
-system.cpu.iew.EXEC:rate 0.222997 # Inst execution rate
-system.cpu.iew.EXEC:refs 5561 # number of memory reference insts executed
-system.cpu.iew.EXEC:stores 2148 # Number of stores executed
+system.cpu.idleCycles 1346363 # Total number of cycles that the CPU has spent unscheduled due to idling
+system.cpu.iew.EXEC:branches 2364 # Number of branches executed
+system.cpu.iew.EXEC:nop 48 # number of nop insts executed
+system.cpu.iew.EXEC:rate 0.251650 # Inst execution rate
+system.cpu.iew.EXEC:refs 5460 # number of memory reference insts executed
+system.cpu.iew.EXEC:stores 2123 # Number of stores executed
system.cpu.iew.EXEC:swp 0 # number of swp insts executed
-system.cpu.iew.WB:consumers 6673 # num instructions consuming a value
-system.cpu.iew.WB:count 11743 # cumulative count of insts written-back
-system.cpu.iew.WB:fanout 0.790499 # average fanout of values written-back
+system.cpu.iew.WB:consumers 6466 # num instructions consuming a value
+system.cpu.iew.WB:count 11620 # cumulative count of insts written-back
+system.cpu.iew.WB:fanout 0.798639 # average fanout of values written-back
system.cpu.iew.WB:penalized 0 # number of instrctions required to write to 'other' IQ
system.cpu.iew.WB:penalized_rate 0 # fraction of instructions written-back that wrote to 'other' IQ
-system.cpu.iew.WB:producers 5275 # num instructions producing a value
-system.cpu.iew.WB:rate 0.191366 # insts written-back per cycle
-system.cpu.iew.WB:sent 11811 # cumulative count of insts sent to commit
-system.cpu.iew.branchMispredicts 404 # Number of branch mispredicts detected at execute
-system.cpu.iew.iewBlockCycles 6301 # Number of cycles IEW is blocking
-system.cpu.iew.iewDispLoadInsts 3819 # Number of dispatched load instructions
-system.cpu.iew.iewDispNonSpecInsts 23 # Number of dispatched non-speculative instructions
-system.cpu.iew.iewDispSquashedInsts 2540 # Number of squashed instructions skipped by dispatch
-system.cpu.iew.iewDispStoreInsts 3727 # Number of dispatched store instructions
-system.cpu.iew.iewDispatchedInsts 19466 # Number of instructions dispatched to IQ
-system.cpu.iew.iewExecLoadInsts 3413 # Number of load instructions executed
-system.cpu.iew.iewExecSquashedInsts 276 # Number of squashed instructions skipped in execute
-system.cpu.iew.iewExecutedInsts 13684 # Number of executed instructions
-system.cpu.iew.iewIQFullEvents 5 # Number of times the IQ has become full, causing a stall
+system.cpu.iew.WB:producers 5164 # num instructions producing a value
+system.cpu.iew.WB:rate 0.216094 # insts written-back per cycle
+system.cpu.iew.WB:sent 11692 # cumulative count of insts sent to commit
+system.cpu.iew.branchMispredicts 401 # Number of branch mispredicts detected at execute
+system.cpu.iew.iewBlockCycles 7230 # Number of cycles IEW is blocking
+system.cpu.iew.iewDispLoadInsts 3775 # Number of dispatched load instructions
+system.cpu.iew.iewDispNonSpecInsts 24 # Number of dispatched non-speculative instructions
+system.cpu.iew.iewDispSquashedInsts 2557 # Number of squashed instructions skipped by dispatch
+system.cpu.iew.iewDispStoreInsts 3734 # Number of dispatched store instructions
+system.cpu.iew.iewDispatchedInsts 19465 # Number of instructions dispatched to IQ
+system.cpu.iew.iewExecLoadInsts 3337 # Number of load instructions executed
+system.cpu.iew.iewExecSquashedInsts 308 # Number of squashed instructions skipped in execute
+system.cpu.iew.iewExecutedInsts 13532 # Number of executed instructions
+system.cpu.iew.iewIQFullEvents 10 # Number of times the IQ has become full, causing a stall
system.cpu.iew.iewIdleCycles 0 # Number of cycles IEW is idle
system.cpu.iew.iewLSQFullEvents 1 # Number of times the LSQ has become full, causing a stall
-system.cpu.iew.iewSquashCycles 2641 # Number of cycles IEW is squashing
-system.cpu.iew.iewUnblockCycles 34 # Number of cycles IEW is unblocking
+system.cpu.iew.iewSquashCycles 2529 # Number of cycles IEW is squashing
+system.cpu.iew.iewUnblockCycles 39 # Number of cycles IEW is unblocking
system.cpu.iew.lsq.thread.0.blockedLoads 1 # Number of blocked loads due to partial load-store forwarding
-system.cpu.iew.lsq.thread.0.cacheBlocked 1736 # Number of times an access to memory failed due to the cache being blocked
+system.cpu.iew.lsq.thread.0.cacheBlocked 1656 # Number of times an access to memory failed due to the cache being blocked
system.cpu.iew.lsq.thread.0.forwLoads 81 # Number of loads that had data forwarded from stores
system.cpu.iew.lsq.thread.0.ignoredResponses 3 # Number of memory responses ignored because the instruction is squashed
system.cpu.iew.lsq.thread.0.invAddrLoads 0 # Number of loads ignored due to an invalid address
system.cpu.iew.lsq.thread.0.invAddrSwpfs 0 # Number of software prefetches ignored due to an invalid address
-system.cpu.iew.lsq.thread.0.memOrderViolation 45 # Number of memory ordering violations
+system.cpu.iew.lsq.thread.0.memOrderViolation 40 # Number of memory ordering violations
system.cpu.iew.lsq.thread.0.rescheduledLoads 1 # Number of loads that were rescheduled
-system.cpu.iew.lsq.thread.0.squashedLoads 2840 # Number of loads squashed
-system.cpu.iew.lsq.thread.0.squashedStores 2915 # Number of stores squashed
-system.cpu.iew.memOrderViolationEvents 45 # Number of memory order violations
-system.cpu.iew.predictedNotTakenIncorrect 283 # Number of branches that were predicted not taken incorrectly
-system.cpu.iew.predictedTakenIncorrect 121 # Number of branches that were predicted taken incorrectly
-system.cpu.ipc 0.003993 # IPC: Instructions Per Cycle
-system.cpu.ipc_total 0.003993 # IPC: Total IPC of All Threads
-system.cpu.iq.ISSUE:FU_type_0 13960 # Type of FU issued
+system.cpu.iew.lsq.thread.0.squashedLoads 2796 # Number of loads squashed
+system.cpu.iew.lsq.thread.0.squashedStores 2922 # Number of stores squashed
+system.cpu.iew.memOrderViolationEvents 40 # Number of memory order violations
+system.cpu.iew.predictedNotTakenIncorrect 281 # Number of branches that were predicted not taken incorrectly
+system.cpu.iew.predictedTakenIncorrect 120 # Number of branches that were predicted taken incorrectly
+system.cpu.ipc 0.004016 # IPC: Instructions Per Cycle
+system.cpu.ipc_total 0.004016 # IPC: Total IPC of All Threads
+system.cpu.iq.ISSUE:FU_type_0 13840 # Type of FU issued
system.cpu.iq.ISSUE:FU_type_0.start_dist
(null) 2 0.01% # Type of FU issued
- IntAlu 8277 59.29% # Type of FU issued
+ IntAlu 8249 59.60% # Type of FU issued
IntMult 1 0.01% # Type of FU issued
IntDiv 0 0.00% # Type of FU issued
FloatAdd 2 0.01% # Type of FU issued
@@ -279,16 +279,16 @@ system.cpu.iq.ISSUE:FU_type_0.start_dist
FloatMult 0 0.00% # Type of FU issued
FloatDiv 0 0.00% # Type of FU issued
FloatSqrt 0 0.00% # Type of FU issued
- MemRead 3509 25.14% # Type of FU issued
- MemWrite 2169 15.54% # Type of FU issued
+ MemRead 3432 24.80% # Type of FU issued
+ MemWrite 2154 15.56% # Type of FU issued
IprAccess 0 0.00% # Type of FU issued
InstPrefetch 0 0.00% # Type of FU issued
system.cpu.iq.ISSUE:FU_type_0.end_dist
-system.cpu.iq.ISSUE:fu_busy_cnt 93 # FU busy when requested
-system.cpu.iq.ISSUE:fu_busy_rate 0.006662 # FU busy rate (busy events/executed inst)
+system.cpu.iq.ISSUE:fu_busy_cnt 86 # FU busy when requested
+system.cpu.iq.ISSUE:fu_busy_rate 0.006214 # FU busy rate (busy events/executed inst)
system.cpu.iq.ISSUE:fu_full.start_dist
(null) 0 0.00% # attempts to use FU when none available
- IntAlu 3 3.23% # attempts to use FU when none available
+ IntAlu 1 1.16% # attempts to use FU when none available
IntMult 0 0.00% # attempts to use FU when none available
IntDiv 0 0.00% # attempts to use FU when none available
FloatAdd 0 0.00% # attempts to use FU when none available
@@ -297,78 +297,78 @@ system.cpu.iq.ISSUE:fu_full.start_dist
FloatMult 0 0.00% # attempts to use FU when none available
FloatDiv 0 0.00% # attempts to use FU when none available
FloatSqrt 0 0.00% # attempts to use FU when none available
- MemRead 54 58.06% # attempts to use FU when none available
- MemWrite 36 38.71% # attempts to use FU when none available
+ MemRead 53 61.63% # attempts to use FU when none available
+ MemWrite 32 37.21% # attempts to use FU when none available
IprAccess 0 0.00% # attempts to use FU when none available
InstPrefetch 0 0.00% # attempts to use FU when none available
system.cpu.iq.ISSUE:fu_full.end_dist
system.cpu.iq.ISSUE:issued_per_cycle.start_dist # Number of insts issued each cycle
-system.cpu.iq.ISSUE:issued_per_cycle.samples 61364
+system.cpu.iq.ISSUE:issued_per_cycle.samples 53773
system.cpu.iq.ISSUE:issued_per_cycle.min_value 0
- 0 54449 8873.12%
- 1 3310 539.40%
- 2 1268 206.64%
- 3 1704 277.69%
- 4 325 52.96%
- 5 194 31.61%
- 6 79 12.87%
- 7 22 3.59%
- 8 13 2.12%
+ 0 46903 8722.41%
+ 1 3262 606.62%
+ 2 1316 244.73%
+ 3 1665 309.63%
+ 4 333 61.93%
+ 5 188 34.96%
+ 6 73 13.58%
+ 7 23 4.28%
+ 8 10 1.86%
system.cpu.iq.ISSUE:issued_per_cycle.max_value 8
system.cpu.iq.ISSUE:issued_per_cycle.end_dist
-system.cpu.iq.ISSUE:rate 0.227495 # Inst issue rate
-system.cpu.iq.iqInstsAdded 19398 # Number of instructions added to the IQ (excludes non-spec)
-system.cpu.iq.iqInstsIssued 13960 # Number of instructions issued
-system.cpu.iq.iqNonSpecInstsAdded 23 # Number of non-speculative instructions added to the IQ
-system.cpu.iq.iqSquashedInstsExamined 13240 # Number of squashed instructions iterated over during squash; mainly for profiling
-system.cpu.iq.iqSquashedInstsIssued 66 # Number of squashed instructions issued
-system.cpu.iq.iqSquashedNonSpecRemoved 6 # Number of squashed non-spec instructions that were removed
-system.cpu.iq.iqSquashedOperandsExamined 9412 # Number of squashed operands that are examined and possibly removed from graph
-system.cpu.l2cache.ReadReq_accesses 483 # number of ReadReq accesses(hits+misses)
-system.cpu.l2cache.ReadReq_avg_miss_latency 4537.301455 # average ReadReq miss latency
-system.cpu.l2cache.ReadReq_avg_mshr_miss_latency 2307.006237 # average ReadReq mshr miss latency
+system.cpu.iq.ISSUE:rate 0.257378 # Inst issue rate
+system.cpu.iq.iqInstsAdded 19393 # Number of instructions added to the IQ (excludes non-spec)
+system.cpu.iq.iqInstsIssued 13840 # Number of instructions issued
+system.cpu.iq.iqNonSpecInstsAdded 24 # Number of non-speculative instructions added to the IQ
+system.cpu.iq.iqSquashedInstsExamined 13381 # Number of squashed instructions iterated over during squash; mainly for profiling
+system.cpu.iq.iqSquashedInstsIssued 72 # Number of squashed instructions issued
+system.cpu.iq.iqSquashedNonSpecRemoved 7 # Number of squashed non-spec instructions that were removed
+system.cpu.iq.iqSquashedOperandsExamined 9575 # Number of squashed operands that are examined and possibly removed from graph
+system.cpu.l2cache.ReadReq_accesses 482 # number of ReadReq accesses(hits+misses)
+system.cpu.l2cache.ReadReq_avg_miss_latency 4520.691667 # average ReadReq miss latency
+system.cpu.l2cache.ReadReq_avg_mshr_miss_latency 2303.372917 # average ReadReq mshr miss latency
system.cpu.l2cache.ReadReq_hits 2 # number of ReadReq hits
-system.cpu.l2cache.ReadReq_miss_latency 2182442 # number of ReadReq miss cycles
-system.cpu.l2cache.ReadReq_miss_rate 0.995859 # miss rate for ReadReq accesses
-system.cpu.l2cache.ReadReq_misses 481 # number of ReadReq misses
-system.cpu.l2cache.ReadReq_mshr_miss_latency 1109670 # number of ReadReq MSHR miss cycles
-system.cpu.l2cache.ReadReq_mshr_miss_rate 0.995859 # mshr miss rate for ReadReq accesses
-system.cpu.l2cache.ReadReq_mshr_misses 481 # number of ReadReq MSHR misses
+system.cpu.l2cache.ReadReq_miss_latency 2169932 # number of ReadReq miss cycles
+system.cpu.l2cache.ReadReq_miss_rate 0.995851 # miss rate for ReadReq accesses
+system.cpu.l2cache.ReadReq_misses 480 # number of ReadReq misses
+system.cpu.l2cache.ReadReq_mshr_miss_latency 1105619 # number of ReadReq MSHR miss cycles
+system.cpu.l2cache.ReadReq_mshr_miss_rate 0.995851 # mshr miss rate for ReadReq accesses
+system.cpu.l2cache.ReadReq_mshr_misses 480 # number of ReadReq MSHR misses
system.cpu.l2cache.avg_blocked_cycles_no_mshrs <err: div-0> # average number of cycles each access was blocked
system.cpu.l2cache.avg_blocked_cycles_no_targets <err: div-0> # average number of cycles each access was blocked
-system.cpu.l2cache.avg_refs 0.004158 # Average number of references to valid blocks.
+system.cpu.l2cache.avg_refs 0.004167 # Average number of references to valid blocks.
system.cpu.l2cache.blocked_no_mshrs 0 # number of cycles access was blocked
system.cpu.l2cache.blocked_no_targets 0 # number of cycles access was blocked
system.cpu.l2cache.blocked_cycles_no_mshrs 0 # number of cycles access was blocked
system.cpu.l2cache.blocked_cycles_no_targets 0 # number of cycles access was blocked
system.cpu.l2cache.cache_copies 0 # number of cache copies performed
-system.cpu.l2cache.demand_accesses 483 # number of demand (read+write) accesses
-system.cpu.l2cache.demand_avg_miss_latency 4537.301455 # average overall miss latency
-system.cpu.l2cache.demand_avg_mshr_miss_latency 2307.006237 # average overall mshr miss latency
+system.cpu.l2cache.demand_accesses 482 # number of demand (read+write) accesses
+system.cpu.l2cache.demand_avg_miss_latency 4520.691667 # average overall miss latency
+system.cpu.l2cache.demand_avg_mshr_miss_latency 2303.372917 # average overall mshr miss latency
system.cpu.l2cache.demand_hits 2 # number of demand (read+write) hits
-system.cpu.l2cache.demand_miss_latency 2182442 # number of demand (read+write) miss cycles
-system.cpu.l2cache.demand_miss_rate 0.995859 # miss rate for demand accesses
-system.cpu.l2cache.demand_misses 481 # number of demand (read+write) misses
+system.cpu.l2cache.demand_miss_latency 2169932 # number of demand (read+write) miss cycles
+system.cpu.l2cache.demand_miss_rate 0.995851 # miss rate for demand accesses
+system.cpu.l2cache.demand_misses 480 # number of demand (read+write) misses
system.cpu.l2cache.demand_mshr_hits 0 # number of demand (read+write) MSHR hits
-system.cpu.l2cache.demand_mshr_miss_latency 1109670 # number of demand (read+write) MSHR miss cycles
-system.cpu.l2cache.demand_mshr_miss_rate 0.995859 # mshr miss rate for demand accesses
-system.cpu.l2cache.demand_mshr_misses 481 # number of demand (read+write) MSHR misses
+system.cpu.l2cache.demand_mshr_miss_latency 1105619 # number of demand (read+write) MSHR miss cycles
+system.cpu.l2cache.demand_mshr_miss_rate 0.995851 # mshr miss rate for demand accesses
+system.cpu.l2cache.demand_mshr_misses 480 # number of demand (read+write) MSHR misses
system.cpu.l2cache.fast_writes 0 # number of fast writes performed
system.cpu.l2cache.mshr_cap_events 0 # number of times MSHR cap was activated
system.cpu.l2cache.no_allocate_misses 0 # Number of misses that were no-allocate
-system.cpu.l2cache.overall_accesses 483 # number of overall (read+write) accesses
-system.cpu.l2cache.overall_avg_miss_latency 4537.301455 # average overall miss latency
-system.cpu.l2cache.overall_avg_mshr_miss_latency 2307.006237 # average overall mshr miss latency
+system.cpu.l2cache.overall_accesses 482 # number of overall (read+write) accesses
+system.cpu.l2cache.overall_avg_miss_latency 4520.691667 # average overall miss latency
+system.cpu.l2cache.overall_avg_mshr_miss_latency 2303.372917 # average overall mshr miss latency
system.cpu.l2cache.overall_avg_mshr_uncacheable_latency <err: div-0> # average overall mshr uncacheable latency
system.cpu.l2cache.overall_hits 2 # number of overall hits
-system.cpu.l2cache.overall_miss_latency 2182442 # number of overall miss cycles
-system.cpu.l2cache.overall_miss_rate 0.995859 # miss rate for overall accesses
-system.cpu.l2cache.overall_misses 481 # number of overall misses
+system.cpu.l2cache.overall_miss_latency 2169932 # number of overall miss cycles
+system.cpu.l2cache.overall_miss_rate 0.995851 # miss rate for overall accesses
+system.cpu.l2cache.overall_misses 480 # number of overall misses
system.cpu.l2cache.overall_mshr_hits 0 # number of overall MSHR hits
-system.cpu.l2cache.overall_mshr_miss_latency 1109670 # number of overall MSHR miss cycles
-system.cpu.l2cache.overall_mshr_miss_rate 0.995859 # mshr miss rate for overall accesses
-system.cpu.l2cache.overall_mshr_misses 481 # number of overall MSHR misses
+system.cpu.l2cache.overall_mshr_miss_latency 1105619 # number of overall MSHR miss cycles
+system.cpu.l2cache.overall_mshr_miss_rate 0.995851 # mshr miss rate for overall accesses
+system.cpu.l2cache.overall_mshr_misses 480 # number of overall MSHR misses
system.cpu.l2cache.overall_mshr_uncacheable_latency 0 # number of overall MSHR uncacheable cycles
system.cpu.l2cache.overall_mshr_uncacheable_misses 0 # number of overall MSHR uncacheable misses
system.cpu.l2cache.prefetcher.num_hwpf_already_in_cache 0 # number of hwpf that were already in the cache
@@ -381,29 +381,29 @@ system.cpu.l2cache.prefetcher.num_hwpf_removed_MSHR_hit 0
system.cpu.l2cache.prefetcher.num_hwpf_span_page 0 # number of hwpf spanning a virtual page
system.cpu.l2cache.prefetcher.num_hwpf_squashed_from_miss 0 # number of hwpf that got squashed due to a miss aborting calculation time
system.cpu.l2cache.replacements 0 # number of replacements
-system.cpu.l2cache.sampled_refs 481 # Sample count of references to valid blocks.
+system.cpu.l2cache.sampled_refs 480 # Sample count of references to valid blocks.
system.cpu.l2cache.soft_prefetch_mshr_full 0 # number of mshr full events for SW prefetching instrutions
-system.cpu.l2cache.tagsinuse 248.876875 # Cycle average of tags in use
+system.cpu.l2cache.tagsinuse 248.469634 # Cycle average of tags in use
system.cpu.l2cache.total_refs 2 # Total number of references to valid blocks.
system.cpu.l2cache.warmup_cycle 0 # Cycle when the warmup percentage was hit.
system.cpu.l2cache.writebacks 0 # number of writebacks
-system.cpu.numCycles 61364 # number of cpu cycles simulated
-system.cpu.rename.RENAME:BlockCycles 6939 # Number of cycles rename is blocking
+system.cpu.numCycles 53773 # number of cpu cycles simulated
+system.cpu.rename.RENAME:BlockCycles 7860 # Number of cycles rename is blocking
system.cpu.rename.RENAME:CommittedMaps 4051 # Number of HB maps that are committed
system.cpu.rename.RENAME:IQFullEvents 2 # Number of times rename has blocked due to IQ full
-system.cpu.rename.RENAME:IdleCycles 36651 # Number of cycles rename is idle
-system.cpu.rename.RENAME:LSQFullEvents 412 # Number of times rename has blocked due to LSQ full
-system.cpu.rename.RENAME:ROBFullEvents 9 # Number of times rename has blocked due to ROB full
-system.cpu.rename.RENAME:RenameLookups 36093 # Number of register rename lookups that rename has made
-system.cpu.rename.RENAME:RenamedInsts 29280 # Number of instructions processed by rename
-system.cpu.rename.RENAME:RenamedOperands 20221 # Number of destination operands rename has renamed
-system.cpu.rename.RENAME:RunCycles 5480 # Number of cycles rename is running
-system.cpu.rename.RENAME:SquashCycles 2641 # Number of cycles rename is squashing
-system.cpu.rename.RENAME:UnblockCycles 493 # Number of cycles rename is unblocking
-system.cpu.rename.RENAME:UndoneMaps 16170 # Number of HB maps that are undone due to squashing
-system.cpu.rename.RENAME:serializeStallCycles 9160 # count of cycles rename stalled for serializing inst
+system.cpu.rename.RENAME:IdleCycles 28280 # Number of cycles rename is idle
+system.cpu.rename.RENAME:LSQFullEvents 453 # Number of times rename has blocked due to LSQ full
+system.cpu.rename.RENAME:ROBFullEvents 8 # Number of times rename has blocked due to ROB full
+system.cpu.rename.RENAME:RenameLookups 36016 # Number of register rename lookups that rename has made
+system.cpu.rename.RENAME:RenamedInsts 29203 # Number of instructions processed by rename
+system.cpu.rename.RENAME:RenamedOperands 20142 # Number of destination operands rename has renamed
+system.cpu.rename.RENAME:RunCycles 5460 # Number of cycles rename is running
+system.cpu.rename.RENAME:SquashCycles 2529 # Number of cycles rename is squashing
+system.cpu.rename.RENAME:UnblockCycles 483 # Number of cycles rename is unblocking
+system.cpu.rename.RENAME:UndoneMaps 16091 # Number of HB maps that are undone due to squashing
+system.cpu.rename.RENAME:serializeStallCycles 9161 # count of cycles rename stalled for serializing inst
system.cpu.rename.RENAME:serializingInsts 27 # count of serializing insts renamed
-system.cpu.rename.RENAME:skidInsts 927 # count of insts added to the skid buffer
+system.cpu.rename.RENAME:skidInsts 828 # count of insts added to the skid buffer
system.cpu.rename.RENAME:tempSerializingInsts 21 # count of temporary serializing insts renamed
system.cpu.timesIdled 369 # Number of times that the entire CPU went into an idle state and unscheduled itself
system.cpu.workload.PROG:num_syscalls 17 # Number of system calls
diff --git a/tests/quick/00.hello/ref/alpha/tru64/o3-timing/m5stats.txt b/tests/quick/00.hello/ref/alpha/tru64/o3-timing/m5stats.txt
index 44f155480..ce44cab28 100644
--- a/tests/quick/00.hello/ref/alpha/tru64/o3-timing/m5stats.txt
+++ b/tests/quick/00.hello/ref/alpha/tru64/o3-timing/m5stats.txt
@@ -2,39 +2,39 @@
---------- Begin Simulation Statistics ----------
global.BPredUnit.BTBCorrect 0 # Number of correct BTB predictions (this stat may not work properly.
global.BPredUnit.BTBHits 200 # Number of BTB hits
-global.BPredUnit.BTBLookups 711 # Number of BTB lookups
+global.BPredUnit.BTBLookups 718 # Number of BTB lookups
global.BPredUnit.RASInCorrect 42 # Number of incorrect RAS predictions.
-global.BPredUnit.condIncorrect 221 # Number of conditional branches incorrect
-global.BPredUnit.condPredicted 451 # Number of conditional branches predicted
-global.BPredUnit.lookups 891 # Number of BP lookups
-global.BPredUnit.usedRAS 172 # Number of times the RAS was used to get a target.
-host_inst_rate 20134 # Simulator instruction rate (inst/s)
-host_mem_usage 179640 # Number of bytes of host memory used
+global.BPredUnit.condIncorrect 218 # Number of conditional branches incorrect
+global.BPredUnit.condPredicted 459 # Number of conditional branches predicted
+global.BPredUnit.lookups 898 # Number of BP lookups
+global.BPredUnit.usedRAS 171 # Number of times the RAS was used to get a target.
+host_inst_rate 19676 # Simulator instruction rate (inst/s)
+host_mem_usage 179796 # Number of bytes of host memory used
host_seconds 0.12 # Real time elapsed on the host
-host_tick_rate 6326998 # Simulator tick rate (ticks/s)
+host_tick_rate 6183068 # Simulator tick rate (ticks/s)
memdepunit.memDep.conflictingLoads 10 # Number of conflicting loads.
memdepunit.memDep.conflictingStores 8 # Number of conflicting stores.
-memdepunit.memDep.insertedLoads 784 # Number of loads inserted to the mem dependence unit.
-memdepunit.memDep.insertedStores 376 # Number of stores inserted to the mem dependence unit.
+memdepunit.memDep.insertedLoads 783 # Number of loads inserted to the mem dependence unit.
+memdepunit.memDep.insertedStores 381 # Number of stores inserted to the mem dependence unit.
sim_freq 1000000000000 # Frequency of simulated ticks
sim_insts 2387 # Number of instructions simulated
sim_seconds 0.000001 # Number of seconds simulated
-sim_ticks 752027 # Number of ticks simulated
+sim_ticks 752028 # Number of ticks simulated
system.cpu.commit.COM:branches 396 # Number of branches committed
-system.cpu.commit.COM:bw_lim_events 56 # number cycles where commit BW limit reached
+system.cpu.commit.COM:bw_lim_events 51 # number cycles where commit BW limit reached
system.cpu.commit.COM:bw_limited 0 # number of insts not committed due to BW limits
system.cpu.commit.COM:committed_per_cycle.start_dist # Number of insts commited each cycle
-system.cpu.commit.COM:committed_per_cycle.samples 28113
+system.cpu.commit.COM:committed_per_cycle.samples 28200
system.cpu.commit.COM:committed_per_cycle.min_value 0
- 0 27203 9676.31%
- 1 230 81.81%
- 2 313 111.34%
- 3 133 47.31%
- 4 80 28.46%
- 5 53 18.85%
- 6 27 9.60%
- 7 18 6.40%
- 8 56 19.92%
+ 0 27270 9670.21%
+ 1 239 84.75%
+ 2 332 117.73%
+ 3 127 45.04%
+ 4 83 29.43%
+ 5 54 19.15%
+ 6 26 9.22%
+ 7 18 6.38%
+ 8 51 18.09%
system.cpu.commit.COM:committed_per_cycle.max_value 8
system.cpu.commit.COM:committed_per_cycle.end_dist
@@ -43,69 +43,69 @@ system.cpu.commit.COM:loads 415 # Nu
system.cpu.commit.COM:membars 0 # Number of memory barriers committed
system.cpu.commit.COM:refs 709 # Number of memory references committed
system.cpu.commit.COM:swp_count 0 # Number of s/w prefetches committed
-system.cpu.commit.branchMispredicts 144 # The number of times a branch was mispredicted
+system.cpu.commit.branchMispredicts 141 # The number of times a branch was mispredicted
system.cpu.commit.commitCommittedInsts 2576 # The number of committed instructions
system.cpu.commit.commitNonSpecStalls 4 # The number of times commit has been forced to stall to communicate backwards
-system.cpu.commit.commitSquashedInsts 1694 # The number of squashed insts skipped by commit
+system.cpu.commit.commitSquashedInsts 1703 # The number of squashed insts skipped by commit
system.cpu.committedInsts 2387 # Number of Instructions Simulated
system.cpu.committedInsts_total 2387 # Number of Instructions Simulated
-system.cpu.cpi 315.051110 # CPI: Cycles Per Instruction
-system.cpu.cpi_total 315.051110 # CPI: Total CPI of All Threads
-system.cpu.dcache.ReadReq_accesses 562 # number of ReadReq accesses(hits+misses)
-system.cpu.dcache.ReadReq_avg_miss_latency 7254.010870 # average ReadReq miss latency
-system.cpu.dcache.ReadReq_avg_mshr_miss_latency 7288.590164 # average ReadReq mshr miss latency
-system.cpu.dcache.ReadReq_hits 470 # number of ReadReq hits
-system.cpu.dcache.ReadReq_miss_latency 667369 # number of ReadReq miss cycles
-system.cpu.dcache.ReadReq_miss_rate 0.163701 # miss rate for ReadReq accesses
+system.cpu.cpi 315.051529 # CPI: Cycles Per Instruction
+system.cpu.cpi_total 315.051529 # CPI: Total CPI of All Threads
+system.cpu.dcache.ReadReq_accesses 560 # number of ReadReq accesses(hits+misses)
+system.cpu.dcache.ReadReq_avg_miss_latency 7231.967391 # average ReadReq miss latency
+system.cpu.dcache.ReadReq_avg_mshr_miss_latency 7288.377049 # average ReadReq mshr miss latency
+system.cpu.dcache.ReadReq_hits 468 # number of ReadReq hits
+system.cpu.dcache.ReadReq_miss_latency 665341 # number of ReadReq miss cycles
+system.cpu.dcache.ReadReq_miss_rate 0.164286 # miss rate for ReadReq accesses
system.cpu.dcache.ReadReq_misses 92 # number of ReadReq misses
system.cpu.dcache.ReadReq_mshr_hits 31 # number of ReadReq MSHR hits
-system.cpu.dcache.ReadReq_mshr_miss_latency 444604 # number of ReadReq MSHR miss cycles
-system.cpu.dcache.ReadReq_mshr_miss_rate 0.108541 # mshr miss rate for ReadReq accesses
+system.cpu.dcache.ReadReq_mshr_miss_latency 444591 # number of ReadReq MSHR miss cycles
+system.cpu.dcache.ReadReq_mshr_miss_rate 0.108929 # mshr miss rate for ReadReq accesses
system.cpu.dcache.ReadReq_mshr_misses 61 # number of ReadReq MSHR misses
system.cpu.dcache.WriteReq_accesses 294 # number of WriteReq accesses(hits+misses)
-system.cpu.dcache.WriteReq_avg_miss_latency 6647.600000 # average WriteReq miss latency
-system.cpu.dcache.WriteReq_avg_mshr_miss_latency 6571.583333 # average WriteReq mshr miss latency
+system.cpu.dcache.WriteReq_avg_miss_latency 6647.685714 # average WriteReq miss latency
+system.cpu.dcache.WriteReq_avg_mshr_miss_latency 6571.666667 # average WriteReq mshr miss latency
system.cpu.dcache.WriteReq_hits 224 # number of WriteReq hits
-system.cpu.dcache.WriteReq_miss_latency 465332 # number of WriteReq miss cycles
+system.cpu.dcache.WriteReq_miss_latency 465338 # number of WriteReq miss cycles
system.cpu.dcache.WriteReq_miss_rate 0.238095 # miss rate for WriteReq accesses
system.cpu.dcache.WriteReq_misses 70 # number of WriteReq misses
system.cpu.dcache.WriteReq_mshr_hits 46 # number of WriteReq MSHR hits
-system.cpu.dcache.WriteReq_mshr_miss_latency 157718 # number of WriteReq MSHR miss cycles
+system.cpu.dcache.WriteReq_mshr_miss_latency 157720 # number of WriteReq MSHR miss cycles
system.cpu.dcache.WriteReq_mshr_miss_rate 0.081633 # mshr miss rate for WriteReq accesses
system.cpu.dcache.WriteReq_mshr_misses 24 # number of WriteReq MSHR misses
system.cpu.dcache.avg_blocked_cycles_no_mshrs <err: div-0> # average number of cycles each access was blocked
-system.cpu.dcache.avg_blocked_cycles_no_targets 2980.125000 # average number of cycles each access was blocked
-system.cpu.dcache.avg_refs 8.164706 # Average number of references to valid blocks.
+system.cpu.dcache.avg_blocked_cycles_no_targets 2980 # average number of cycles each access was blocked
+system.cpu.dcache.avg_refs 8.141176 # Average number of references to valid blocks.
system.cpu.dcache.blocked_no_mshrs 0 # number of cycles access was blocked
system.cpu.dcache.blocked_no_targets 8 # number of cycles access was blocked
system.cpu.dcache.blocked_cycles_no_mshrs 0 # number of cycles access was blocked
-system.cpu.dcache.blocked_cycles_no_targets 23841 # number of cycles access was blocked
+system.cpu.dcache.blocked_cycles_no_targets 23840 # number of cycles access was blocked
system.cpu.dcache.cache_copies 0 # number of cache copies performed
-system.cpu.dcache.demand_accesses 856 # number of demand (read+write) accesses
-system.cpu.dcache.demand_avg_miss_latency 6991.981481 # average overall miss latency
-system.cpu.dcache.demand_avg_mshr_miss_latency 7086.141176 # average overall mshr miss latency
-system.cpu.dcache.demand_hits 694 # number of demand (read+write) hits
-system.cpu.dcache.demand_miss_latency 1132701 # number of demand (read+write) miss cycles
-system.cpu.dcache.demand_miss_rate 0.189252 # miss rate for demand accesses
+system.cpu.dcache.demand_accesses 854 # number of demand (read+write) accesses
+system.cpu.dcache.demand_avg_miss_latency 6979.500000 # average overall miss latency
+system.cpu.dcache.demand_avg_mshr_miss_latency 7086.011765 # average overall mshr miss latency
+system.cpu.dcache.demand_hits 692 # number of demand (read+write) hits
+system.cpu.dcache.demand_miss_latency 1130679 # number of demand (read+write) miss cycles
+system.cpu.dcache.demand_miss_rate 0.189696 # miss rate for demand accesses
system.cpu.dcache.demand_misses 162 # number of demand (read+write) misses
system.cpu.dcache.demand_mshr_hits 77 # number of demand (read+write) MSHR hits
-system.cpu.dcache.demand_mshr_miss_latency 602322 # number of demand (read+write) MSHR miss cycles
-system.cpu.dcache.demand_mshr_miss_rate 0.099299 # mshr miss rate for demand accesses
+system.cpu.dcache.demand_mshr_miss_latency 602311 # number of demand (read+write) MSHR miss cycles
+system.cpu.dcache.demand_mshr_miss_rate 0.099532 # mshr miss rate for demand accesses
system.cpu.dcache.demand_mshr_misses 85 # number of demand (read+write) MSHR misses
system.cpu.dcache.fast_writes 0 # number of fast writes performed
system.cpu.dcache.mshr_cap_events 0 # number of times MSHR cap was activated
system.cpu.dcache.no_allocate_misses 0 # Number of misses that were no-allocate
-system.cpu.dcache.overall_accesses 856 # number of overall (read+write) accesses
-system.cpu.dcache.overall_avg_miss_latency 6991.981481 # average overall miss latency
-system.cpu.dcache.overall_avg_mshr_miss_latency 7086.141176 # average overall mshr miss latency
+system.cpu.dcache.overall_accesses 854 # number of overall (read+write) accesses
+system.cpu.dcache.overall_avg_miss_latency 6979.500000 # average overall miss latency
+system.cpu.dcache.overall_avg_mshr_miss_latency 7086.011765 # average overall mshr miss latency
system.cpu.dcache.overall_avg_mshr_uncacheable_latency <err: div-0> # average overall mshr uncacheable latency
-system.cpu.dcache.overall_hits 694 # number of overall hits
-system.cpu.dcache.overall_miss_latency 1132701 # number of overall miss cycles
-system.cpu.dcache.overall_miss_rate 0.189252 # miss rate for overall accesses
+system.cpu.dcache.overall_hits 692 # number of overall hits
+system.cpu.dcache.overall_miss_latency 1130679 # number of overall miss cycles
+system.cpu.dcache.overall_miss_rate 0.189696 # miss rate for overall accesses
system.cpu.dcache.overall_misses 162 # number of overall misses
system.cpu.dcache.overall_mshr_hits 77 # number of overall MSHR hits
-system.cpu.dcache.overall_mshr_miss_latency 602322 # number of overall MSHR miss cycles
-system.cpu.dcache.overall_mshr_miss_rate 0.099299 # mshr miss rate for overall accesses
+system.cpu.dcache.overall_mshr_miss_latency 602311 # number of overall MSHR miss cycles
+system.cpu.dcache.overall_mshr_miss_rate 0.099532 # mshr miss rate for overall accesses
system.cpu.dcache.overall_mshr_misses 85 # number of overall MSHR misses
system.cpu.dcache.overall_mshr_uncacheable_latency 0 # number of overall MSHR uncacheable cycles
system.cpu.dcache.overall_mshr_uncacheable_misses 0 # number of overall MSHR uncacheable misses
@@ -121,88 +121,88 @@ system.cpu.dcache.prefetcher.num_hwpf_squashed_from_miss 0
system.cpu.dcache.replacements 0 # number of replacements
system.cpu.dcache.sampled_refs 85 # Sample count of references to valid blocks.
system.cpu.dcache.soft_prefetch_mshr_full 0 # number of mshr full events for SW prefetching instrutions
-system.cpu.dcache.tagsinuse 46.684937 # Cycle average of tags in use
-system.cpu.dcache.total_refs 694 # Total number of references to valid blocks.
+system.cpu.dcache.tagsinuse 46.684988 # Cycle average of tags in use
+system.cpu.dcache.total_refs 692 # Total number of references to valid blocks.
system.cpu.dcache.warmup_cycle 0 # Cycle when the warmup percentage was hit.
system.cpu.dcache.writebacks 0 # number of writebacks
-system.cpu.decode.DECODE:BlockedCycles 21872 # Number of cycles decode is blocked
+system.cpu.decode.DECODE:BlockedCycles 21865 # Number of cycles decode is blocked
system.cpu.decode.DECODE:BranchMispred 79 # Number of times decode detected a branch misprediction
system.cpu.decode.DECODE:BranchResolved 150 # Number of times decode resolved a branch
-system.cpu.decode.DECODE:DecodedInsts 4868 # Number of instructions handled by decode
-system.cpu.decode.DECODE:IdleCycles 5315 # Number of cycles decode is idle
-system.cpu.decode.DECODE:RunCycles 925 # Number of cycles decode is running
-system.cpu.decode.DECODE:SquashCycles 338 # Number of cycles decode is squashing
+system.cpu.decode.DECODE:DecodedInsts 4900 # Number of instructions handled by decode
+system.cpu.decode.DECODE:IdleCycles 5406 # Number of cycles decode is idle
+system.cpu.decode.DECODE:RunCycles 928 # Number of cycles decode is running
+system.cpu.decode.DECODE:SquashCycles 336 # Number of cycles decode is squashing
system.cpu.decode.DECODE:SquashedInsts 286 # Number of squashed instructions handled by decode
system.cpu.decode.DECODE:UnblockCycles 2 # Number of cycles decode is unblocking
-system.cpu.fetch.Branches 891 # Number of branches that fetch encountered
-system.cpu.fetch.CacheLines 814 # Number of cache lines fetched
-system.cpu.fetch.Cycles 1788 # Number of cycles fetch has run and was not squashing or blocked
-system.cpu.fetch.IcacheSquashes 145 # Number of outstanding Icache misses that were squashed
-system.cpu.fetch.Insts 5562 # Number of instructions fetch has processed
-system.cpu.fetch.SquashCycles 260 # Number of cycles fetch has spent squashing
-system.cpu.fetch.branchRate 0.031316 # Number of branch fetches per cycle
-system.cpu.fetch.icacheStallCycles 814 # Number of cycles fetch is stalled on an Icache miss
-system.cpu.fetch.predictedBranches 372 # Number of branches that fetch has predicted taken
-system.cpu.fetch.rate 0.195487 # Number of inst fetches per cycle
+system.cpu.fetch.Branches 898 # Number of branches that fetch encountered
+system.cpu.fetch.CacheLines 813 # Number of cache lines fetched
+system.cpu.fetch.Cycles 1774 # Number of cycles fetch has run and was not squashing or blocked
+system.cpu.fetch.IcacheSquashes 146 # Number of outstanding Icache misses that were squashed
+system.cpu.fetch.Insts 5593 # Number of instructions fetch has processed
+system.cpu.fetch.SquashCycles 258 # Number of cycles fetch has spent squashing
+system.cpu.fetch.branchRate 0.031468 # Number of branch fetches per cycle
+system.cpu.fetch.icacheStallCycles 813 # Number of cycles fetch is stalled on an Icache miss
+system.cpu.fetch.predictedBranches 371 # Number of branches that fetch has predicted taken
+system.cpu.fetch.rate 0.195991 # Number of inst fetches per cycle
system.cpu.fetch.rateDist.start_dist # Number of instructions fetched each cycle (Total)
-system.cpu.fetch.rateDist.samples 28452
+system.cpu.fetch.rateDist.samples 28537
system.cpu.fetch.rateDist.min_value 0
- 0 27494 9663.29%
- 1 51 17.92%
- 2 92 32.34%
- 3 74 26.01%
- 4 117 41.12%
- 5 71 24.95%
- 6 43 15.11%
- 7 56 19.68%
- 8 454 159.57%
+ 0 27576 9663.24%
+ 1 50 17.52%
+ 2 92 32.24%
+ 3 74 25.93%
+ 4 117 41.00%
+ 5 71 24.88%
+ 6 43 15.07%
+ 7 56 19.62%
+ 8 458 160.49%
system.cpu.fetch.rateDist.max_value 8
system.cpu.fetch.rateDist.end_dist
-system.cpu.icache.ReadReq_accesses 814 # number of ReadReq accesses(hits+misses)
-system.cpu.icache.ReadReq_avg_miss_latency 4971.589641 # average ReadReq miss latency
-system.cpu.icache.ReadReq_avg_mshr_miss_latency 4152.244565 # average ReadReq mshr miss latency
-system.cpu.icache.ReadReq_hits 563 # number of ReadReq hits
-system.cpu.icache.ReadReq_miss_latency 1247869 # number of ReadReq miss cycles
-system.cpu.icache.ReadReq_miss_rate 0.308354 # miss rate for ReadReq accesses
+system.cpu.icache.ReadReq_accesses 813 # number of ReadReq accesses(hits+misses)
+system.cpu.icache.ReadReq_avg_miss_latency 4955.450199 # average ReadReq miss latency
+system.cpu.icache.ReadReq_avg_mshr_miss_latency 4151.809783 # average ReadReq mshr miss latency
+system.cpu.icache.ReadReq_hits 562 # number of ReadReq hits
+system.cpu.icache.ReadReq_miss_latency 1243818 # number of ReadReq miss cycles
+system.cpu.icache.ReadReq_miss_rate 0.308733 # miss rate for ReadReq accesses
system.cpu.icache.ReadReq_misses 251 # number of ReadReq misses
system.cpu.icache.ReadReq_mshr_hits 67 # number of ReadReq MSHR hits
-system.cpu.icache.ReadReq_mshr_miss_latency 764013 # number of ReadReq MSHR miss cycles
-system.cpu.icache.ReadReq_mshr_miss_rate 0.226044 # mshr miss rate for ReadReq accesses
+system.cpu.icache.ReadReq_mshr_miss_latency 763933 # number of ReadReq MSHR miss cycles
+system.cpu.icache.ReadReq_mshr_miss_rate 0.226322 # mshr miss rate for ReadReq accesses
system.cpu.icache.ReadReq_mshr_misses 184 # number of ReadReq MSHR misses
system.cpu.icache.avg_blocked_cycles_no_mshrs <err: div-0> # average number of cycles each access was blocked
system.cpu.icache.avg_blocked_cycles_no_targets 3445 # average number of cycles each access was blocked
-system.cpu.icache.avg_refs 3.059783 # Average number of references to valid blocks.
+system.cpu.icache.avg_refs 3.054348 # Average number of references to valid blocks.
system.cpu.icache.blocked_no_mshrs 0 # number of cycles access was blocked
system.cpu.icache.blocked_no_targets 4 # number of cycles access was blocked
system.cpu.icache.blocked_cycles_no_mshrs 0 # number of cycles access was blocked
system.cpu.icache.blocked_cycles_no_targets 13780 # number of cycles access was blocked
system.cpu.icache.cache_copies 0 # number of cache copies performed
-system.cpu.icache.demand_accesses 814 # number of demand (read+write) accesses
-system.cpu.icache.demand_avg_miss_latency 4971.589641 # average overall miss latency
-system.cpu.icache.demand_avg_mshr_miss_latency 4152.244565 # average overall mshr miss latency
-system.cpu.icache.demand_hits 563 # number of demand (read+write) hits
-system.cpu.icache.demand_miss_latency 1247869 # number of demand (read+write) miss cycles
-system.cpu.icache.demand_miss_rate 0.308354 # miss rate for demand accesses
+system.cpu.icache.demand_accesses 813 # number of demand (read+write) accesses
+system.cpu.icache.demand_avg_miss_latency 4955.450199 # average overall miss latency
+system.cpu.icache.demand_avg_mshr_miss_latency 4151.809783 # average overall mshr miss latency
+system.cpu.icache.demand_hits 562 # number of demand (read+write) hits
+system.cpu.icache.demand_miss_latency 1243818 # number of demand (read+write) miss cycles
+system.cpu.icache.demand_miss_rate 0.308733 # miss rate for demand accesses
system.cpu.icache.demand_misses 251 # number of demand (read+write) misses
system.cpu.icache.demand_mshr_hits 67 # number of demand (read+write) MSHR hits
-system.cpu.icache.demand_mshr_miss_latency 764013 # number of demand (read+write) MSHR miss cycles
-system.cpu.icache.demand_mshr_miss_rate 0.226044 # mshr miss rate for demand accesses
+system.cpu.icache.demand_mshr_miss_latency 763933 # number of demand (read+write) MSHR miss cycles
+system.cpu.icache.demand_mshr_miss_rate 0.226322 # mshr miss rate for demand accesses
system.cpu.icache.demand_mshr_misses 184 # number of demand (read+write) MSHR misses
system.cpu.icache.fast_writes 0 # number of fast writes performed
system.cpu.icache.mshr_cap_events 0 # number of times MSHR cap was activated
system.cpu.icache.no_allocate_misses 0 # Number of misses that were no-allocate
-system.cpu.icache.overall_accesses 814 # number of overall (read+write) accesses
-system.cpu.icache.overall_avg_miss_latency 4971.589641 # average overall miss latency
-system.cpu.icache.overall_avg_mshr_miss_latency 4152.244565 # average overall mshr miss latency
+system.cpu.icache.overall_accesses 813 # number of overall (read+write) accesses
+system.cpu.icache.overall_avg_miss_latency 4955.450199 # average overall miss latency
+system.cpu.icache.overall_avg_mshr_miss_latency 4151.809783 # average overall mshr miss latency
system.cpu.icache.overall_avg_mshr_uncacheable_latency <err: div-0> # average overall mshr uncacheable latency
-system.cpu.icache.overall_hits 563 # number of overall hits
-system.cpu.icache.overall_miss_latency 1247869 # number of overall miss cycles
-system.cpu.icache.overall_miss_rate 0.308354 # miss rate for overall accesses
+system.cpu.icache.overall_hits 562 # number of overall hits
+system.cpu.icache.overall_miss_latency 1243818 # number of overall miss cycles
+system.cpu.icache.overall_miss_rate 0.308733 # miss rate for overall accesses
system.cpu.icache.overall_misses 251 # number of overall misses
system.cpu.icache.overall_mshr_hits 67 # number of overall MSHR hits
-system.cpu.icache.overall_mshr_miss_latency 764013 # number of overall MSHR miss cycles
-system.cpu.icache.overall_mshr_miss_rate 0.226044 # mshr miss rate for overall accesses
+system.cpu.icache.overall_mshr_miss_latency 763933 # number of overall MSHR miss cycles
+system.cpu.icache.overall_mshr_miss_rate 0.226322 # mshr miss rate for overall accesses
system.cpu.icache.overall_mshr_misses 184 # number of overall MSHR misses
system.cpu.icache.overall_mshr_uncacheable_latency 0 # number of overall MSHR uncacheable cycles
system.cpu.icache.overall_mshr_uncacheable_misses 0 # number of overall MSHR uncacheable misses
@@ -218,59 +218,59 @@ system.cpu.icache.prefetcher.num_hwpf_squashed_from_miss 0
system.cpu.icache.replacements 0 # number of replacements
system.cpu.icache.sampled_refs 184 # Sample count of references to valid blocks.
system.cpu.icache.soft_prefetch_mshr_full 0 # number of mshr full events for SW prefetching instrutions
-system.cpu.icache.tagsinuse 91.596526 # Cycle average of tags in use
-system.cpu.icache.total_refs 563 # Total number of references to valid blocks.
+system.cpu.icache.tagsinuse 91.596649 # Cycle average of tags in use
+system.cpu.icache.total_refs 562 # Total number of references to valid blocks.
system.cpu.icache.warmup_cycle 0 # Cycle when the warmup percentage was hit.
system.cpu.icache.writebacks 0 # number of writebacks
-system.cpu.idleCycles 723576 # Total number of cycles that the CPU has spent unscheduled due to idling
-system.cpu.iew.EXEC:branches 571 # Number of branches executed
-system.cpu.iew.EXEC:nop 266 # number of nop insts executed
-system.cpu.iew.EXEC:rate 0.119043 # Inst execution rate
-system.cpu.iew.EXEC:refs 1018 # number of memory reference insts executed
-system.cpu.iew.EXEC:stores 343 # Number of stores executed
+system.cpu.idleCycles 723492 # Total number of cycles that the CPU has spent unscheduled due to idling
+system.cpu.iew.EXEC:branches 566 # Number of branches executed
+system.cpu.iew.EXEC:nop 267 # number of nop insts executed
+system.cpu.iew.EXEC:rate 0.118022 # Inst execution rate
+system.cpu.iew.EXEC:refs 1013 # number of memory reference insts executed
+system.cpu.iew.EXEC:stores 341 # Number of stores executed
system.cpu.iew.EXEC:swp 0 # number of swp insts executed
-system.cpu.iew.WB:consumers 1875 # num instructions consuming a value
-system.cpu.iew.WB:count 3246 # cumulative count of insts written-back
-system.cpu.iew.WB:fanout 0.785067 # average fanout of values written-back
+system.cpu.iew.WB:consumers 1860 # num instructions consuming a value
+system.cpu.iew.WB:count 3219 # cumulative count of insts written-back
+system.cpu.iew.WB:fanout 0.785484 # average fanout of values written-back
system.cpu.iew.WB:penalized 0 # number of instrctions required to write to 'other' IQ
system.cpu.iew.WB:penalized_rate 0 # fraction of instructions written-back that wrote to 'other' IQ
-system.cpu.iew.WB:producers 1472 # num instructions producing a value
-system.cpu.iew.WB:rate 0.114087 # insts written-back per cycle
-system.cpu.iew.WB:sent 3258 # cumulative count of insts sent to commit
-system.cpu.iew.branchMispredicts 160 # Number of branch mispredicts detected at execute
-system.cpu.iew.iewBlockCycles 14741 # Number of cycles IEW is blocking
-system.cpu.iew.iewDispLoadInsts 784 # Number of dispatched load instructions
+system.cpu.iew.WB:producers 1461 # num instructions producing a value
+system.cpu.iew.WB:rate 0.112801 # insts written-back per cycle
+system.cpu.iew.WB:sent 3234 # cumulative count of insts sent to commit
+system.cpu.iew.branchMispredicts 152 # Number of branch mispredicts detected at execute
+system.cpu.iew.iewBlockCycles 14742 # Number of cycles IEW is blocking
+system.cpu.iew.iewDispLoadInsts 783 # Number of dispatched load instructions
system.cpu.iew.iewDispNonSpecInsts 6 # Number of dispatched non-speculative instructions
-system.cpu.iew.iewDispSquashedInsts 71 # Number of squashed instructions skipped by dispatch
-system.cpu.iew.iewDispStoreInsts 376 # Number of dispatched store instructions
-system.cpu.iew.iewDispatchedInsts 4271 # Number of instructions dispatched to IQ
-system.cpu.iew.iewExecLoadInsts 675 # Number of load instructions executed
-system.cpu.iew.iewExecSquashedInsts 113 # Number of squashed instructions skipped in execute
-system.cpu.iew.iewExecutedInsts 3387 # Number of executed instructions
-system.cpu.iew.iewIQFullEvents 9 # Number of times the IQ has become full, causing a stall
+system.cpu.iew.iewDispSquashedInsts 79 # Number of squashed instructions skipped by dispatch
+system.cpu.iew.iewDispStoreInsts 381 # Number of dispatched store instructions
+system.cpu.iew.iewDispatchedInsts 4280 # Number of instructions dispatched to IQ
+system.cpu.iew.iewExecLoadInsts 672 # Number of load instructions executed
+system.cpu.iew.iewExecSquashedInsts 123 # Number of squashed instructions skipped in execute
+system.cpu.iew.iewExecutedInsts 3368 # Number of executed instructions
+system.cpu.iew.iewIQFullEvents 8 # Number of times the IQ has become full, causing a stall
system.cpu.iew.iewIdleCycles 0 # Number of cycles IEW is idle
system.cpu.iew.iewLSQFullEvents 0 # Number of times the LSQ has become full, causing a stall
-system.cpu.iew.iewSquashCycles 338 # Number of cycles IEW is squashing
-system.cpu.iew.iewUnblockCycles 13 # Number of cycles IEW is unblocking
+system.cpu.iew.iewSquashCycles 336 # Number of cycles IEW is squashing
+system.cpu.iew.iewUnblockCycles 12 # Number of cycles IEW is unblocking
system.cpu.iew.lsq.thread.0.blockedLoads 0 # Number of blocked loads due to partial load-store forwarding
system.cpu.iew.lsq.thread.0.cacheBlocked 82 # Number of times an access to memory failed due to the cache being blocked
system.cpu.iew.lsq.thread.0.forwLoads 29 # Number of loads that had data forwarded from stores
system.cpu.iew.lsq.thread.0.ignoredResponses 0 # Number of memory responses ignored because the instruction is squashed
system.cpu.iew.lsq.thread.0.invAddrLoads 0 # Number of loads ignored due to an invalid address
system.cpu.iew.lsq.thread.0.invAddrSwpfs 0 # Number of software prefetches ignored due to an invalid address
-system.cpu.iew.lsq.thread.0.memOrderViolation 11 # Number of memory ordering violations
+system.cpu.iew.lsq.thread.0.memOrderViolation 12 # Number of memory ordering violations
system.cpu.iew.lsq.thread.0.rescheduledLoads 0 # Number of loads that were rescheduled
-system.cpu.iew.lsq.thread.0.squashedLoads 369 # Number of loads squashed
-system.cpu.iew.lsq.thread.0.squashedStores 82 # Number of stores squashed
-system.cpu.iew.memOrderViolationEvents 11 # Number of memory order violations
-system.cpu.iew.predictedNotTakenIncorrect 99 # Number of branches that were predicted not taken incorrectly
-system.cpu.iew.predictedTakenIncorrect 61 # Number of branches that were predicted taken incorrectly
+system.cpu.iew.lsq.thread.0.squashedLoads 368 # Number of loads squashed
+system.cpu.iew.lsq.thread.0.squashedStores 87 # Number of stores squashed
+system.cpu.iew.memOrderViolationEvents 12 # Number of memory order violations
+system.cpu.iew.predictedNotTakenIncorrect 96 # Number of branches that were predicted not taken incorrectly
+system.cpu.iew.predictedTakenIncorrect 56 # Number of branches that were predicted taken incorrectly
system.cpu.ipc 0.003174 # IPC: Instructions Per Cycle
system.cpu.ipc_total 0.003174 # IPC: Total IPC of All Threads
-system.cpu.iq.ISSUE:FU_type_0 3500 # Type of FU issued
+system.cpu.iq.ISSUE:FU_type_0 3491 # Type of FU issued
system.cpu.iq.ISSUE:FU_type_0.start_dist
(null) 0 0.00% # Type of FU issued
- IntAlu 2460 70.29% # Type of FU issued
+ IntAlu 2447 70.09% # Type of FU issued
IntMult 1 0.03% # Type of FU issued
IntDiv 0 0.00% # Type of FU issued
FloatAdd 0 0.00% # Type of FU issued
@@ -279,16 +279,16 @@ system.cpu.iq.ISSUE:FU_type_0.start_dist
FloatMult 0 0.00% # Type of FU issued
FloatDiv 0 0.00% # Type of FU issued
FloatSqrt 0 0.00% # Type of FU issued
- MemRead 695 19.86% # Type of FU issued
- MemWrite 344 9.83% # Type of FU issued
+ MemRead 694 19.88% # Type of FU issued
+ MemWrite 349 10.00% # Type of FU issued
IprAccess 0 0.00% # Type of FU issued
InstPrefetch 0 0.00% # Type of FU issued
system.cpu.iq.ISSUE:FU_type_0.end_dist
-system.cpu.iq.ISSUE:fu_busy_cnt 35 # FU busy when requested
-system.cpu.iq.ISSUE:fu_busy_rate 0.010000 # FU busy rate (busy events/executed inst)
+system.cpu.iq.ISSUE:fu_busy_cnt 34 # FU busy when requested
+system.cpu.iq.ISSUE:fu_busy_rate 0.009739 # FU busy rate (busy events/executed inst)
system.cpu.iq.ISSUE:fu_full.start_dist
(null) 0 0.00% # attempts to use FU when none available
- IntAlu 2 5.71% # attempts to use FU when none available
+ IntAlu 1 2.94% # attempts to use FU when none available
IntMult 0 0.00% # attempts to use FU when none available
IntDiv 0 0.00% # attempts to use FU when none available
FloatAdd 0 0.00% # attempts to use FU when none available
@@ -297,41 +297,41 @@ system.cpu.iq.ISSUE:fu_full.start_dist
FloatMult 0 0.00% # attempts to use FU when none available
FloatDiv 0 0.00% # attempts to use FU when none available
FloatSqrt 0 0.00% # attempts to use FU when none available
- MemRead 11 31.43% # attempts to use FU when none available
- MemWrite 22 62.86% # attempts to use FU when none available
+ MemRead 11 32.35% # attempts to use FU when none available
+ MemWrite 22 64.71% # attempts to use FU when none available
IprAccess 0 0.00% # attempts to use FU when none available
InstPrefetch 0 0.00% # attempts to use FU when none available
system.cpu.iq.ISSUE:fu_full.end_dist
system.cpu.iq.ISSUE:issued_per_cycle.start_dist # Number of insts issued each cycle
-system.cpu.iq.ISSUE:issued_per_cycle.samples 28452
+system.cpu.iq.ISSUE:issued_per_cycle.samples 28537
system.cpu.iq.ISSUE:issued_per_cycle.min_value 0
- 0 26938 9467.88%
- 1 609 214.04%
- 2 344 120.91%
- 3 248 87.16%
- 4 180 63.26%
- 5 81 28.47%
- 6 35 12.30%
- 7 12 4.22%
- 8 5 1.76%
+ 0 27012 9465.61%
+ 1 616 215.86%
+ 2 356 124.75%
+ 3 247 86.55%
+ 4 177 62.02%
+ 5 81 28.38%
+ 6 32 11.21%
+ 7 11 3.85%
+ 8 5 1.75%
system.cpu.iq.ISSUE:issued_per_cycle.max_value 8
system.cpu.iq.ISSUE:issued_per_cycle.end_dist
-system.cpu.iq.ISSUE:rate 0.123014 # Inst issue rate
-system.cpu.iq.iqInstsAdded 3999 # Number of instructions added to the IQ (excludes non-spec)
-system.cpu.iq.iqInstsIssued 3500 # Number of instructions issued
+system.cpu.iq.ISSUE:rate 0.122332 # Inst issue rate
+system.cpu.iq.iqInstsAdded 4007 # Number of instructions added to the IQ (excludes non-spec)
+system.cpu.iq.iqInstsIssued 3491 # Number of instructions issued
system.cpu.iq.iqNonSpecInstsAdded 6 # Number of non-speculative instructions added to the IQ
-system.cpu.iq.iqSquashedInstsExamined 1423 # Number of squashed instructions iterated over during squash; mainly for profiling
+system.cpu.iq.iqSquashedInstsExamined 1470 # Number of squashed instructions iterated over during squash; mainly for profiling
system.cpu.iq.iqSquashedInstsIssued 25 # Number of squashed instructions issued
system.cpu.iq.iqSquashedNonSpecRemoved 2 # Number of squashed non-spec instructions that were removed
-system.cpu.iq.iqSquashedOperandsExamined 761 # Number of squashed operands that are examined and possibly removed from graph
+system.cpu.iq.iqSquashedOperandsExamined 801 # Number of squashed operands that are examined and possibly removed from graph
system.cpu.l2cache.ReadReq_accesses 269 # number of ReadReq accesses(hits+misses)
-system.cpu.l2cache.ReadReq_avg_miss_latency 4622.063197 # average ReadReq miss latency
-system.cpu.l2cache.ReadReq_avg_mshr_miss_latency 2296.591078 # average ReadReq mshr miss latency
-system.cpu.l2cache.ReadReq_miss_latency 1243335 # number of ReadReq miss cycles
+system.cpu.l2cache.ReadReq_avg_miss_latency 4621.724907 # average ReadReq miss latency
+system.cpu.l2cache.ReadReq_avg_mshr_miss_latency 2296.401487 # average ReadReq mshr miss latency
+system.cpu.l2cache.ReadReq_miss_latency 1243244 # number of ReadReq miss cycles
system.cpu.l2cache.ReadReq_miss_rate 1 # miss rate for ReadReq accesses
system.cpu.l2cache.ReadReq_misses 269 # number of ReadReq misses
-system.cpu.l2cache.ReadReq_mshr_miss_latency 617783 # number of ReadReq MSHR miss cycles
+system.cpu.l2cache.ReadReq_mshr_miss_latency 617732 # number of ReadReq MSHR miss cycles
system.cpu.l2cache.ReadReq_mshr_miss_rate 1 # mshr miss rate for ReadReq accesses
system.cpu.l2cache.ReadReq_mshr_misses 269 # number of ReadReq MSHR misses
system.cpu.l2cache.avg_blocked_cycles_no_mshrs <err: div-0> # average number of cycles each access was blocked
@@ -343,29 +343,29 @@ system.cpu.l2cache.blocked_cycles_no_mshrs 0 #
system.cpu.l2cache.blocked_cycles_no_targets 0 # number of cycles access was blocked
system.cpu.l2cache.cache_copies 0 # number of cache copies performed
system.cpu.l2cache.demand_accesses 269 # number of demand (read+write) accesses
-system.cpu.l2cache.demand_avg_miss_latency 4622.063197 # average overall miss latency
-system.cpu.l2cache.demand_avg_mshr_miss_latency 2296.591078 # average overall mshr miss latency
+system.cpu.l2cache.demand_avg_miss_latency 4621.724907 # average overall miss latency
+system.cpu.l2cache.demand_avg_mshr_miss_latency 2296.401487 # average overall mshr miss latency
system.cpu.l2cache.demand_hits 0 # number of demand (read+write) hits
-system.cpu.l2cache.demand_miss_latency 1243335 # number of demand (read+write) miss cycles
+system.cpu.l2cache.demand_miss_latency 1243244 # number of demand (read+write) miss cycles
system.cpu.l2cache.demand_miss_rate 1 # miss rate for demand accesses
system.cpu.l2cache.demand_misses 269 # number of demand (read+write) misses
system.cpu.l2cache.demand_mshr_hits 0 # number of demand (read+write) MSHR hits
-system.cpu.l2cache.demand_mshr_miss_latency 617783 # number of demand (read+write) MSHR miss cycles
+system.cpu.l2cache.demand_mshr_miss_latency 617732 # number of demand (read+write) MSHR miss cycles
system.cpu.l2cache.demand_mshr_miss_rate 1 # mshr miss rate for demand accesses
system.cpu.l2cache.demand_mshr_misses 269 # number of demand (read+write) MSHR misses
system.cpu.l2cache.fast_writes 0 # number of fast writes performed
system.cpu.l2cache.mshr_cap_events 0 # number of times MSHR cap was activated
system.cpu.l2cache.no_allocate_misses 0 # Number of misses that were no-allocate
system.cpu.l2cache.overall_accesses 269 # number of overall (read+write) accesses
-system.cpu.l2cache.overall_avg_miss_latency 4622.063197 # average overall miss latency
-system.cpu.l2cache.overall_avg_mshr_miss_latency 2296.591078 # average overall mshr miss latency
+system.cpu.l2cache.overall_avg_miss_latency 4621.724907 # average overall miss latency
+system.cpu.l2cache.overall_avg_mshr_miss_latency 2296.401487 # average overall mshr miss latency
system.cpu.l2cache.overall_avg_mshr_uncacheable_latency <err: div-0> # average overall mshr uncacheable latency
system.cpu.l2cache.overall_hits 0 # number of overall hits
-system.cpu.l2cache.overall_miss_latency 1243335 # number of overall miss cycles
+system.cpu.l2cache.overall_miss_latency 1243244 # number of overall miss cycles
system.cpu.l2cache.overall_miss_rate 1 # miss rate for overall accesses
system.cpu.l2cache.overall_misses 269 # number of overall misses
system.cpu.l2cache.overall_mshr_hits 0 # number of overall MSHR hits
-system.cpu.l2cache.overall_mshr_miss_latency 617783 # number of overall MSHR miss cycles
+system.cpu.l2cache.overall_mshr_miss_latency 617732 # number of overall MSHR miss cycles
system.cpu.l2cache.overall_mshr_miss_rate 1 # mshr miss rate for overall accesses
system.cpu.l2cache.overall_mshr_misses 269 # number of overall MSHR misses
system.cpu.l2cache.overall_mshr_uncacheable_latency 0 # number of overall MSHR uncacheable cycles
@@ -382,25 +382,25 @@ system.cpu.l2cache.prefetcher.num_hwpf_squashed_from_miss 0
system.cpu.l2cache.replacements 0 # number of replacements
system.cpu.l2cache.sampled_refs 269 # Sample count of references to valid blocks.
system.cpu.l2cache.soft_prefetch_mshr_full 0 # number of mshr full events for SW prefetching instrutions
-system.cpu.l2cache.tagsinuse 138.802720 # Cycle average of tags in use
+system.cpu.l2cache.tagsinuse 138.802893 # Cycle average of tags in use
system.cpu.l2cache.total_refs 0 # Total number of references to valid blocks.
system.cpu.l2cache.warmup_cycle 0 # Cycle when the warmup percentage was hit.
system.cpu.l2cache.writebacks 0 # number of writebacks
-system.cpu.numCycles 28452 # number of cpu cycles simulated
-system.cpu.rename.RENAME:BlockCycles 14785 # Number of cycles rename is blocking
+system.cpu.numCycles 28537 # number of cpu cycles simulated
+system.cpu.rename.RENAME:BlockCycles 14783 # Number of cycles rename is blocking
system.cpu.rename.RENAME:CommittedMaps 1768 # Number of HB maps that are committed
system.cpu.rename.RENAME:IQFullEvents 18 # Number of times rename has blocked due to IQ full
-system.cpu.rename.RENAME:IdleCycles 5396 # Number of cycles rename is idle
+system.cpu.rename.RENAME:IdleCycles 5489 # Number of cycles rename is idle
system.cpu.rename.RENAME:LSQFullEvents 1 # Number of times rename has blocked due to LSQ full
system.cpu.rename.RENAME:ROBFullEvents 2 # Number of times rename has blocked due to ROB full
-system.cpu.rename.RENAME:RenameLookups 5263 # Number of register rename lookups that rename has made
-system.cpu.rename.RENAME:RenamedInsts 4690 # Number of instructions processed by rename
-system.cpu.rename.RENAME:RenamedOperands 3393 # Number of destination operands rename has renamed
-system.cpu.rename.RENAME:RunCycles 851 # Number of cycles rename is running
-system.cpu.rename.RENAME:SquashCycles 338 # Number of cycles rename is squashing
+system.cpu.rename.RENAME:RenameLookups 5285 # Number of register rename lookups that rename has made
+system.cpu.rename.RENAME:RenamedInsts 4708 # Number of instructions processed by rename
+system.cpu.rename.RENAME:RenamedOperands 3399 # Number of destination operands rename has renamed
+system.cpu.rename.RENAME:RunCycles 852 # Number of cycles rename is running
+system.cpu.rename.RENAME:SquashCycles 336 # Number of cycles rename is squashing
system.cpu.rename.RENAME:UnblockCycles 25 # Number of cycles rename is unblocking
-system.cpu.rename.RENAME:UndoneMaps 1625 # Number of HB maps that are undone due to squashing
-system.cpu.rename.RENAME:serializeStallCycles 7057 # count of cycles rename stalled for serializing inst
+system.cpu.rename.RENAME:UndoneMaps 1631 # Number of HB maps that are undone due to squashing
+system.cpu.rename.RENAME:serializeStallCycles 7052 # count of cycles rename stalled for serializing inst
system.cpu.rename.RENAME:serializingInsts 8 # count of serializing insts renamed
system.cpu.rename.RENAME:skidInsts 88 # count of insts added to the skid buffer
system.cpu.rename.RENAME:tempSerializingInsts 6 # count of temporary serializing insts renamed
diff --git a/tests/run.py b/tests/run.py
index a405b7f69..df34faca8 100644
--- a/tests/run.py
+++ b/tests/run.py
@@ -42,6 +42,13 @@ def binpath(app, file=None):
file = app
return os.path.join(test_progs, app, 'bin', isa, opsys, file)
+# generate path to input file
+def inputpath(app, file=None):
+ # input file has same name as app unless specified otherwise
+ if not file:
+ file = app
+ return os.path.join(test_progs, app, 'input', file)
+
# build configuration
execfile(os.path.join(tests_root, 'configs', config + '.py'))
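(For context, a minimal usage sketch of the inputpath() helper added above, assuming the test_progs root that tests/run.py defines elsewhere; the directory and application names below are illustrative stand-ins, not values taken from the tree.)

import os

# Hypothetical stand-in for the test_progs global defined earlier in tests/run.py.
test_progs = '/dist/m5/regression/test-progs'

def inputpath(app, file=None):
    # The input file has the same name as the app unless specified otherwise.
    if not file:
        file = app
    return os.path.join(test_progs, app, 'input', file)

# Resolves to '/dist/m5/regression/test-progs/bzip2/input/bzip2'
print inputpath('bzip2')
# With an explicit file name: '/dist/m5/regression/test-progs/bzip2/input/input.source'
print inputpath('bzip2', 'input.source')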