29 files changed, 2197 insertions, 310 deletions
diff --git a/build_opts/SPARC_SE b/build_opts/SPARC_SE
index 62b6841ad..b288d3908 100644
--- a/build_opts/SPARC_SE
+++ b/build_opts/SPARC_SE
@@ -1,3 +1,3 @@
 TARGET_ISA = 'sparc'
-CPU_MODELS = 'AtomicSimpleCPU,TimingSimpleCPU'
+CPU_MODELS = 'AtomicSimpleCPU,TimingSimpleCPU,O3CPU'
 FULL_SYSTEM = 0
diff --git a/src/arch/sparc/isa/formats/mem/swap.isa b/src/arch/sparc/isa/formats/mem/swap.isa
index 818597a84..b71542a2b 100644
--- a/src/arch/sparc/isa/formats/mem/swap.isa
+++ b/src/arch/sparc/isa/formats/mem/swap.isa
@@ -137,7 +137,7 @@ def format Swap(code, postacc_code, mem_flags, *opt_flags) {{
      decoder_output,
      exec_output,
      decode_block) = doMemFormat(code, SwapFuncs, '', name, Name, flags,
-         opt_flags, postacc_code)
+         ["IsStoreConditional"], postacc_code)
 }};
 
 def format SwapAlt(code, postacc_code, asi, mem_flags, *opt_flags) {{
@@ -148,7 +148,7 @@ def format SwapAlt(code, postacc_code, asi, mem_flags, *opt_flags) {{
      decoder_output,
      exec_output,
      decode_block) = doMemFormat(code, SwapFuncs, AlternateASIPrivFaultCheck,
-         name, Name, flags, opt_flags, postacc_code)
+         name, Name, flags, ["IsStoreConditional"], postacc_code)
 }};
 
 
@@ -163,8 +163,8 @@ let {{
         decode_block = BasicDecode.subst(iop)
         microParams = {"code": code, "postacc_code" : postacc_code,
             "ea_code" : addrCalcReg, "fault_check" : faultCode}
-        exec_output = doSplitExecute(execute, name, Name, asi, opt_flags,
-                microParams);
+        exec_output = doSplitExecute(execute, name, Name, asi,
+                ["IsStoreConditional"], microParams);
         return (header_output, decoder_output, exec_output, decode_block)
 }};
 
@@ -177,7 +177,7 @@ def format CasAlt(code, postacc_code, asi, mem_flags, *opt_flags) {{
      decoder_output,
      exec_output,
      decode_block) = doCasFormat(code, SwapFuncs, AlternateASIPrivFaultCheck,
-         name, Name, flags, opt_flags, postacc_code)
+         name, Name, flags, ["IsStoreConditional"], postacc_code)
 }};
 
 
diff --git a/src/arch/x86/isa/base.isa b/src/arch/x86/isa/base.isa
index 4776f7a7e..cd166b306 100644
--- a/src/arch/x86/isa/base.isa
+++ b/src/arch/x86/isa/base.isa
@@ -79,6 +79,13 @@ output header {{
             void printReg(std::ostream &os, int reg) const;
             void printSrcReg(std::ostream &os, int reg) const;
             void printDestReg(std::ostream &os, int reg) const;
+
+            inline uint64_t merge(uint64_t into, uint64_t val, int size) const
+            {
+                //FIXME This needs to be significantly more sophisticated
+                return val;
+            }
+
         };
 }};
 
diff --git a/src/arch/x86/isa/formats/formats.isa b/src/arch/x86/isa/formats/formats.isa
index f4e5c402f..d763c05bc 100644
--- a/src/arch/x86/isa/formats/formats.isa
+++ b/src/arch/x86/isa/formats/formats.isa
@@ -95,9 +95,6 @@
 //malfunction of the decode mechanism.
 ##include "error.isa"
 
-//Include code to build up macro op instructions
-##include "macroop.isa"
-
 //Include a format which implements a batch of instructions which do the same
 //thing on a variety of inputs
 ##include "multi.isa"
diff --git a/src/arch/x86/isa/formats/macroop.isa b/src/arch/x86/isa/macroop.isa
index 717103df1..7d41a2dea 100644
--- a/src/arch/x86/isa/formats/macroop.isa
+++ b/src/arch/x86/isa/macroop.isa
@@ -55,16 +55,20 @@
 //
 // Authors: Gabe Black
 
-////////////////////////////////////////////////////////////////////
-//
-// Instructions that do the same thing to multiple sets of arguments.
-//
+// Execute method for macroops.
+def template MacroExecPanic {{
+        Fault execute(%(CPU_exec_context)s *, Trace::InstRecord *) const
+        {
+            panic("Tried to execute macroop directly!");
+            M5_DUMMY_RETURN
+        }
+}};
 
 output header {{
 
         // Base class for most macroops, except ones that need to commit as
         // they go.
-        class X86MacroInst : public X86StaticInst
+        class X86MacroInst : public StaticInst
         {
           protected:
             const uint32_t numMicroOps;
@@ -72,7 +76,7 @@ output header {{
             //Constructor.
             X86MacroInst(const char *mnem, ExtMachInst _machInst,
                     uint32_t _numMicroOps)
-                        : X86StaticInst(mnem, _machInst, No_OpClass),
+                        : StaticInst(mnem, _machInst, No_OpClass),
                         numMicroOps(_numMicroOps)
             {
                 assert(numMicroOps);
@@ -85,9 +89,6 @@ output header {{
                 delete [] microOps;
             }
 
-            std::string generateDisassembly(Addr pc,
-                const SymbolTable *symtab) const;
-
             StaticInstPtr * microOps;
 
             StaticInstPtr fetchMicroOp(MicroPC microPC)
@@ -96,21 +97,7 @@ output header {{
                 return microOps[microPC];
             }
 
-            %(BasicExecPanic)s
-        };
-
-        // Base class for macroops which commit as they go. This is for
-        // instructions which can be partially completed like those with the
-        // rep prefix. This prevents those instructions from overflowing
-        // buffers with uncommitted microops.
-        class X86RollingMacroInst : public X86MacroInst
-        {
-          protected:
-            //Constructor.
-            X86RollingMacroInst(const char *mnem, ExtMachInst _machInst,
-                    uint32_t _numMicroOps)
-                        : X86MacroInst(mnem, _machInst, numMicroOps)
-            {}
+            %(MacroExecPanic)s
         };
 }};
 
@@ -121,34 +108,24 @@ def template MacroConstructor {{
         {
                 %(constructor)s;
                 //alloc_micro_ops is the code that sets up the microOps
-                //array in the parent class. This hook will hopefully
-                //allow all that to be automated.
+                //array in the parent class.
                 %(alloc_micro_ops)s;
-                setMicroFlags();
         }
 }};
 
 let {{
-    def genMacroOp(name, Name, ops, rolling = False):
+    def genMacroOp(name, Name, opSeq):
         baseClass = 'X86MacroInst'
-        if rolling:
-            baseClass = 'X86RollingMacroInst'
-        numMicroOps = len(ops)
+        numMicroOps = len(opSeq.ops)
         allocMicroOps = ''
         micropc = 0
-        allocMicroOps += \
-            "microOps[0] = %s;\n" % \
-            op.getAllocator(True, not rolling, True, False)
-        micropc += 1
-        if numMicroOps > 2:
-            for op in ops[1:-1]:
-                allocMicroOps += \
-                    "microOps[%d] = %s;\n" % \
-                    (micropc, op.getAllocator(True, not rolling, False, False))
-                micropc += 1
-        allocMicroOps += \
-            "microOps[%d] = %s;\n" % \
-            op.getAllocator(True, not rolling, False, True)
+        for op in opSeq.ops:
+            allocMicroOps += \
+                "microOps[%d] = %s;\n" % \
+                (micropc, op.getAllocator(True, op.delayed,
+                                          micropc == 0,
+                                          micropc == numMicroOps - 1))
+            micropc += 1
         iop = InstObjParams(name, Name, baseClass,
                 {'code' : '', 'num_micro_ops' : numMicroOps,
                 'alloc_micro_ops' : allocMicroOps})
diff --git a/src/arch/x86/isa/main.isa b/src/arch/x86/isa/main.isa
index cc3a9bee4..063d7125d 100644
--- a/src/arch/x86/isa/main.isa
+++ b/src/arch/x86/isa/main.isa
@@ -72,26 +72,55 @@
 
 namespace X86ISA;
 
-//Include the simple microcode assembler
-##include "microasm.isa"
+////////////////////////////////////////////////////////////////////
+//
+// General infrastructure code. These files provide infrastructure
+// which was developed to support x86 but isn't specific to it.
+//
 
-//Include the bitfield definitions
-##include "bitfields.isa"
+//Include code to build macroops.
+##include "macroop.isa"
 
-//Include the operand_types and operand definitions
-##include "operands.isa"
+//Include the simple microcode assembler. This will hopefully stay
+//unspecialized for x86 and can later be made available to other ISAs.
+##include "microasm.isa"
+
+////////////////////////////////////////////////////////////////////
+//
+// X86 only infrastructure code.
+//
 
-//Include the base class for x86 instructions, and some support code
+//Include the base class for x86 instructions, and some support code.
 ##include "base.isa"
 
-//Include the instruction definitions
-##include "insts/insts.isa"
+//Include code to specialize an instruction template to operate on
+//a particular set of operands. This is specific to x86 and the x86
+//microcode ISA.
+##include "specialize.isa"
+
+////////////////////////////////////////////////////////////////////
+//
+// Code which directly specifies isa components like instructions
+// microops, and the decoder.
+//
 
 //Include the definitions for the instruction formats
 ##include "formats/formats.isa"
 
-//Include the definitions of the micro ops
+//Include the operand_types and operand definitions. These are needed by
+//the microop definitions.
+##include "operands.isa"
+
+//Include the definitions of the micro ops.
+//These are StaticInst classes which stand on their own and make up an
+//internal instruction set.
 ##include "microops/microops.isa"
 
+//Include the instruction definitions which are microop assembler programs.
+##include "insts/insts.isa"
+
+//Include the bitfield definitions
+##include "bitfields.isa"
+
 //Include the decoder definition
 ##include "decoder/decoder.isa"
diff --git a/src/arch/x86/isa/microasm.isa b/src/arch/x86/isa/microasm.isa
index b94b55aab..23567aae9 100644
--- a/src/arch/x86/isa/microasm.isa
+++ b/src/arch/x86/isa/microasm.isa
@@ -57,152 +57,17 @@
 
 ////////////////////////////////////////////////////////////////////
 //
-//  Code to "specialize" a microcode sequence to use a particular
-//  variety of operands
+//  The microcode assembler
 //
 
 let {{
-    # This builds either a regular or macro op to implement the sequence of
-    # ops we give it.
-    def genInst(name, Name, ops):
-        # If we can implement this instruction with exactly one microop, just
-        # use that directly.
-        newStmnt = ''
-        if len(ops) == 1:
-            decode_block = "return (X86StaticInst *)(%s);" % \
-                            ops[0].getAllocator()
-            return ('', '', decode_block, '')
-        else:
-            # Build a macroop to contain the sequence of microops we've
-            # been given.
-            return genMacroOp(name, Name, ops)
-}};
-
-let {{
-    # This code builds up a decode block which decodes based on switchval.
-    # vals is a dict which matches case values with what should be decoded to.
-    # builder is called on the exploded contents of "vals" values to generate
-    # whatever code should be used.
-    def doSplitDecode(name, Name, builder, switchVal, vals, default = None):
-        header_output = ''
-        decoder_output = ''
-        decode_block = 'switch(%s) {\n' % switchVal
-        exec_output = ''
-        for (val, todo) in vals.items():
-            (new_header_output,
-             new_decoder_output,
-             new_decode_block,
-             new_exec_output) = builder(name, Name, *todo)
-            header_output += new_header_output
-            decoder_output += new_decoder_output
-            decode_block += '\tcase %s: %s\n' % (val, new_decode_block)
-            exec_output += new_exec_output
-        if default:
-            (new_header_output,
-             new_decoder_output,
-             new_decode_block,
-             new_exec_output) = builder(name, Name, *default)
-            header_output += new_header_output
-            decoder_output += new_decoder_output
-            decode_block += '\tdefault: %s\n' % new_decode_block
-            exec_output += new_exec_output
-        decode_block += '}\n'
-        return (header_output, decoder_output, decode_block, exec_output)
-}};
-
-let {{
-    class OpType(object):
-        parser = re.compile(r"(?P<tag>[A-Z][A-Z]*)(?P<size>[a-z][a-z]*)|(r(?P<reg>[A-Za-z0-9][A-Za-z0-9]*))")
-        def __init__(self, opTypeString):
-            match = OpType.parser.search(opTypeString)
-            if match == None:
-                raise Exception, "Problem parsing operand type %s" % opTypeString
-            self.reg = match.group("reg")
-            self.tag = match.group("tag")
-            self.size = match.group("size")
+    # These are used when setting up microops so that they can specialize their
+    # base class template properly.
+    RegOpType = "RegisterOperand"
+    ImmOpType = "ImmediateOperand"
 }};
 
 let {{
-
-    # This function specializes the given piece of code to use a particular
-    # set of argument types described by "opTypes". These are "implemented"
-    # in reverse order.
-    def specializeInst(name, Name, code, opTypes):
-        opNum = len(opTypes) - 1
-        while len(opTypes):
-            # print "Building a composite op with tags", opTypes
-            # print "And code", code
-            opNum = len(opTypes) - 1
-            # A regular expression to find the operand placeholders we're
-            # interested in.
-            opRe = re.compile("\\^(?P<operandNum>%d)(?=[^0-9]|$)" % opNum)
-
-            # Parse the operand type strign we're working with
-            opType = OpType(opTypes[opNum])
-
-            if opType.reg:
-                #Figure out what to do with fixed register operands
-                if opType.reg in ("Ax", "Bx", "Cx", "Dx"):
-                    code = opRe.sub("%%{INTREG_R%s}" % opType.reg.upper(), code)
-                elif opType.reg == "Al":
-                    # We need a way to specify register width
-                    code = opRe.sub("%{INTREG_RAX}", code)
-                else:
-                    print "Didn't know how to encode fixed register %s!" % opType.reg
-            elif opType.tag == None or opType.size == None:
-                raise Exception, "Problem parsing operand tag: %s" % opType.tag
-            elif opType.tag in ("C", "D", "G", "P", "S", "T", "V"):
-                # Use the "reg" field of the ModRM byte to select the register
-                code = opRe.sub("%{(uint8_t)MODRM_REG}", code)
-            elif opType.tag in ("E", "Q", "W"):
-                # This might refer to memory or to a register. We need to
-                # divide it up farther.
-                regCode = opRe.sub("%{(uint8_t)MODRM_RM}", code)
-                regTypes = copy.copy(opTypes)
-                regTypes.pop(-1)
-                # This needs to refer to memory, but we'll fill in the details
-                # later. It needs to take into account unaligned memory
-                # addresses.
-                memCode = opRe.sub("%0", code)
-                memTypes = copy.copy(opTypes)
-                memTypes.pop(-1)
-                return doSplitDecode(name, Name, specializeInst, "MODRM_MOD",
-                    {"3" : (regCode, regTypes)}, (memCode, memTypes))
-            elif opType.tag in ("I", "J"):
-                # Immediates are already in the instruction, so don't leave in
-                # those parameters
-                code = opRe.sub("${IMMEDIATE}", code)
-            elif opType.tag == "M":
-                # This needs to refer to memory, but we'll fill in the details
-                # later. It needs to take into account unaligned memory
-                # addresses.
-                code = opRe.sub("%0", code)
-            elif opType.tag in ("PR", "R", "VR"):
-                # There should probably be a check here to verify that mod
-                # is equal to 11b
-                code = opRe.sub("%{(uint8_t)MODRM_RM}", code)
-            else:
-                raise Exception, "Unrecognized tag %s." % opType.tag
-            opTypes.pop(-1)
-
-        # At this point, we've built up "code" to have all the necessary extra
-        # instructions needed to implement whatever types of operands were
-        # specified. Now we'll assemble it it into a microOp sequence.
-        ops = assembleMicro(code)
-
-        # Build a macroop to contain the sequence of microops we've
-        # constructed. The decode block will be used to fill in our
-        # inner decode structure, and the rest will be concatenated and
-        # passed back.
-        return genInst(name, Name, ops)
-}};
-
-////////////////////////////////////////////////////////////////////
-//
-//  The microcode assembler
-//
-
-let {{
     class MicroOpStatement(object):
         def __init__(self):
             self.className = ''
@@ -242,19 +107,9 @@ let {{
             return 'new %s%s(machInst%s%s)' % (self.className, signature, self.microFlagsText(microFlags), args)
 }};
 
-let {{
-    def buildLabelDict(ops):
-        labels = {}
-        micropc = 0
-        for op in ops:
-            if op.label:
-                labels[op.label] = count
-            micropc += 1
-        return labels
-}};
-
 let{{
-    def assembleMicro(code):
+    def assembleMicro(name, Name, code):
+
         # This function takes in a block of microcode assembly and returns
         # a python list of objects which describe it.
 
@@ -341,7 +196,13 @@ let{{
             lineMatch = lineRe.search(code)
 
         # Decode the labels into displacements
-        labels = buildLabelDict(statements)
+        # Map each label to the micropc of the statement that carries it.
+        labels = {}
+        micropc = 0
+        for statement in statements:
+            if statement.label:
+                labels[statement.label] = micropc
+            micropc += 1
         micropc = 0
         for statement in statements:
             for arg in statement.args:
@@ -353,5 +214,15 @@ let{{
                     # micropc + 1 + displacement.
                     arg["operandImm"] = labels[arg["operandLabel"]] - micropc - 1
             micropc += 1
-        return statements
+
+        # If we can implement this instruction with exactly one microop, just
+        # use that directly.
+        if len(statements) == 1:
+            decode_block = "return %s;" % \
+                            statements[0].getAllocator()
+            return ('', '', decode_block, '')
+        else:
+            # Build a macroop to contain the sequence of microops we've
+            # been given.
+            return genMacroOp(name, Name, statements)
 }};
diff --git a/src/arch/x86/isa/microops/base.isa b/src/arch/x86/isa/microops/base.isa
index b1351d999..4254994f3 100644
--- a/src/arch/x86/isa/microops/base.isa
+++ b/src/arch/x86/isa/microops/base.isa
@@ -63,12 +63,15 @@ output header {{
     };
 }};
 
-//A class which is the base of all x86 micro ops it provides a function to
+//A class which is the base of all x86 micro ops. It provides a function to
 //set necessary flags appropriately.
 output header {{
     class X86MicroOpBase : public X86StaticInst
     {
       protected:
+        uint8_t opSize;
+        uint8_t addrSize;
+
         X86MicroOpBase(bool isMicro, bool isDelayed,
                 bool isFirst, bool isLast,
                 const char *mnem, ExtMachInst _machInst,
@@ -94,6 +97,7 @@ def template BaseMicroOpTemplateDeclare {{
 
 let {{
     def buildBaseMicroOpTemplate(Name, numParams):
+        assert(numParams > 0)
         signature = "<"
         signature += "int SignatureOperandTypeSpecifier0"
         for count in xrange(1,numParams):
@@ -102,10 +106,9 @@ let {{
         signature += ">"
         subs = {"signature" : signature, "class_name" : Name}
         return BaseMicroOpTemplateDeclare.subst(subs)
+}};
 
-    RegOpType = "RegisterOperand"
-    ImmOpType = "ImmediateOperand"
-
+let {{
     def buildMicroOpTemplateDict(*params):
         signature = "<"
         if len(params):
diff --git a/src/arch/x86/isa/specialize.isa b/src/arch/x86/isa/specialize.isa
new file mode 100644
index 000000000..9cac09770
--- /dev/null
+++ b/src/arch/x86/isa/specialize.isa
@@ -0,0 +1,172 @@
+// -*- mode:c++ -*-
+
+// Copyright (c) 2007 The Hewlett-Packard Development Company
+// All rights reserved.
+//
+// Redistribution and use of this software in source and binary forms,
+// with or without modification, are permitted provided that the
+// following conditions are met:
+//
+// The software must be used only for Non-Commercial Use which means any
+// use which is NOT directed to receiving any direct monetary
+// compensation for, or commercial advantage from such use.  Illustrative
+// examples of non-commercial use are academic research, personal study,
+// teaching, education and corporate research & development.
+// Illustrative examples of commercial use are distributing products for
+// commercial advantage and providing services using the software for
+// commercial advantage.
+//
+// If you wish to use this software or functionality therein that may be
+// covered by patents for commercial use, please contact:
+//     Director of Intellectual Property Licensing
+//     Office of Strategy and Technology
+//     Hewlett-Packard Company
+//     1501 Page Mill Road
+//     Palo Alto, California  94304
+//
+// Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.  Redistributions
+// in binary form must reproduce the above copyright notice, this list of
+// conditions and the following disclaimer in the documentation and/or
+// other materials provided with the distribution.  Neither the name of
+// the COPYRIGHT HOLDER(s), HEWLETT-PACKARD COMPANY, nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.  No right of
+// sublicense is granted herewith.  Derivatives of the software and
+// output created using the software may be prepared, but only for
+// Non-Commercial Uses.  Derivatives of the software may be shared with
+// others provided: (i) the others agree to abide by the list of
+// conditions herein which includes the Non-Commercial Use restrictions;
+// and (ii) such Derivatives of the software include the above copyright
+// notice to acknowledge the contribution from this software where
+// applicable, this list of conditions and the disclaimer below.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: Gabe Black
+
+////////////////////////////////////////////////////////////////////
+//
+//  Code to "specialize" a microcode sequence to use a particular
+//  variety of operands
+//
+
+let {{
+    # This code builds up a decode block which decodes based on switchval.
+    # vals is a dict which matches case values with what should be decoded to.
+    # builder is called on the exploded contents of "vals" values to generate
+    # whatever code should be used.
+    def doSplitDecode(name, Name, builder, switchVal, vals, default = None):
+        header_output = ''
+        decoder_output = ''
+        decode_block = 'switch(%s) {\n' % switchVal
+        exec_output = ''
+        for (val, todo) in vals.items():
+            (new_header_output,
+             new_decoder_output,
+             new_decode_block,
+             new_exec_output) = builder(name, Name, *todo)
+            header_output += new_header_output
+            decoder_output += new_decoder_output
+            decode_block += '\tcase %s: %s\n' % (val, new_decode_block)
+            exec_output += new_exec_output
+        if default:
+            (new_header_output,
+             new_decoder_output,
+             new_decode_block,
+             new_exec_output) = builder(name, Name, *default)
+            header_output += new_header_output
+            decoder_output += new_decoder_output
+            decode_block += '\tdefault: %s\n' % new_decode_block
+            exec_output += new_exec_output
+        decode_block += '}\n'
+        return (header_output, decoder_output, decode_block, exec_output)
+}};
+
+let {{
+    class OpType(object):
+        parser = re.compile(r"(?P<tag>[A-Z][A-Z]*)(?P<size>[a-z][a-z]*)|(r(?P<reg>[A-Za-z0-9][A-Za-z0-9]*))")
+        def __init__(self, opTypeString):
+            match = OpType.parser.search(opTypeString)
+            if match == None:
+                raise Exception, "Problem parsing operand type %s" % opTypeString
+            self.reg = match.group("reg")
+            self.tag = match.group("tag")
+            self.size = match.group("size")
+
+    # This function specializes the given piece of code to use a particular
+    # set of argument types described by "opTypes". These are "implemented"
+    # in reverse order.
+    def specializeInst(name, Name, code, opTypes):
+        opNum = len(opTypes) - 1
+        while len(opTypes):
+            # print "Building a composite op with tags", opTypes
+            # print "And code", code
+            opNum = len(opTypes) - 1
+            # A regular expression to find the operand placeholders we're
+            # interested in.
+            opRe = re.compile("\\^(?P<operandNum>%d)(?=[^0-9]|$)" % opNum)
+
+            # Parse the operand type string we're working with
+            opType = OpType(opTypes[opNum])
+
+            if opType.reg:
+                #Figure out what to do with fixed register operands
+                if opType.reg in ("Ax", "Bx", "Cx", "Dx"):
+                    code = opRe.sub("%%{INTREG_R%s}" % opType.reg.upper(), code)
+                elif opType.reg == "Al":
+                    # We need a way to specify register width
+                    code = opRe.sub("%{INTREG_RAX}", code)
+                else:
+                    print "Didn't know how to encode fixed register %s!" % opType.reg
+            elif opType.tag == None or opType.size == None:
+                raise Exception, "Problem parsing operand tag: %s" % opType.tag
+            elif opType.tag in ("C", "D", "G", "P", "S", "T", "V"):
+                # Use the "reg" field of the ModRM byte to select the register
+                code = opRe.sub("%{(uint8_t)MODRM_REG}", code)
+            elif opType.tag in ("E", "Q", "W"):
+                # This might refer to memory or to a register. We need to
+                # divide it up farther.
+                regCode = opRe.sub("%{(uint8_t)MODRM_RM}", code)
+                regTypes = copy.copy(opTypes)
+                regTypes.pop(-1)
+                # This needs to refer to memory, but we'll fill in the details
+                # later. It needs to take into account unaligned memory
+                # addresses.
+                memCode = opRe.sub("%0", code)
+                memTypes = copy.copy(opTypes)
+                memTypes.pop(-1)
+                return doSplitDecode(name, Name, specializeInst, "MODRM_MOD",
+                    {"3" : (regCode, regTypes)}, (memCode, memTypes))
+            elif opType.tag in ("I", "J"):
+                # Immediates are already in the instruction, so don't leave in
+                # those parameters
+                code = opRe.sub("${IMMEDIATE}", code)
+            elif opType.tag == "M":
+                # This needs to refer to memory, but we'll fill in the details
+                # later. It needs to take into account unaligned memory
+                # addresses.
+                code = opRe.sub("%0", code)
+            elif opType.tag in ("PR", "R", "VR"):
+                # There should probably be a check here to verify that mod
+                # is equal to 11b
+                code = opRe.sub("%{(uint8_t)MODRM_RM}", code)
+            else:
+                raise Exception, "Unrecognized tag %s." % opType.tag
+            opTypes.pop(-1)
+
+        # At this point, we've built up "code" to have all the necessary extra
+        # instructions needed to implement whatever types of operands were
+        # specified. Now we'll assemble it into a StaticInst.
+        return assembleMicro(name, Name, code)
+}};
diff --git a/src/arch/x86/predecoder.cc b/src/arch/x86/predecoder.cc
index 80971e7cf..573012ee6 100644
--- a/src/arch/x86/predecoder.cc
+++ b/src/arch/x86/predecoder.cc
@@ -117,37 +117,33 @@ namespace X86ISA
             //Operand size override prefixes
           case OperandSizeOverride:
             DPRINTF(Predecoder, "Found operand size override prefix.\n");
+            emi.legacy.op = true;
             break;
           case AddressSizeOverride:
             DPRINTF(Predecoder, "Found address size override prefix.\n");
+            emi.legacy.addr = true;
             break;
             //Segment override prefixes
           case CSOverride:
-            DPRINTF(Predecoder, "Found cs segment override.\n");
-            break;
           case DSOverride:
-            DPRINTF(Predecoder, "Found ds segment override.\n");
-            break;
           case ESOverride:
-            DPRINTF(Predecoder, "Found es segment override.\n");
-            break;
           case FSOverride:
-            DPRINTF(Predecoder, "Found fs segment override.\n");
-            break;
           case GSOverride:
-            DPRINTF(Predecoder, "Found gs segment override.\n");
-            break;
           case SSOverride:
-            DPRINTF(Predecoder, "Found ss segment override.\n");
+            DPRINTF(Predecoder, "Found segment override.\n");
+            emi.legacy.seg = prefix;
             break;
           case Lock:
             DPRINTF(Predecoder, "Found lock prefix.\n");
+            emi.legacy.lock = true;
             break;
           case Rep:
             DPRINTF(Predecoder, "Found rep prefix.\n");
+            emi.legacy.rep = true;
             break;
           case Repne:
             DPRINTF(Predecoder, "Found repne prefix.\n");
+            emi.legacy.repne = true;
             break;
           case RexPrefix:
             DPRINTF(Predecoder, "Found Rex prefix %#x.\n", nextByte);
@@ -198,16 +194,36 @@ namespace X86ISA
             displacementCollected = 0;
             emi.displacement = 0;
 
+            //Figure out the effective operand size. This can be overridden to
+            //a fixed value at the decoder level.
+            if(/*FIXME long mode*/1)
+            {
+                if(emi.rex && emi.rex.w)
+                    emi.opSize = 3; // 64 bit operand size
+                else if(emi.legacy.op)
+                    emi.opSize = 1; // 16 bit operand size
+                else
+                    emi.opSize = 2; // 32 bit operand size
+            }
+            else if(/*FIXME default 32*/1)
+            {
+                if(emi.legacy.op)
+                    emi.opSize = 1; // 16 bit operand size
+                else
+                    emi.opSize = 2; // 32 bit operand size
+            }
+            else // 16 bit default operand size
+            {
+                if(emi.legacy.op)
+                    emi.opSize = 2; // 32 bit operand size
+                else
+                    emi.opSize = 1; // 16 bit operand size
+            }
+
             //Figure out how big of an immediate we'll retreive based
             //on the opcode.
-            int immType = ImmediateType[
-                emi.opcode.num - 1][nextByte];
-            if(0) //16 bit mode
-                immediateSize = ImmediateTypeToSize[0][immType];
-            else if(!(emi.rex & 0x4)) //32 bit mode
-                immediateSize = ImmediateTypeToSize[1][immType];
-            else //64 bit mode
-                immediateSize = ImmediateTypeToSize[2][immType];
+            int immType = ImmediateType[emi.opcode.num - 1][nextByte];
+            immediateSize = SizeTypeToSize[emi.opSize - 1][immType];
 
             //Determine what to expect next
             if (UsesModRM[emi.opcode.num - 1][nextByte]) {
@@ -351,6 +367,16 @@ namespace X86ISA
 
         if(immediateSize == immediateCollected)
         {
+            //XXX Warning! The following is an observed pattern and might
+            //not always be true!
+
+            //Instructions which use 64 bit operands but 32 bit immediates
+            //need to have the immediate sign extended to 64 bits.
+            //Instructions which use true 64 bit immediates won't be
+            //affected, and instructions that use true 32 bit immediates
+            //won't notice.
+            if(immediateSize == 4)
+                emi.immediate = sext<32>(emi.immediate);
             DPRINTF(Predecoder, "Collected immediate %#x.\n",
                     emi.immediate);
             emiIsReady = true;
diff --git a/src/arch/x86/predecoder.hh b/src/arch/x86/predecoder.hh
index 1df17d6d2..6562ab9f5 100644
--- a/src/arch/x86/predecoder.hh
+++ b/src/arch/x86/predecoder.hh
@@ -73,7 +73,7 @@ namespace X86ISA
         static const uint8_t Prefixes[256];
         static const uint8_t UsesModRM[2][256];
         static const uint8_t ImmediateType[2][256];
-        static const uint8_t ImmediateTypeToSize[3][10];
+        static const uint8_t SizeTypeToSize[3][10];
 
       protected:
         ThreadContext * tc;
diff --git a/src/arch/x86/predecoder_tables.cc b/src/arch/x86/predecoder_tables.cc
index f233ad234..38b9c57a3 100644
--- a/src/arch/x86/predecoder_tables.cc
+++ b/src/arch/x86/predecoder_tables.cc
@@ -141,7 +141,7 @@ namespace X86ISA
         }
     };
 
-    enum ImmediateTypes {
+    enum SizeType {
         NoImm,
         NI = NoImm,
         ByteImm,
@@ -158,19 +158,19 @@ namespace X86ISA
         VW = VWordImm,
         ZWordImm,
         ZW = ZWordImm,
-        Pointer,
-        PO = Pointer,
         //The enter instruction takes -2- immediates for a total of 3 bytes
         Enter,
-        EN = Enter
+        EN = Enter,
+        Pointer,
+        PO = Pointer
     };
 
-    const uint8_t Predecoder::ImmediateTypeToSize[3][10] =
+    const uint8_t Predecoder::SizeTypeToSize[3][10] =
     {
-//       noimm byte word dword qword oword vword zword enter
-        {0,    1,   2,   4,    8,    16,   2,    2,    3,    4}, //16 bit
-        {0,    1,   2,   4,    8,    16,   4,    4,    3,    6}, //32 bit
-        {0,    1,   2,   4,    8,    16,   4,    8,    3,    0}  //64 bit
+//       noimm byte word dword qword oword vword zword enter pointer
+        {0,    1,   2,   4,    8,    16,   2,    2,    3,    4      }, //16 bit
+        {0,    1,   2,   4,    8,    16,   4,    4,    3,    6      }, //32 bit
+        {0,    1,   2,   4,    8,    16,   4,    8,    3,    0      }  //64 bit
     };
 
     //This table determines the immediate type. The first index is the
diff --git a/src/arch/x86/types.hh b/src/arch/x86/types.hh
index cdac3c00e..022f20ee5 100644
--- a/src/arch/x86/types.hh
+++ b/src/arch/x86/types.hh
@@ -70,25 +70,31 @@ namespace X86ISA
     typedef uint64_t MachInst;
 
     enum Prefixes {
-        NoOverride = 0,
-        CSOverride = 1,
-        DSOverride = 2,
-        ESOverride = 3,
-        FSOverride = 4,
-        GSOverride = 5,
-        SSOverride = 6,
-        //The Rex prefix obviously doesn't fit in with the above, but putting
-        //it here lets us save double the space the enums take up.
-        RexPrefix = 7,
+        NoOverride,
+        CSOverride,
+        DSOverride,
+        ESOverride,
+        FSOverride,
+        GSOverride,
+        SSOverride,
+        RexPrefix,
+        OperandSizeOverride,
+        AddressSizeOverride,
+        Lock,
+        Rep,
+        Repne
+    };
+
+    BitUnion8(LegacyPrefixVector)
+        Bitfield<7> repne;
+        Bitfield<6> rep;
+        Bitfield<5> lock;
+        Bitfield<4> addr;
+        Bitfield<3> op;
         //There can be only one segment override, so they share the
         //first 3 bits in the legacyPrefixes bitfield.
-        SegmentOverride = 0x7,
-        OperandSizeOverride = 8,
-        AddressSizeOverride = 16,
-        Lock = 32,
-        Rep = 64,
-        Repne = 128
-    };
+        Bitfield<2,0> seg;
+    EndBitUnion(LegacyPrefixVector)
 
     BitUnion8(ModRM)
         Bitfield<7,6> mod;
@@ -118,7 +124,7 @@ namespace X86ISA
     struct ExtMachInst
     {
         //Prefixes
-        uint8_t legacy;
+        LegacyPrefixVector legacy;
         Rex rex;
         //This holds all of the bytes of the opcode
         struct
@@ -140,6 +146,10 @@ namespace X86ISA
         //Immediate fields
         uint64_t immediate;
         uint64_t displacement;
+
+        //The effective operand size.
+        uint8_t opSize;
+        //The
     };
 
     inline static std::ostream &
diff --git a/src/arch/x86/utility.hh b/src/arch/x86/utility.hh
index e0bd09515..1c98e7fbc 100644
--- a/src/arch/x86/utility.hh
+++ b/src/arch/x86/utility.hh
@@ -78,7 +78,8 @@ namespace __hash_namespace {
                     ((uint64_t)emi.opcode.prefixA << 16) |
                     ((uint64_t)emi.opcode.prefixB << 8) |
                     ((uint64_t)emi.opcode.op)) ^
-                    emi.immediate ^ emi.displacement;
+                    emi.immediate ^ emi.displacement ^
+                    emi.opSize;
         };
     };
 }
diff --git a/src/cpu/base_dyn_inst.hh b/src/cpu/base_dyn_inst.hh
index 6c6d90076..eed05c2f1 100644
--- a/src/cpu/base_dyn_inst.hh
+++ b/src/cpu/base_dyn_inst.hh
@@ -877,6 +877,11 @@ BaseDynInst<Impl>::write(T data, Addr addr, unsigned flags, uint64_t *res)
         effAddrValid = true;
         physEffAddr = req->getPaddr();
         memReqFlags = req->getFlags();
+
+        if (req->isCondSwap()) {
+            assert(res);
+            req->setExtraData(*res);
+        }
 #if 0
         if (cpu->system->memctrl->badaddr(physEffAddr)) {
             fault = TheISA::genMachineCheckFault();
diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh
index e1b27048d..f24de20d9 100644
--- a/src/cpu/o3/lsq_unit.hh
+++ b/src/cpu/o3/lsq_unit.hh
@@ -289,15 +289,19 @@ class LSQUnit {
     struct SQEntry {
         /** Constructs an empty store queue entry. */
         SQEntry()
-            : inst(NULL), req(NULL), size(0), data(0),
+            : inst(NULL), req(NULL), size(0),
               canWB(0), committed(0), completed(0)
-        { }
+        {
+            bzero(data, sizeof(data));
+        }
 
         /** Constructs a store queue entry for a given instruction. */
         SQEntry(DynInstPtr &_inst)
-            : inst(_inst), req(NULL), size(0), data(0),
+            : inst(_inst), req(NULL), size(0),
               canWB(0), committed(0), completed(0)
-        { }
+        {
+            bzero(data, sizeof(data));
+        }
 
         /** The store instruction. */
         DynInstPtr inst;
@@ -306,7 +310,7 @@ class LSQUnit {
         /** The size of the store. */
         int size;
         /** The store data. */
-        IntReg data;
+        char data[sizeof(IntReg)];
         /** Whether or not the store can writeback. */
         bool canWB;
         /** Whether or not the store is committed. */
@@ -554,22 +558,14 @@ LSQUnit<Impl>::read(Request *req, T &data, int load_idx)
         if ((store_has_lower_limit && store_has_upper_limit)) {
             // Get shift amount for offset into the store's data.
             int shift_amt = req->getVaddr() & (store_size - 1);
-            // @todo: Magic number, assumes byte addressing
-            shift_amt = shift_amt << 3;
-
-            // Cast this to type T?
-            data = storeQueue[store_idx].data >> shift_amt;
 
-            // When the data comes from the store queue entry, it's in host
-            // order. When it gets sent to the load, it needs to be in guest
-            // order so when the load converts it again, it ends up back
-            // in host order like the inst expects.
-            data = TheISA::htog(data);
+            memcpy(&data, storeQueue[store_idx].data + shift_amt, sizeof(T));
 
             assert(!load_inst->memData);
             load_inst->memData = new uint8_t[64];
 
-            memcpy(load_inst->memData, &data, req->getSize());
+            memcpy(load_inst->memData,
+                    storeQueue[store_idx].data + shift_amt, req->getSize());
 
             DPRINTF(LSQUnit, "Forwarding from store idx %i to load to "
                     "addr %#x, data %#x\n",
@@ -716,7 +712,10 @@ LSQUnit<Impl>::write(Request *req, T &data, int store_idx)
 
     storeQueue[store_idx].req = req;
     storeQueue[store_idx].size = sizeof(T);
-    storeQueue[store_idx].data = data;
+    assert(sizeof(T) <= sizeof(storeQueue[store_idx].data));
+
+    T gData = htog(data);
+    memcpy(storeQueue[store_idx].data, &gData, sizeof(T));
 
     // This function only writes the data to the store queue, so no fault
     // can happen here.
diff --git a/src/cpu/o3/lsq_unit_impl.hh b/src/cpu/o3/lsq_unit_impl.hh
index 2aa0d6b6a..44e2cea76 100644
--- a/src/cpu/o3/lsq_unit_impl.hh
+++ b/src/cpu/o3/lsq_unit_impl.hh
@@ -645,22 +645,10 @@ LSQUnit<Impl>::writebackStores()
         assert(!inst->memData);
         inst->memData = new uint8_t[64];
 
-        TheISA::IntReg convertedData =
-            TheISA::htog(storeQueue[storeWBIdx].data);
-
-        //FIXME This is a hack to get SPARC working. It, along with endianness
-        //in the memory system in general, need to be straightened out more
-        //formally. The problem is that the data's endianness is swapped when
-        //it's in the 64 bit data field in the store queue. The data that you
-        //want won't start at the beginning of the field anymore unless it was
-        //a 64 bit access.
-        memcpy(inst->memData,
-                (uint8_t *)&convertedData +
-                (TheISA::ByteOrderDiffers ?
-                 (sizeof(TheISA::IntReg) - req->getSize()) : 0),
-                req->getSize());
-
-        PacketPtr data_pkt = new Packet(req, MemCmd::WriteReq,
+        memcpy(inst->memData, storeQueue[storeWBIdx].data, req->getSize());
+
+        MemCmd command = req->isSwap() ? MemCmd::SwapReq : MemCmd::WriteReq;
+        PacketPtr data_pkt = new Packet(req, command,
                                         Packet::Broadcast);
         data_pkt->dataStatic(inst->memData);
 
@@ -677,7 +665,7 @@ LSQUnit<Impl>::writebackStores()
                 inst->seqNum);
 
         // @todo: Remove this SC hack once the memory system handles it.
-        if (req->isLocked()) {
+        if (inst->isStoreConditional()) {
             // Disable recording the result temporarily.  Writing to
             // misc regs normally updates the result, but this is not
             // the desired behavior when handling store conditionals.
diff --git a/src/cpu/o3/regfile.hh b/src/cpu/o3/regfile.hh
index bbc69fc96..b5b1cd021 100644
--- a/src/cpu/o3/regfile.hh
+++ b/src/cpu/o3/regfile.hh
@@ -174,7 +174,7 @@ class PhysRegFile
         // Remove the base Float reg dependency.
         reg_idx = reg_idx - numPhysicalIntRegs;
 
-        assert(reg_idx < numPhysicalFloatRegs + numPhysicalIntRegs);
+        assert(reg_idx < numPhysicalFloatRegs);
 
         DPRINTF(IEW, "RegFile: Setting float register %i to %#x\n",
                 int(reg_idx), (uint64_t)val);
@@ -189,7 +189,7 @@ class PhysRegFile
         // Remove the base Float reg dependency.
         reg_idx = reg_idx - numPhysicalIntRegs;
 
-        assert(reg_idx < numPhysicalFloatRegs + numPhysicalIntRegs);
+        assert(reg_idx < numPhysicalFloatRegs);
 
         DPRINTF(IEW, "RegFile: Setting float register %i to %#x\n",
                 int(reg_idx), (uint64_t)val);
@@ -204,7 +204,7 @@ class PhysRegFile
         // Remove the base Float reg dependency.
         reg_idx = reg_idx - numPhysicalIntRegs;
 
-        assert(reg_idx < numPhysicalFloatRegs + numPhysicalIntRegs);
+        assert(reg_idx < numPhysicalFloatRegs);
 
         DPRINTF(IEW, "RegFile: Setting float register %i to %#x\n",
                 int(reg_idx), (uint64_t)val);
@@ -217,7 +217,7 @@ class PhysRegFile
         // Remove the base Float reg dependency.
         reg_idx = reg_idx - numPhysicalIntRegs;
 
-        assert(reg_idx < numPhysicalFloatRegs + numPhysicalIntRegs);
+        assert(reg_idx < numPhysicalFloatRegs);
 
         DPRINTF(IEW, "RegFile: Setting float register %i to %#x\n",
                 int(reg_idx), (uint64_t)val);
@@ -232,11 +232,11 @@ class PhysRegFile
 
     MiscReg readMiscReg(int misc_reg, unsigned thread_id)
     {
-        return miscRegs[thread_id].readReg(misc_reg,
-                                                     cpu->tcBase(thread_id));
+        return miscRegs[thread_id].readReg(misc_reg, cpu->tcBase(thread_id));
     }
 
-    void setMiscRegNoEffect(int misc_reg, const MiscReg &val, unsigned thread_id)
+    void setMiscRegNoEffect(int misc_reg,
+            const MiscReg &val, unsigned thread_id)
     {
         miscRegs[thread_id].setRegNoEffect(misc_reg, val);
     }
diff --git a/src/cpu/o3/rename_impl.hh b/src/cpu/o3/rename_impl.hh
index ec630b31e..431705e19 100644
--- a/src/cpu/o3/rename_impl.hh
+++ b/src/cpu/o3/rename_impl.hh
@@ -996,7 +996,12 @@ DefaultRename<Impl>::renameSrcRegs(DynInstPtr &inst,unsigned tid)
         if (src_reg < TheISA::FP_Base_DepTag) {
             flat_src_reg = TheISA::flattenIntIndex(inst->tcBase(), src_reg);
             DPRINTF(Rename, "Flattening index %d to %d.\n", (int)src_reg, (int)flat_src_reg);
+        } else {
+            // Floating point and Miscellaneous registers need their indexes
+            // adjusted to account for the expanded number of flattened int regs.
+            flat_src_reg = src_reg - TheISA::FP_Base_DepTag + TheISA::NumIntRegs;
         }
+
         inst->flattenSrcReg(src_idx, flat_src_reg);
 
         // Look up the source registers to get the phys. register they've
@@ -1033,8 +1038,13 @@ DefaultRename<Impl>::renameDestRegs(DynInstPtr &inst,unsigned tid)
         RegIndex dest_reg = inst->destRegIdx(dest_idx);
         RegIndex flat_dest_reg = dest_reg;
         if (dest_reg < TheISA::FP_Base_DepTag) {
+            // Integer registers are flattened.
             flat_dest_reg = TheISA::flattenIntIndex(inst->tcBase(), dest_reg);
             DPRINTF(Rename, "Flattening index %d to %d.\n", (int)dest_reg, (int)flat_dest_reg);
+        } else {
+            // Floating point and Miscellaneous registers need their indexes
+            // adjusted to account for the expanded number of flattened int regs.
+            flat_dest_reg = dest_reg - TheISA::FP_Base_DepTag + TheISA::NumIntRegs;
         }
 
         inst->flattenDestReg(dest_idx, flat_dest_reg);
diff --git a/tests/quick/02.insttest/ref/sparc/linux/o3-timing/config.ini b/tests/quick/02.insttest/ref/sparc/linux/o3-timing/config.ini
new file mode 100644
index 000000000..f804a40fe
--- /dev/null
+++ b/tests/quick/02.insttest/ref/sparc/linux/o3-timing/config.ini
@@ -0,0 +1,379 @@
+[root]
+type=Root
+children=system
+dummy=0
+
+[system]
+type=System
+children=cpu membus physmem
+mem_mode=atomic
+physmem=system.physmem
+
+[system.cpu]
+type=DerivO3CPU
+children=dcache fuPool icache l2cache toL2Bus workload
+BTBEntries=4096
+BTBTagSize=16
+LFSTSize=1024
+LQEntries=32
+RASSize=16
+SQEntries=32
+SSITSize=1024
+activity=0
+backComSize=5
+choiceCtrBits=2
+choicePredictorSize=8192
+clock=1
+commitToDecodeDelay=1
+commitToFetchDelay=1
+commitToIEWDelay=1
+commitToRenameDelay=1
+commitWidth=8
+cpu_id=0
+decodeToFetchDelay=1
+decodeToRenameDelay=1
+decodeWidth=8
+defer_registration=false
+dispatchWidth=8
+fetchToDecodeDelay=1
+fetchTrapLatency=1
+fetchWidth=8
+forwardComSize=5
+fuPool=system.cpu.fuPool
+function_trace=false
+function_trace_start=0
+globalCtrBits=2
+globalHistoryBits=13
+globalPredictorSize=8192
+iewToCommitDelay=1
+iewToDecodeDelay=1
+iewToFetchDelay=1
+iewToRenameDelay=1
+instShiftAmt=2
+issueToExecuteDelay=1
+issueWidth=8
+localCtrBits=2
+localHistoryBits=11
+localHistoryTableSize=2048
+localPredictorSize=2048
+max_insts_all_threads=0
+max_insts_any_thread=0
+max_loads_all_threads=0
+max_loads_any_thread=0
+numIQEntries=64
+numPhysFloatRegs=256
+numPhysIntRegs=256
+numROBEntries=192
+numRobs=1
+numThreads=1
+phase=0
+predType=tournament
+progress_interval=0
+renameToDecodeDelay=1
+renameToFetchDelay=1
+renameToIEWDelay=2
+renameToROBDelay=1
+renameWidth=8
+squashWidth=8
+system=system
+trapLatency=13
+wbDepth=1
+wbWidth=8
+workload=system.cpu.workload
+dcache_port=system.cpu.dcache.cpu_side
+icache_port=system.cpu.icache.cpu_side
+
+[system.cpu.dcache]
+type=BaseCache
+adaptive_compression=false
+assoc=2
+block_size=64
+compressed_bus=false
+compression_latency=0
+hash_delay=1
+hit_latency=1
+latency=1
+lifo=false
+max_miss_count=0
+mshrs=10
+prefetch_access=false
+prefetch_cache_check_push=true
+prefetch_data_accesses_only=false
+prefetch_degree=1
+prefetch_latency=10
+prefetch_miss=false
+prefetch_past_page=false
+prefetch_policy=none
+prefetch_serial_squash=false
+prefetch_use_cpu_id=true
+prefetcher_size=100
+prioritizeRequests=false
+protocol=Null
+repl=Null
+size=262144
+split=false
+split_size=0
+store_compressed=false
+subblock_size=0
+tgts_per_mshr=20
+trace_addr=0
+two_queue=false
+write_buffers=8
+cpu_side=system.cpu.dcache_port
+mem_side=system.cpu.toL2Bus.port[1]
+
+[system.cpu.fuPool]
+type=FUPool
+children=FUList0 FUList1 FUList2 FUList3 FUList4 FUList5 FUList6 FUList7
+FUList=system.cpu.fuPool.FUList0 system.cpu.fuPool.FUList1 system.cpu.fuPool.FUList2 system.cpu.fuPool.FUList3 system.cpu.fuPool.FUList4 system.cpu.fuPool.FUList5 system.cpu.fuPool.FUList6 system.cpu.fuPool.FUList7
+
+[system.cpu.fuPool.FUList0]
+type=FUDesc
+children=opList0
+count=6
+opList=system.cpu.fuPool.FUList0.opList0
+
+[system.cpu.fuPool.FUList0.opList0]
+type=OpDesc
+issueLat=1
+opClass=IntAlu
+opLat=1
+
+[system.cpu.fuPool.FUList1]
+type=FUDesc
+children=opList0 opList1
+count=2
+opList=system.cpu.fuPool.FUList1.opList0 system.cpu.fuPool.FUList1.opList1
+
+[system.cpu.fuPool.FUList1.opList0]
+type=OpDesc
+issueLat=1
+opClass=IntMult
+opLat=3
+
+[system.cpu.fuPool.FUList1.opList1]
+type=OpDesc
+issueLat=19
+opClass=IntDiv
+opLat=20
+
+[system.cpu.fuPool.FUList2]
+type=FUDesc
+children=opList0 opList1 opList2
+count=4
+opList=system.cpu.fuPool.FUList2.opList0 system.cpu.fuPool.FUList2.opList1 system.cpu.fuPool.FUList2.opList2
+
+[system.cpu.fuPool.FUList2.opList0]
+type=OpDesc
+issueLat=1
+opClass=FloatAdd
+opLat=2
+
+[system.cpu.fuPool.FUList2.opList1]
+type=OpDesc
+issueLat=1
+opClass=FloatCmp
+opLat=2
+
+[system.cpu.fuPool.FUList2.opList2]
+type=OpDesc
+issueLat=1
+opClass=FloatCvt
+opLat=2
+
+[system.cpu.fuPool.FUList3]
+type=FUDesc
+children=opList0 opList1 opList2
+count=2
+opList=system.cpu.fuPool.FUList3.opList0 system.cpu.fuPool.FUList3.opList1 system.cpu.fuPool.FUList3.opList2
+
+[system.cpu.fuPool.FUList3.opList0]
+type=OpDesc
+issueLat=1
+opClass=FloatMult
+opLat=4
+
+[system.cpu.fuPool.FUList3.opList1]
+type=OpDesc
+issueLat=12
+opClass=FloatDiv
+opLat=12
+
+[system.cpu.fuPool.FUList3.opList2]
+type=OpDesc
+issueLat=24
+opClass=FloatSqrt
+opLat=24
+
+[system.cpu.fuPool.FUList4]
+type=FUDesc
+children=opList0
+count=0
+opList=system.cpu.fuPool.FUList4.opList0
+
+[system.cpu.fuPool.FUList4.opList0]
+type=OpDesc
+issueLat=1
+opClass=MemRead
+opLat=1
+
+[system.cpu.fuPool.FUList5]
+type=FUDesc
+children=opList0
+count=0
+opList=system.cpu.fuPool.FUList5.opList0
+
+[system.cpu.fuPool.FUList5.opList0]
+type=OpDesc
+issueLat=1
+opClass=MemWrite
+opLat=1
+
+[system.cpu.fuPool.FUList6]
+type=FUDesc
+children=opList0 opList1
+count=4
+opList=system.cpu.fuPool.FUList6.opList0 system.cpu.fuPool.FUList6.opList1
+
+[system.cpu.fuPool.FUList6.opList0]
+type=OpDesc
+issueLat=1
+opClass=MemRead
+opLat=1
+
+[system.cpu.fuPool.FUList6.opList1]
+type=OpDesc
+issueLat=1
+opClass=MemWrite
+opLat=1
+
+[system.cpu.fuPool.FUList7]
+type=FUDesc
+children=opList0
+count=1
+opList=system.cpu.fuPool.FUList7.opList0
+
+[system.cpu.fuPool.FUList7.opList0]
+type=OpDesc
+issueLat=3
+opClass=IprAccess
+opLat=3
+
+[system.cpu.icache]
+type=BaseCache
+adaptive_compression=false
+assoc=2
+block_size=64
+compressed_bus=false
+compression_latency=0
+hash_delay=1
+hit_latency=1
+latency=1
+lifo=false
+max_miss_count=0
+mshrs=10
+prefetch_access=false
+prefetch_cache_check_push=true
+prefetch_data_accesses_only=false
+prefetch_degree=1
+prefetch_latency=10
+prefetch_miss=false
+prefetch_past_page=false
+prefetch_policy=none
+prefetch_serial_squash=false
+prefetch_use_cpu_id=true
+prefetcher_size=100
+prioritizeRequests=false
+protocol=Null
+repl=Null
+size=131072
+split=false
+split_size=0
+store_compressed=false
+subblock_size=0
+tgts_per_mshr=20
+trace_addr=0
+two_queue=false
+write_buffers=8
+cpu_side=system.cpu.icache_port
+mem_side=system.cpu.toL2Bus.port[0]
+
+[system.cpu.l2cache]
+type=BaseCache
+adaptive_compression=false
+assoc=2
+block_size=64
+compressed_bus=false
+compression_latency=0
+hash_delay=1
+hit_latency=1
+latency=1
+lifo=false
+max_miss_count=0
+mshrs=10
+prefetch_access=false
+prefetch_cache_check_push=true
+prefetch_data_accesses_only=false
+prefetch_degree=1
+prefetch_latency=10
+prefetch_miss=false
+prefetch_past_page=false
+prefetch_policy=none
+prefetch_serial_squash=false
+prefetch_use_cpu_id=true
+prefetcher_size=100
+prioritizeRequests=false
+protocol=Null
+repl=Null
+size=2097152
+split=false
+split_size=0
+store_compressed=false
+subblock_size=0
+tgts_per_mshr=5
+trace_addr=0
+two_queue=false
+write_buffers=8
+cpu_side=system.cpu.toL2Bus.port[2]
+mem_side=system.membus.port[1]
+
+[system.cpu.toL2Bus]
+type=Bus
+bus_id=0
+clock=1000
+responder_set=false
+width=64
+port=system.cpu.icache.mem_side system.cpu.dcache.mem_side system.cpu.l2cache.cpu_side
+
+[system.cpu.workload]
+type=LiveProcess
+cmd=insttest
+cwd=
+egid=100
+env=
+euid=100
+executable=tests/test-progs/insttest/bin/sparc/linux/insttest
+gid=100
+input=cin
+output=cout
+pid=100
+ppid=99
+system=system
+uid=100
+
+[system.membus]
+type=Bus
+bus_id=0
+clock=1000
+responder_set=false
+width=64
+port=system.physmem.port system.cpu.l2cache.mem_side
+
+[system.physmem]
+type=PhysicalMemory
+file=
+latency=1
+range=0:134217727
+zero=false
+port=system.membus.port[0]
+
diff --git a/tests/quick/02.insttest/ref/sparc/linux/o3-timing/config.out b/tests/quick/02.insttest/ref/sparc/linux/o3-timing/config.out
new file mode 100644
index 000000000..d248f77bf
--- /dev/null
+++ b/tests/quick/02.insttest/ref/sparc/linux/o3-timing/config.out
@@ -0,0 +1,367 @@
+[root]
+type=Root
+dummy=0
+
+[system.physmem]
+type=PhysicalMemory
+file=
+range=[0,134217727]
+latency=1
+zero=false
+
+[system]
+type=System
+physmem=system.physmem
+mem_mode=atomic
+
+[system.membus]
+type=Bus
+bus_id=0
+clock=1000
+width=64
+responder_set=false
+
+[system.cpu.workload]
+type=LiveProcess
+cmd=insttest
+executable=tests/test-progs/insttest/bin/sparc/linux/insttest
+input=cin
+output=cout
+env=
+cwd=
+system=system
+uid=100
+euid=100
+gid=100
+egid=100
+pid=100
+ppid=99
+
+[system.cpu.fuPool.FUList0.opList0]
+type=OpDesc
+opClass=IntAlu
+opLat=1
+issueLat=1
+
+[system.cpu.fuPool.FUList0]
+type=FUDesc
+opList=system.cpu.fuPool.FUList0.opList0
+count=6
+
+[system.cpu.fuPool.FUList1.opList0]
+type=OpDesc
+opClass=IntMult
+opLat=3
+issueLat=1
+
+[system.cpu.fuPool.FUList1.opList1]
+type=OpDesc
+opClass=IntDiv
+opLat=20
+issueLat=19
+
+[system.cpu.fuPool.FUList1]
+type=FUDesc
+opList=system.cpu.fuPool.FUList1.opList0 system.cpu.fuPool.FUList1.opList1
+count=2
+
+[system.cpu.fuPool.FUList2.opList0]
+type=OpDesc
+opClass=FloatAdd
+opLat=2
+issueLat=1
+
+[system.cpu.fuPool.FUList2.opList1]
+type=OpDesc
+opClass=FloatCmp
+opLat=2
+issueLat=1
+
+[system.cpu.fuPool.FUList2.opList2]
+type=OpDesc
+opClass=FloatCvt
+opLat=2
+issueLat=1
+
+[system.cpu.fuPool.FUList2]
+type=FUDesc
+opList=system.cpu.fuPool.FUList2.opList0 system.cpu.fuPool.FUList2.opList1 system.cpu.fuPool.FUList2.opList2
+count=4
+
+[system.cpu.fuPool.FUList3.opList0]
+type=OpDesc
+opClass=FloatMult
+opLat=4
+issueLat=1
+
+[system.cpu.fuPool.FUList3.opList1]
+type=OpDesc
+opClass=FloatDiv
+opLat=12
+issueLat=12
+
+[system.cpu.fuPool.FUList3.opList2]
+type=OpDesc
+opClass=FloatSqrt
+opLat=24
+issueLat=24
+
+[system.cpu.fuPool.FUList3]
+type=FUDesc
+opList=system.cpu.fuPool.FUList3.opList0 system.cpu.fuPool.FUList3.opList1 system.cpu.fuPool.FUList3.opList2
+count=2
+
+[system.cpu.fuPool.FUList4.opList0]
+type=OpDesc
+opClass=MemRead
+opLat=1
+issueLat=1
+
+[system.cpu.fuPool.FUList4]
+type=FUDesc
+opList=system.cpu.fuPool.FUList4.opList0
+count=0
+
+[system.cpu.fuPool.FUList5.opList0]
+type=OpDesc
+opClass=MemWrite
+opLat=1
+issueLat=1
+
+[system.cpu.fuPool.FUList5]
+type=FUDesc
+opList=system.cpu.fuPool.FUList5.opList0
+count=0
+
+[system.cpu.fuPool.FUList6.opList0]
+type=OpDesc
+opClass=MemRead
+opLat=1
+issueLat=1
+
+[system.cpu.fuPool.FUList6.opList1]
+type=OpDesc
+opClass=MemWrite
+opLat=1
+issueLat=1
+
+[system.cpu.fuPool.FUList6]
+type=FUDesc
+opList=system.cpu.fuPool.FUList6.opList0 system.cpu.fuPool.FUList6.opList1
+count=4
+
+[system.cpu.fuPool.FUList7.opList0]
+type=OpDesc
+opClass=IprAccess
+opLat=3
+issueLat=3
+
+[system.cpu.fuPool.FUList7]
+type=FUDesc
+opList=system.cpu.fuPool.FUList7.opList0
+count=1
+
+[system.cpu.fuPool]
+type=FUPool
+FUList=system.cpu.fuPool.FUList0 system.cpu.fuPool.FUList1 system.cpu.fuPool.FUList2 system.cpu.fuPool.FUList3 system.cpu.fuPool.FUList4 system.cpu.fuPool.FUList5 system.cpu.fuPool.FUList6 system.cpu.fuPool.FUList7
+
+[system.cpu]
+type=DerivO3CPU
+clock=1
+phase=0
+numThreads=1
+cpu_id=0
+activity=0
+workload=system.cpu.workload
+checker=null
+max_insts_any_thread=0
+max_insts_all_threads=0
+max_loads_any_thread=0
+max_loads_all_threads=0
+progress_interval=0
+cachePorts=200
+decodeToFetchDelay=1
+renameToFetchDelay=1
+iewToFetchDelay=1
+commitToFetchDelay=1
+fetchWidth=8
+renameToDecodeDelay=1
+iewToDecodeDelay=1
+commitToDecodeDelay=1
+fetchToDecodeDelay=1
+decodeWidth=8
+iewToRenameDelay=1
+commitToRenameDelay=1
+decodeToRenameDelay=1
+renameWidth=8
+commitToIEWDelay=1
+renameToIEWDelay=2
+issueToExecuteDelay=1
+dispatchWidth=8
+issueWidth=8
+wbWidth=8
+wbDepth=1
+fuPool=system.cpu.fuPool
+iewToCommitDelay=1
+renameToROBDelay=1
+commitWidth=8
+squashWidth=8
+trapLatency=13
+backComSize=5
+forwardComSize=5
+predType=tournament
+localPredictorSize=2048
+localCtrBits=2
+localHistoryTableSize=2048
+localHistoryBits=11
+globalPredictorSize=8192
+globalCtrBits=2
+globalHistoryBits=13
+choicePredictorSize=8192
+choiceCtrBits=2
+BTBEntries=4096
+BTBTagSize=16
+RASSize=16
+LQEntries=32
+SQEntries=32
+LFSTSize=1024
+SSITSize=1024
+numPhysIntRegs=256
+numPhysFloatRegs=256
+numIQEntries=64
+numROBEntries=192
+smtNumFetchingThreads=1
+smtFetchPolicy=SingleThread
+smtLSQPolicy=Partitioned
+smtLSQThreshold=100
+smtIQPolicy=Partitioned
+smtIQThreshold=100
+smtROBPolicy=Partitioned
+smtROBThreshold=100
+smtCommitPolicy=RoundRobin
+instShiftAmt=2
+defer_registration=false
+function_trace=false
+function_trace_start=0
+
+[system.cpu.icache]
+type=BaseCache
+size=131072
+assoc=2
+block_size=64
+latency=1
+mshrs=10
+tgts_per_mshr=20
+write_buffers=8
+prioritizeRequests=false
+protocol=null
+trace_addr=0
+hash_delay=1
+repl=null
+compressed_bus=false
+store_compressed=false
+adaptive_compression=false
+compression_latency=0
+block_size=64
+max_miss_count=0
+addr_range=[0,18446744073709551615]
+split=false
+split_size=0
+lifo=false
+two_queue=false
+prefetch_miss=false
+prefetch_access=false
+prefetcher_size=100
+prefetch_past_page=false
+prefetch_serial_squash=false
+prefetch_latency=10
+prefetch_degree=1
+prefetch_policy=none
+prefetch_cache_check_push=true
+prefetch_use_cpu_id=true
+prefetch_data_accesses_only=false
+hit_latency=1
+
+[system.cpu.dcache]
+type=BaseCache
+size=262144
+assoc=2
+block_size=64
+latency=1
+mshrs=10
+tgts_per_mshr=20
+write_buffers=8
+prioritizeRequests=false
+protocol=null
+trace_addr=0
+hash_delay=1
+repl=null
+compressed_bus=false
+store_compressed=false
+adaptive_compression=false
+compression_latency=0
+block_size=64
+max_miss_count=0
+addr_range=[0,18446744073709551615]
+split=false
+split_size=0
+lifo=false
+two_queue=false
+prefetch_miss=false
+prefetch_access=false
+prefetcher_size=100
+prefetch_past_page=false
+prefetch_serial_squash=false
+prefetch_latency=10
+prefetch_degree=1
+prefetch_policy=none
+prefetch_cache_check_push=true
+prefetch_use_cpu_id=true
+prefetch_data_accesses_only=false
+hit_latency=1
+
+[system.cpu.l2cache]
+type=BaseCache
+size=2097152
+assoc=2
+block_size=64
+latency=1
+mshrs=10
+tgts_per_mshr=5
+write_buffers=8
+prioritizeRequests=false
+protocol=null
+trace_addr=0
+hash_delay=1
+repl=null
+compressed_bus=false
+store_compressed=false
+adaptive_compression=false
+compression_latency=0
+block_size=64
+max_miss_count=0
+addr_range=[0,18446744073709551615]
+split=false
+split_size=0
+lifo=false
+two_queue=false
+prefetch_miss=false
+prefetch_access=false
+prefetcher_size=100
+prefetch_past_page=false
+prefetch_serial_squash=false
+prefetch_latency=10
+prefetch_degree=1
+prefetch_policy=none
+prefetch_cache_check_push=true
+prefetch_use_cpu_id=true
+prefetch_data_accesses_only=false
+hit_latency=1
+
+[system.cpu.toL2Bus]
+type=Bus
+bus_id=0
+clock=1000
+width=64
+responder_set=false
+
diff --git a/tests/quick/02.insttest/ref/sparc/linux/o3-timing/m5stats.txt b/tests/quick/02.insttest/ref/sparc/linux/o3-timing/m5stats.txt
new file mode 100644
index 000000000..7c0d31494
--- /dev/null
+++ b/tests/quick/02.insttest/ref/sparc/linux/o3-timing/m5stats.txt
@@ -0,0 +1,410 @@
+
+---------- Begin Simulation Statistics ----------
+global.BPredUnit.BTBCorrect                         0                       # Number of correct BTB predictions (this stat may not work properly.
+global.BPredUnit.BTBHits                         2990                       # Number of BTB hits
+global.BPredUnit.BTBLookups                      7055                       # Number of BTB lookups
+global.BPredUnit.RASInCorrect                       0                       # Number of incorrect RAS predictions.
+global.BPredUnit.condIncorrect                   2077                       # Number of conditional branches incorrect
+global.BPredUnit.condPredicted                   7846                       # Number of conditional branches predicted
+global.BPredUnit.lookups                         7846                       # Number of BP lookups
+global.BPredUnit.usedRAS                            0                       # Number of times the RAS was used to get a target.
+host_inst_rate                                  15119                       # Simulator instruction rate (inst/s)
+host_mem_usage                                 154868                       # Number of bytes of host memory used
+host_seconds                                     0.73                       # Real time elapsed on the host
+host_tick_rate                                1956796                       # Simulator tick rate (ticks/s)
+memdepunit.memDep.conflictingLoads                 12                       # Number of conflicting loads.
+memdepunit.memDep.conflictingStores                 0                       # Number of conflicting stores.
+memdepunit.memDep.insertedLoads                  3250                       # Number of loads inserted to the mem dependence unit.
+memdepunit.memDep.insertedStores                 2817                       # Number of stores inserted to the mem dependence unit.
+sim_freq                                 1000000000000                       # Frequency of simulated ticks
+sim_insts                                       10976                       # Number of instructions simulated
+sim_seconds                                  0.000001                       # Number of seconds simulated
+sim_ticks                                     1421211                       # Number of ticks simulated
+system.cpu.commit.COM:branches                   2152                       # Number of branches committed
+system.cpu.commit.COM:bw_lim_events               172                       # number cycles where commit BW limit reached
+system.cpu.commit.COM:bw_limited                    0                       # number of insts not committed due to BW limits
+system.cpu.commit.COM:committed_per_cycle.start_dist                     # Number of insts commited each cycle
+system.cpu.commit.COM:committed_per_cycle.samples       221349                      
+system.cpu.commit.COM:committed_per_cycle.min_value            0                      
+                               0       215844   9751.30%           
+                               1         2970    134.18%           
+                               2         1290     58.28%           
+                               3          631     28.51%           
+                               4          208      9.40%           
+                               5           90      4.07%           
+                               6          133      6.01%           
+                               7           11      0.50%           
+                               8          172      7.77%           
+system.cpu.commit.COM:committed_per_cycle.max_value            8                      
+system.cpu.commit.COM:committed_per_cycle.end_dist
+
+system.cpu.commit.COM:count                     10976                       # Number of instructions committed
+system.cpu.commit.COM:loads                      1462                       # Number of loads committed
+system.cpu.commit.COM:membars                       0                       # Number of memory barriers committed
+system.cpu.commit.COM:refs                       2760                       # Number of memory references committed
+system.cpu.commit.COM:swp_count                     0                       # Number of s/w prefetches committed
+system.cpu.commit.branchMispredicts              2077                       # The number of times a branch was mispredicted
+system.cpu.commit.commitCommittedInsts          10976                       # The number of committed instructions
+system.cpu.commit.commitNonSpecStalls             327                       # The number of times commit has been forced to stall to communicate backwards
+system.cpu.commit.commitSquashedInsts           14263                       # The number of squashed insts skipped by commit
+system.cpu.committedInsts                       10976                       # Number of Instructions Simulated
+system.cpu.committedInsts_total                 10976                       # Number of Instructions Simulated
+system.cpu.cpi                             129.483509                       # CPI: Cycles Per Instruction
+system.cpu.cpi_total                       129.483509                       # CPI: Total CPI of All Threads
+system.cpu.dcache.ReadReq_accesses               2737                       # number of ReadReq accesses(hits+misses)
+system.cpu.dcache.ReadReq_avg_miss_latency  6585.044776                       # average ReadReq miss latency
+system.cpu.dcache.ReadReq_avg_mshr_miss_latency  6511.939394                       # average ReadReq mshr miss latency
+system.cpu.dcache.ReadReq_hits                   2603                       # number of ReadReq hits
+system.cpu.dcache.ReadReq_miss_latency         882396                       # number of ReadReq miss cycles
+system.cpu.dcache.ReadReq_miss_rate          0.048959                       # miss rate for ReadReq accesses
+system.cpu.dcache.ReadReq_misses                  134                       # number of ReadReq misses
+system.cpu.dcache.ReadReq_mshr_hits                68                       # number of ReadReq MSHR hits
+system.cpu.dcache.ReadReq_mshr_miss_latency       429788                       # number of ReadReq MSHR miss cycles
+system.cpu.dcache.ReadReq_mshr_miss_rate     0.024114                       # mshr miss rate for ReadReq accesses
+system.cpu.dcache.ReadReq_mshr_misses              66                       # number of ReadReq MSHR misses
+system.cpu.dcache.SwapReq_accesses                  6                       # number of SwapReq accesses(hits+misses)
+system.cpu.dcache.SwapReq_hits                      6                       # number of SwapReq hits
+system.cpu.dcache.WriteReq_accesses              1292                       # number of WriteReq accesses(hits+misses)
+system.cpu.dcache.WriteReq_avg_miss_latency  7960.583924                       # average WriteReq miss latency
+system.cpu.dcache.WriteReq_avg_mshr_miss_latency  7136.918605                       # average WriteReq mshr miss latency
+system.cpu.dcache.WriteReq_hits                   869                       # number of WriteReq hits
+system.cpu.dcache.WriteReq_miss_latency       3367327                       # number of WriteReq miss cycles
+system.cpu.dcache.WriteReq_miss_rate         0.327399                       # miss rate for WriteReq accesses
+system.cpu.dcache.WriteReq_misses                 423                       # number of WriteReq misses
+system.cpu.dcache.WriteReq_mshr_hits              337                       # number of WriteReq MSHR hits
+system.cpu.dcache.WriteReq_mshr_miss_latency       613775                       # number of WriteReq MSHR miss cycles
+system.cpu.dcache.WriteReq_mshr_miss_rate     0.066563                       # mshr miss rate for WriteReq accesses
+system.cpu.dcache.WriteReq_mshr_misses             86                       # number of WriteReq MSHR misses
+system.cpu.dcache.avg_blocked_cycles_no_mshrs <err: div-0>                       # average number of cycles each access was blocked
+system.cpu.dcache.avg_blocked_cycles_no_targets <err: div-0>                       # average number of cycles each access was blocked
+system.cpu.dcache.avg_refs                  22.881579                       # Average number of references to valid blocks.
+system.cpu.dcache.blocked_no_mshrs                  0                       # number of cycles access was blocked
+system.cpu.dcache.blocked_no_targets                0                       # number of cycles access was blocked
+system.cpu.dcache.blocked_cycles_no_mshrs            0                       # number of cycles access was blocked
+system.cpu.dcache.blocked_cycles_no_targets            0                       # number of cycles access was blocked
+system.cpu.dcache.cache_copies                      0                       # number of cache copies performed
+system.cpu.dcache.demand_accesses                4029                       # number of demand (read+write) accesses
+system.cpu.dcache.demand_avg_miss_latency  7629.664273                       # average overall miss latency
+system.cpu.dcache.demand_avg_mshr_miss_latency  6865.546053                       # average overall mshr miss latency
+system.cpu.dcache.demand_hits                    3472                       # number of demand (read+write) hits
+system.cpu.dcache.demand_miss_latency         4249723                       # number of demand (read+write) miss cycles
+system.cpu.dcache.demand_miss_rate           0.138248                       # miss rate for demand accesses
+system.cpu.dcache.demand_misses                   557                       # number of demand (read+write) misses
+system.cpu.dcache.demand_mshr_hits                405                       # number of demand (read+write) MSHR hits
+system.cpu.dcache.demand_mshr_miss_latency      1043563                       # number of demand (read+write) MSHR miss cycles
+system.cpu.dcache.demand_mshr_miss_rate      0.037726                       # mshr miss rate for demand accesses
+system.cpu.dcache.demand_mshr_misses              152                       # number of demand (read+write) MSHR misses
+system.cpu.dcache.fast_writes                       0                       # number of fast writes performed
+system.cpu.dcache.mshr_cap_events                   0                       # number of times MSHR cap was activated
+system.cpu.dcache.no_allocate_misses                0                       # Number of misses that were no-allocate
+system.cpu.dcache.overall_accesses               4029                       # number of overall (read+write) accesses
+system.cpu.dcache.overall_avg_miss_latency  7629.664273                       # average overall miss latency
+system.cpu.dcache.overall_avg_mshr_miss_latency  6865.546053                       # average overall mshr miss latency
+system.cpu.dcache.overall_avg_mshr_uncacheable_latency <err: div-0>                       # average overall mshr uncacheable latency
+system.cpu.dcache.overall_hits                   3472                       # number of overall hits
+system.cpu.dcache.overall_miss_latency        4249723                       # number of overall miss cycles
+system.cpu.dcache.overall_miss_rate          0.138248                       # miss rate for overall accesses
+system.cpu.dcache.overall_misses                  557                       # number of overall misses
+system.cpu.dcache.overall_mshr_hits               405                       # number of overall MSHR hits
+system.cpu.dcache.overall_mshr_miss_latency      1043563                       # number of overall MSHR miss cycles
+system.cpu.dcache.overall_mshr_miss_rate     0.037726                       # mshr miss rate for overall accesses
+system.cpu.dcache.overall_mshr_misses             152                       # number of overall MSHR misses
+system.cpu.dcache.overall_mshr_uncacheable_latency            0                       # number of overall MSHR uncacheable cycles
+system.cpu.dcache.overall_mshr_uncacheable_misses            0                       # number of overall MSHR uncacheable misses
+system.cpu.dcache.prefetcher.num_hwpf_already_in_cache            0                       # number of hwpf that were already in the cache
+system.cpu.dcache.prefetcher.num_hwpf_already_in_mshr            0                       # number of hwpf that were already in mshr
+system.cpu.dcache.prefetcher.num_hwpf_already_in_prefetcher            0                       # number of hwpf that were already in the prefetch queue
+system.cpu.dcache.prefetcher.num_hwpf_evicted            0                       # number of hwpf removed due to no buffer left
+system.cpu.dcache.prefetcher.num_hwpf_identified            0                       # number of hwpf identified
+system.cpu.dcache.prefetcher.num_hwpf_issued            0                       # number of hwpf issued
+system.cpu.dcache.prefetcher.num_hwpf_removed_MSHR_hit            0                       # number of hwpf removed because MSHR allocated
+system.cpu.dcache.prefetcher.num_hwpf_span_page            0                       # number of hwpf spanning a virtual page
+system.cpu.dcache.prefetcher.num_hwpf_squashed_from_miss            0                       # number of hwpf that got squashed due to a miss aborting calculation time
+system.cpu.dcache.replacements                      0                       # number of replacements
+system.cpu.dcache.sampled_refs                    152                       # Sample count of references to valid blocks.
+system.cpu.dcache.soft_prefetch_mshr_full            0                       # number of mshr full events for SW prefetching instrutions
+system.cpu.dcache.tagsinuse                 90.938737                       # Cycle average of tags in use
+system.cpu.dcache.total_refs                     3478                       # Total number of references to valid blocks.
+system.cpu.dcache.warmup_cycle                      0                       # Cycle when the warmup percentage was hit.
+system.cpu.dcache.writebacks                        0                       # number of writebacks
+system.cpu.decode.DECODE:BlockedCycles         192719                       # Number of cycles decode is blocked
+system.cpu.decode.DECODE:DecodedInsts           39774                       # Number of instructions handled by decode
+system.cpu.decode.DECODE:IdleCycles             20128                       # Number of cycles decode is idle
+system.cpu.decode.DECODE:RunCycles               8238                       # Number of cycles decode is running
+system.cpu.decode.DECODE:SquashCycles            3162                       # Number of cycles decode is squashing
+system.cpu.decode.DECODE:UnblockCycles            264                       # Number of cycles decode is unblocking
+system.cpu.fetch.Branches                        7846                       # Number of branches that fetch encountered
+system.cpu.fetch.CacheLines                      5085                       # Number of cache lines fetched
+system.cpu.fetch.Cycles                         14399                       # Number of cycles fetch has run and was not squashing or blocked
+system.cpu.fetch.IcacheSquashes                   745                       # Number of outstanding Icache misses that were squashed
+system.cpu.fetch.Insts                          43304                       # Number of instructions fetch has processed
+system.cpu.fetch.SquashCycles                    2134                       # Number of cycles fetch has spent squashing
+system.cpu.fetch.branchRate                  0.034947                       # Number of branch fetches per cycle
+system.cpu.fetch.icacheStallCycles               5085                       # Number of cycles fetch is stalled on an Icache miss
+system.cpu.fetch.predictedBranches               2990                       # Number of branches that fetch has predicted taken
+system.cpu.fetch.rate                        0.192881                       # Number of inst fetches per cycle
+system.cpu.fetch.rateDist.start_dist                           # Number of instructions fetched each cycle (Total)
+system.cpu.fetch.rateDist.samples              224511                      
+system.cpu.fetch.rateDist.min_value                 0                      
+                               0       215198   9585.19%           
+                               1         2258    100.57%           
+                               2          627     27.93%           
+                               3          958     42.67%           
+                               4          553     24.63%           
+                               5          816     36.35%           
+                               6          951     42.36%           
+                               7          280     12.47%           
+                               8         2870    127.83%           
+system.cpu.fetch.rateDist.max_value                 8                      
+system.cpu.fetch.rateDist.end_dist
+
+system.cpu.icache.ReadReq_accesses               5085                       # number of ReadReq accesses(hits+misses)
+system.cpu.icache.ReadReq_avg_miss_latency  5148.266776                       # average ReadReq miss latency
+system.cpu.icache.ReadReq_avg_mshr_miss_latency  4502.972752                       # average ReadReq mshr miss latency
+system.cpu.icache.ReadReq_hits                   4474                       # number of ReadReq hits
+system.cpu.icache.ReadReq_miss_latency        3145591                       # number of ReadReq miss cycles
+system.cpu.icache.ReadReq_miss_rate          0.120157                       # miss rate for ReadReq accesses
+system.cpu.icache.ReadReq_misses                  611                       # number of ReadReq misses
+system.cpu.icache.ReadReq_mshr_hits               244                       # number of ReadReq MSHR hits
+system.cpu.icache.ReadReq_mshr_miss_latency      1652591                       # number of ReadReq MSHR miss cycles
+system.cpu.icache.ReadReq_mshr_miss_rate     0.072173                       # mshr miss rate for ReadReq accesses
+system.cpu.icache.ReadReq_mshr_misses             367                       # number of ReadReq MSHR misses
+system.cpu.icache.avg_blocked_cycles_no_mshrs <err: div-0>                       # average number of cycles each access was blocked
+system.cpu.icache.avg_blocked_cycles_no_targets <err: div-0>                       # average number of cycles each access was blocked
+system.cpu.icache.avg_refs                  12.325069                       # Average number of references to valid blocks.
+system.cpu.icache.blocked_no_mshrs                  0                       # number of cycles access was blocked
+system.cpu.icache.blocked_no_targets                0                       # number of cycles access was blocked
+system.cpu.icache.blocked_cycles_no_mshrs            0                       # number of cycles access was blocked
+system.cpu.icache.blocked_cycles_no_targets            0                       # number of cycles access was blocked
+system.cpu.icache.cache_copies                      0                       # number of cache copies performed
+system.cpu.icache.demand_accesses                5085                       # number of demand (read+write) accesses
+system.cpu.icache.demand_avg_miss_latency  5148.266776                       # average overall miss latency
+system.cpu.icache.demand_avg_mshr_miss_latency  4502.972752                       # average overall mshr miss latency
+system.cpu.icache.demand_hits                    4474                       # number of demand (read+write) hits
+system.cpu.icache.demand_miss_latency         3145591                       # number of demand (read+write) miss cycles
+system.cpu.icache.demand_miss_rate           0.120157                       # miss rate for demand accesses
+system.cpu.icache.demand_misses                   611                       # number of demand (read+write) misses
+system.cpu.icache.demand_mshr_hits                244                       # number of demand (read+write) MSHR hits
+system.cpu.icache.demand_mshr_miss_latency      1652591                       # number of demand (read+write) MSHR miss cycles
+system.cpu.icache.demand_mshr_miss_rate      0.072173                       # mshr miss rate for demand accesses
+system.cpu.icache.demand_mshr_misses              367                       # number of demand (read+write) MSHR misses
+system.cpu.icache.fast_writes                       0                       # number of fast writes performed
+system.cpu.icache.mshr_cap_events                   0                       # number of times MSHR cap was activated
+system.cpu.icache.no_allocate_misses                0                       # Number of misses that were no-allocate
+system.cpu.icache.overall_accesses               5085                       # number of overall (read+write) accesses
+system.cpu.icache.overall_avg_miss_latency  5148.266776                       # average overall miss latency
+system.cpu.icache.overall_avg_mshr_miss_latency  4502.972752                       # average overall mshr miss latency
+system.cpu.icache.overall_avg_mshr_uncacheable_latency <err: div-0>                       # average overall mshr uncacheable latency
+system.cpu.icache.overall_hits                   4474                       # number of overall hits
+system.cpu.icache.overall_miss_latency        3145591                       # number of overall miss cycles
+system.cpu.icache.overall_miss_rate          0.120157                       # miss rate for overall accesses
+system.cpu.icache.overall_misses                  611                       # number of overall misses
+system.cpu.icache.overall_mshr_hits               244                       # number of overall MSHR hits
+system.cpu.icache.overall_mshr_miss_latency      1652591                       # number of overall MSHR miss cycles
+system.cpu.icache.overall_mshr_miss_rate     0.072173                       # mshr miss rate for overall accesses
+system.cpu.icache.overall_mshr_misses             367                       # number of overall MSHR misses
+system.cpu.icache.overall_mshr_uncacheable_latency            0                       # number of overall MSHR uncacheable cycles
+system.cpu.icache.overall_mshr_uncacheable_misses            0                       # number of overall MSHR uncacheable misses
+system.cpu.icache.prefetcher.num_hwpf_already_in_cache            0                       # number of hwpf that were already in the cache
+system.cpu.icache.prefetcher.num_hwpf_already_in_mshr            0                       # number of hwpf that were already in mshr
+system.cpu.icache.prefetcher.num_hwpf_already_in_prefetcher            0                       # number of hwpf that were already in the prefetch queue
+system.cpu.icache.prefetcher.num_hwpf_evicted            0                       # number of hwpf removed due to no buffer left
+system.cpu.icache.prefetcher.num_hwpf_identified            0                       # number of hwpf identified
+system.cpu.icache.prefetcher.num_hwpf_issued            0                       # number of hwpf issued
+system.cpu.icache.prefetcher.num_hwpf_removed_MSHR_hit            0                       # number of hwpf removed because MSHR allocated
+system.cpu.icache.prefetcher.num_hwpf_span_page            0                       # number of hwpf spanning a virtual page
+system.cpu.icache.prefetcher.num_hwpf_squashed_from_miss            0                       # number of hwpf that got squashed due to a miss aborting calculation time
+system.cpu.icache.replacements                      1                       # number of replacements
+system.cpu.icache.sampled_refs                    363                       # Sample count of references to valid blocks.
+system.cpu.icache.soft_prefetch_mshr_full            0                       # number of mshr full events for SW prefetching instrutions
+system.cpu.icache.tagsinuse                172.869174                       # Cycle average of tags in use
+system.cpu.icache.total_refs                     4474                       # Total number of references to valid blocks.
+system.cpu.icache.warmup_cycle                      0                       # Cycle when the warmup percentage was hit.
+system.cpu.icache.writebacks                        0                       # number of writebacks
+system.cpu.idleCycles                         1196701                       # Total number of cycles that the CPU has spent unscheduled due to idling
+system.cpu.iew.EXEC:branches                     3576                       # Number of branches executed
+system.cpu.iew.EXEC:nop                             0                       # number of nop insts executed
+system.cpu.iew.EXEC:rate                     0.092548                       # Inst execution rate
+system.cpu.iew.EXEC:refs                         5257                       # number of memory reference insts executed
+system.cpu.iew.EXEC:stores                       2386                       # Number of stores executed
+system.cpu.iew.EXEC:swp                             0                       # number of swp insts executed
+system.cpu.iew.WB:consumers                      9737                       # num instructions consuming a value
+system.cpu.iew.WB:count                         19769                       # cumulative count of insts written-back
+system.cpu.iew.WB:fanout                     0.790901                       # average fanout of values written-back
+system.cpu.iew.WB:penalized                         0                       # number of instrctions required to write to 'other' IQ
+system.cpu.iew.WB:penalized_rate                    0                       # fraction of instructions written-back that wrote to 'other' IQ
+system.cpu.iew.WB:producers                      7701                       # num instructions producing a value
+system.cpu.iew.WB:rate                       0.088054                       # insts written-back per cycle
+system.cpu.iew.WB:sent                          20061                       # cumulative count of insts sent to commit
+system.cpu.iew.branchMispredicts                 2593                       # Number of branch mispredicts detected at execute
+system.cpu.iew.iewBlockCycles                     476                       # Number of cycles IEW is blocking
+system.cpu.iew.iewDispLoadInsts                  3250                       # Number of dispatched load instructions
+system.cpu.iew.iewDispNonSpecInsts                617                       # Number of dispatched non-speculative instructions
+system.cpu.iew.iewDispSquashedInsts              2705                       # Number of squashed instructions skipped by dispatch
+system.cpu.iew.iewDispStoreInsts                 2817                       # Number of dispatched store instructions
+system.cpu.iew.iewDispatchedInsts               25240                       # Number of instructions dispatched to IQ
+system.cpu.iew.iewExecLoadInsts                  2871                       # Number of load instructions executed
+system.cpu.iew.iewExecSquashedInsts              1780                       # Number of squashed instructions skipped in execute
+system.cpu.iew.iewExecutedInsts                 20778                       # Number of executed instructions
+system.cpu.iew.iewIQFullEvents                      7                       # Number of times the IQ has become full, causing a stall
+system.cpu.iew.iewIdleCycles                        0                       # Number of cycles IEW is idle
+system.cpu.iew.iewLSQFullEvents                     0                       # Number of times the LSQ has become full, causing a stall
+system.cpu.iew.iewSquashCycles                   3162                       # Number of cycles IEW is squashing
+system.cpu.iew.iewUnblockCycles                    35                       # Number of cycles IEW is unblocking
+system.cpu.iew.lsq.thread.0.blockedLoads            0                       # Number of blocked loads due to partial load-store forwarding
+system.cpu.iew.lsq.thread.0.cacheBlocked            0                       # Number of times an access to memory failed due to the cache being blocked
+system.cpu.iew.lsq.thread.0.forwLoads              39                       # Number of loads that had data forwarded from stores
+system.cpu.iew.lsq.thread.0.ignoredResponses            5                       # Number of memory responses ignored because the instruction is squashed
+system.cpu.iew.lsq.thread.0.invAddrLoads            0                       # Number of loads ignored due to an invalid address
+system.cpu.iew.lsq.thread.0.invAddrSwpfs            0                       # Number of software prefetches ignored due to an invalid address
+system.cpu.iew.lsq.thread.0.memOrderViolation           54                       # Number of memory ordering violations
+system.cpu.iew.lsq.thread.0.rescheduledLoads            0                       # Number of loads that were rescheduled
+system.cpu.iew.lsq.thread.0.squashedLoads         1788                       # Number of loads squashed
+system.cpu.iew.lsq.thread.0.squashedStores         1519                       # Number of stores squashed
+system.cpu.iew.memOrderViolationEvents             54                       # Number of memory order violations
+system.cpu.iew.predictedNotTakenIncorrect          962                       # Number of branches that were predicted not taken incorrectly
+system.cpu.iew.predictedTakenIncorrect           1631                       # Number of branches that were predicted taken incorrectly
+system.cpu.ipc                               0.007723                       # IPC: Instructions Per Cycle
+system.cpu.ipc_total                         0.007723                       # IPC: Total IPC of All Threads
+system.cpu.iq.ISSUE:FU_type_0                   22558                       # Type of FU issued
+system.cpu.iq.ISSUE:FU_type_0.start_dist
+                          (null)         1831      8.12%            # Type of FU issued
+                          IntAlu        15054     66.73%            # Type of FU issued
+                         IntMult            0      0.00%            # Type of FU issued
+                          IntDiv            0      0.00%            # Type of FU issued
+                        FloatAdd            0      0.00%            # Type of FU issued
+                        FloatCmp            0      0.00%            # Type of FU issued
+                        FloatCvt            0      0.00%            # Type of FU issued
+                       FloatMult            0      0.00%            # Type of FU issued
+                        FloatDiv            0      0.00%            # Type of FU issued
+                       FloatSqrt            0      0.00%            # Type of FU issued
+                         MemRead         3091     13.70%            # Type of FU issued
+                        MemWrite         2582     11.45%            # Type of FU issued
+                       IprAccess            0      0.00%            # Type of FU issued
+                    InstPrefetch            0      0.00%            # Type of FU issued
+system.cpu.iq.ISSUE:FU_type_0.end_dist
+system.cpu.iq.ISSUE:fu_busy_cnt                   162                       # FU busy when requested
+system.cpu.iq.ISSUE:fu_busy_rate             0.007181                       # FU busy rate (busy events/executed inst)
+system.cpu.iq.ISSUE:fu_full.start_dist
+                          (null)            0      0.00%            # attempts to use FU when none available
+                          IntAlu           42     25.93%            # attempts to use FU when none available
+                         IntMult            0      0.00%            # attempts to use FU when none available
+                          IntDiv            0      0.00%            # attempts to use FU when none available
+                        FloatAdd            0      0.00%            # attempts to use FU when none available
+                        FloatCmp            0      0.00%            # attempts to use FU when none available
+                        FloatCvt            0      0.00%            # attempts to use FU when none available
+                       FloatMult            0      0.00%            # attempts to use FU when none available
+                        FloatDiv            0      0.00%            # attempts to use FU when none available
+                       FloatSqrt            0      0.00%            # attempts to use FU when none available
+                         MemRead           14      8.64%            # attempts to use FU when none available
+                        MemWrite          106     65.43%            # attempts to use FU when none available
+                       IprAccess            0      0.00%            # attempts to use FU when none available
+                    InstPrefetch            0      0.00%            # attempts to use FU when none available
+system.cpu.iq.ISSUE:fu_full.end_dist
+system.cpu.iq.ISSUE:issued_per_cycle.start_dist                     # Number of insts issued each cycle
+system.cpu.iq.ISSUE:issued_per_cycle.samples       224511                      
+system.cpu.iq.ISSUE:issued_per_cycle.min_value            0                      
+                               0       215315   9590.40%           
+                               1         4124    183.69%           
+                               2         1297     57.77%           
+                               3         1306     58.17%           
+                               4         1190     53.00%           
+                               5          707     31.49%           
+                               6          433     19.29%           
+                               7           83      3.70%           
+                               8           56      2.49%           
+system.cpu.iq.ISSUE:issued_per_cycle.max_value            8                      
+system.cpu.iq.ISSUE:issued_per_cycle.end_dist
+
+system.cpu.iq.ISSUE:rate                     0.100476                       # Inst issue rate
+system.cpu.iq.iqInstsAdded                      24623                       # Number of instructions added to the IQ (excludes non-spec)
+system.cpu.iq.iqInstsIssued                     22558                       # Number of instructions issued
+system.cpu.iq.iqNonSpecInstsAdded                 617                       # Number of non-speculative instructions added to the IQ
+system.cpu.iq.iqSquashedInstsExamined           11469                       # Number of squashed instructions iterated over during squash; mainly for profiling
+system.cpu.iq.iqSquashedInstsIssued               174                       # Number of squashed instructions issued
+system.cpu.iq.iqSquashedNonSpecRemoved            290                       # Number of squashed non-spec instructions that were removed
+system.cpu.iq.iqSquashedOperandsExamined         5834                       # Number of squashed operands that are examined and possibly removed from graph
+system.cpu.l2cache.ReadReq_accesses               513                       # number of ReadReq accesses(hits+misses)
+system.cpu.l2cache.ReadReq_avg_miss_latency  4754.779727                       # average ReadReq miss latency
+system.cpu.l2cache.ReadReq_avg_mshr_miss_latency  2343.506823                       # average ReadReq mshr miss latency
+system.cpu.l2cache.ReadReq_miss_latency       2439202                       # number of ReadReq miss cycles
+system.cpu.l2cache.ReadReq_miss_rate                1                       # miss rate for ReadReq accesses
+system.cpu.l2cache.ReadReq_misses                 513                       # number of ReadReq misses
+system.cpu.l2cache.ReadReq_mshr_miss_latency      1202219                       # number of ReadReq MSHR miss cycles
+system.cpu.l2cache.ReadReq_mshr_miss_rate            1                       # mshr miss rate for ReadReq accesses
+system.cpu.l2cache.ReadReq_mshr_misses            513                       # number of ReadReq MSHR misses
+system.cpu.l2cache.avg_blocked_cycles_no_mshrs <err: div-0>                       # average number of cycles each access was blocked
+system.cpu.l2cache.avg_blocked_cycles_no_targets <err: div-0>                       # average number of cycles each access was blocked
+system.cpu.l2cache.avg_refs                         0                       # Average number of references to valid blocks.
+system.cpu.l2cache.blocked_no_mshrs                 0                       # number of cycles access was blocked
+system.cpu.l2cache.blocked_no_targets               0                       # number of cycles access was blocked
+system.cpu.l2cache.blocked_cycles_no_mshrs            0                       # number of cycles access was blocked
+system.cpu.l2cache.blocked_cycles_no_targets            0                       # number of cycles access was blocked
+system.cpu.l2cache.cache_copies                     0                       # number of cache copies performed
+system.cpu.l2cache.demand_accesses                513                       # number of demand (read+write) accesses
+system.cpu.l2cache.demand_avg_miss_latency  4754.779727                       # average overall miss latency
+system.cpu.l2cache.demand_avg_mshr_miss_latency  2343.506823                       # average overall mshr miss latency
+system.cpu.l2cache.demand_hits                      0                       # number of demand (read+write) hits
+system.cpu.l2cache.demand_miss_latency        2439202                       # number of demand (read+write) miss cycles
+system.cpu.l2cache.demand_miss_rate                 1                       # miss rate for demand accesses
+system.cpu.l2cache.demand_misses                  513                       # number of demand (read+write) misses
+system.cpu.l2cache.demand_mshr_hits                 0                       # number of demand (read+write) MSHR hits
+system.cpu.l2cache.demand_mshr_miss_latency      1202219                       # number of demand (read+write) MSHR miss cycles
+system.cpu.l2cache.demand_mshr_miss_rate            1                       # mshr miss rate for demand accesses
+system.cpu.l2cache.demand_mshr_misses             513                       # number of demand (read+write) MSHR misses
+system.cpu.l2cache.fast_writes                      0                       # number of fast writes performed
+system.cpu.l2cache.mshr_cap_events                  0                       # number of times MSHR cap was activated
+system.cpu.l2cache.no_allocate_misses               0                       # Number of misses that were no-allocate
+system.cpu.l2cache.overall_accesses               513                       # number of overall (read+write) accesses
+system.cpu.l2cache.overall_avg_miss_latency  4754.779727                       # average overall miss latency
+system.cpu.l2cache.overall_avg_mshr_miss_latency  2343.506823                       # average overall mshr miss latency
+system.cpu.l2cache.overall_avg_mshr_uncacheable_latency <err: div-0>                       # average overall mshr uncacheable latency
+system.cpu.l2cache.overall_hits                     0                       # number of overall hits
+system.cpu.l2cache.overall_miss_latency       2439202                       # number of overall miss cycles
+system.cpu.l2cache.overall_miss_rate                1                       # miss rate for overall accesses
+system.cpu.l2cache.overall_misses                 513                       # number of overall misses
+system.cpu.l2cache.overall_mshr_hits                0                       # number of overall MSHR hits
+system.cpu.l2cache.overall_mshr_miss_latency      1202219                       # number of overall MSHR miss cycles
+system.cpu.l2cache.overall_mshr_miss_rate            1                       # mshr miss rate for overall accesses
+system.cpu.l2cache.overall_mshr_misses            513                       # number of overall MSHR misses
+system.cpu.l2cache.overall_mshr_uncacheable_latency            0                       # number of overall MSHR uncacheable cycles
+system.cpu.l2cache.overall_mshr_uncacheable_misses            0                       # number of overall MSHR uncacheable misses
+system.cpu.l2cache.prefetcher.num_hwpf_already_in_cache            0                       # number of hwpf that were already in the cache
+system.cpu.l2cache.prefetcher.num_hwpf_already_in_mshr            0                       # number of hwpf that were already in mshr
+system.cpu.l2cache.prefetcher.num_hwpf_already_in_prefetcher            0                       # number of hwpf that were already in the prefetch queue
+system.cpu.l2cache.prefetcher.num_hwpf_evicted            0                       # number of hwpf removed due to no buffer left
+system.cpu.l2cache.prefetcher.num_hwpf_identified            0                       # number of hwpf identified
+system.cpu.l2cache.prefetcher.num_hwpf_issued            0                       # number of hwpf issued
+system.cpu.l2cache.prefetcher.num_hwpf_removed_MSHR_hit            0                       # number of hwpf removed because MSHR allocated
+system.cpu.l2cache.prefetcher.num_hwpf_span_page            0                       # number of hwpf spanning a virtual page
+system.cpu.l2cache.prefetcher.num_hwpf_squashed_from_miss            0                       # number of hwpf that got squashed due to a miss aborting calculation time
+system.cpu.l2cache.replacements                     0                       # number of replacements
+system.cpu.l2cache.sampled_refs                   512                       # Sample count of references to valid blocks.
+system.cpu.l2cache.soft_prefetch_mshr_full            0                       # number of mshr full events for SW prefetching instrutions
+system.cpu.l2cache.tagsinuse               262.946375                       # Cycle average of tags in use
+system.cpu.l2cache.total_refs                       0                       # Total number of references to valid blocks.
+system.cpu.l2cache.warmup_cycle                     0                       # Cycle when the warmup percentage was hit.
+system.cpu.l2cache.writebacks                       0                       # number of writebacks
+system.cpu.numCycles                           224511                       # number of cpu cycles simulated
+system.cpu.rename.RENAME:BlockCycles              960                       # Number of cycles rename is blocking
+system.cpu.rename.RENAME:CommittedMaps           9868                       # Number of HB maps that are committed
+system.cpu.rename.RENAME:IQFullEvents               2                       # Number of times rename has blocked due to IQ full
+system.cpu.rename.RENAME:IdleCycles             20098                       # Number of cycles rename is idle
+system.cpu.rename.RENAME:LSQFullEvents            481                       # Number of times rename has blocked due to LSQ full
+system.cpu.rename.RENAME:ROBFullEvents              4                       # Number of times rename has blocked due to ROB full
+system.cpu.rename.RENAME:RenameLookups          46931                       # Number of register rename lookups that rename has made
+system.cpu.rename.RENAME:RenamedInsts           31260                       # Number of instructions processed by rename
+system.cpu.rename.RENAME:RenamedOperands        25831                       # Number of destination operands rename has renamed
+system.cpu.rename.RENAME:RunCycles               7921                       # Number of cycles rename is running
+system.cpu.rename.RENAME:SquashCycles            3162                       # Number of cycles rename is squashing
+system.cpu.rename.RENAME:SquashedInsts           8042                       # Number of squashed instructions processed by rename
+system.cpu.rename.RENAME:UnblockCycles           1212                       # Number of cycles rename is unblocking
+system.cpu.rename.RENAME:UndoneMaps             15963                       # Number of HB maps that are undone due to squashing
+system.cpu.rename.RENAME:serializeStallCycles       190573                       # count of cycles rename stalled for serializing inst
+system.cpu.rename.RENAME:serializingInsts          638                       # count of serializing insts renamed
+system.cpu.rename.RENAME:skidInsts               5594                       # count of insts added to the skid buffer
+system.cpu.rename.RENAME:tempSerializingInsts          629                       # count of temporary serializing insts renamed
+system.cpu.timesIdled                             289                       # Number of times that the entire CPU went into an idle state and unscheduled itself
+system.cpu.workload.PROG:num_syscalls               8                       # Number of system calls
+
+---------- End Simulation Statistics   ----------
diff --git a/tests/quick/02.insttest/ref/sparc/linux/o3-timing/stderr b/tests/quick/02.insttest/ref/sparc/linux/o3-timing/stderr
new file mode 100644
index 000000000..48affb0e2
--- /dev/null
+++ b/tests/quick/02.insttest/ref/sparc/linux/o3-timing/stderr
@@ -0,0 +1,4 @@
+warn: More than two loadable segments in ELF object.
+warn: Ignoring segment @ 0x0 length 0x0.
+0: system.remote_gdb.listener: listening for remote gdb on port 7003
+warn: Entering event queue @ 0.  Starting simulation...
diff --git a/tests/quick/02.insttest/ref/sparc/linux/o3-timing/stdout b/tests/quick/02.insttest/ref/sparc/linux/o3-timing/stdout
new file mode 100644
index 000000000..6cba2ba7e
--- /dev/null
+++ b/tests/quick/02.insttest/ref/sparc/linux/o3-timing/stdout
@@ -0,0 +1,24 @@
+Begining test of difficult SPARC instructions...
+LDSTUB:		Passed
+SWAP:		Passed
+CAS FAIL:	Passed
+CAS WORK:	Passed
+CASX FAIL:	Passed
+CASX WORK:	Passed
+LDTX:		Passed
+LDTW:		Passed
+STTW:		Passed
+Done
+M5 Simulator System
+
+Copyright (c) 2001-2006
+The Regents of The University of Michigan
+All Rights Reserved
+
+
+M5 compiled Apr  9 2007 03:06:26
+M5 started Mon Apr  9 03:06:54 2007
+M5 executing on zizzer.eecs.umich.edu
+command line: build/SPARC_SE/m5.fast -d build/SPARC_SE/tests/fast/quick/02.insttest/sparc/linux/o3-timing tests/run.py quick/02.insttest/sparc/linux/o3-timing
+Global frequency set at 1000000000000 ticks per second
+Exiting @ tick 1421211 because target called exit()
diff --git a/tests/quick/02.insttest/ref/sparc/linux/simple-timing/config.ini b/tests/quick/02.insttest/ref/sparc/linux/simple-timing/config.ini
new file mode 100644
index 000000000..85d14933a
--- /dev/null
+++ b/tests/quick/02.insttest/ref/sparc/linux/simple-timing/config.ini
@@ -0,0 +1,187 @@
+[root]
+type=Root
+children=system
+dummy=0
+
+[system]
+type=System
+children=cpu membus physmem
+mem_mode=atomic
+physmem=system.physmem
+
+[system.cpu]
+type=TimingSimpleCPU
+children=dcache icache l2cache toL2Bus workload
+clock=1
+cpu_id=0
+defer_registration=false
+function_trace=false
+function_trace_start=0
+max_insts_all_threads=0
+max_insts_any_thread=0
+max_loads_all_threads=0
+max_loads_any_thread=0
+phase=0
+progress_interval=0
+system=system
+workload=system.cpu.workload
+dcache_port=system.cpu.dcache.cpu_side
+icache_port=system.cpu.icache.cpu_side
+
+[system.cpu.dcache]
+type=BaseCache
+adaptive_compression=false
+assoc=2
+block_size=64
+compressed_bus=false
+compression_latency=0
+hash_delay=1
+hit_latency=1
+latency=1
+lifo=false
+max_miss_count=0
+mshrs=10
+prefetch_access=false
+prefetch_cache_check_push=true
+prefetch_data_accesses_only=false
+prefetch_degree=1
+prefetch_latency=10
+prefetch_miss=false
+prefetch_past_page=false
+prefetch_policy=none
+prefetch_serial_squash=false
+prefetch_use_cpu_id=true
+prefetcher_size=100
+prioritizeRequests=false
+protocol=Null
+repl=Null
+size=262144
+split=false
+split_size=0
+store_compressed=false
+subblock_size=0
+tgts_per_mshr=5
+trace_addr=0
+two_queue=false
+write_buffers=8
+cpu_side=system.cpu.dcache_port
+mem_side=system.cpu.toL2Bus.port[1]
+
+[system.cpu.icache]
+type=BaseCache
+adaptive_compression=false
+assoc=2
+block_size=64
+compressed_bus=false
+compression_latency=0
+hash_delay=1
+hit_latency=1
+latency=1
+lifo=false
+max_miss_count=0
+mshrs=10
+prefetch_access=false
+prefetch_cache_check_push=true
+prefetch_data_accesses_only=false
+prefetch_degree=1
+prefetch_latency=10
+prefetch_miss=false
+prefetch_past_page=false
+prefetch_policy=none
+prefetch_serial_squash=false
+prefetch_use_cpu_id=true
+prefetcher_size=100
+prioritizeRequests=false
+protocol=Null
+repl=Null
+size=131072
+split=false
+split_size=0
+store_compressed=false
+subblock_size=0
+tgts_per_mshr=5
+trace_addr=0
+two_queue=false
+write_buffers=8
+cpu_side=system.cpu.icache_port
+mem_side=system.cpu.toL2Bus.port[0]
+
+[system.cpu.l2cache]
+type=BaseCache
+adaptive_compression=false
+assoc=2
+block_size=64
+compressed_bus=false
+compression_latency=0
+hash_delay=1
+hit_latency=1
+latency=1
+lifo=false
+max_miss_count=0
+mshrs=10
+prefetch_access=false
+prefetch_cache_check_push=true
+prefetch_data_accesses_only=false
+prefetch_degree=1
+prefetch_latency=10
+prefetch_miss=false
+prefetch_past_page=false
+prefetch_policy=none
+prefetch_serial_squash=false
+prefetch_use_cpu_id=true
+prefetcher_size=100
+prioritizeRequests=false
+protocol=Null
+repl=Null
+size=2097152
+split=false
+split_size=0
+store_compressed=false
+subblock_size=0
+tgts_per_mshr=5
+trace_addr=0
+two_queue=false
+write_buffers=8
+cpu_side=system.cpu.toL2Bus.port[2]
+mem_side=system.membus.port[1]
+
+[system.cpu.toL2Bus]
+type=Bus
+bus_id=0
+clock=1000
+responder_set=false
+width=64
+port=system.cpu.icache.mem_side system.cpu.dcache.mem_side system.cpu.l2cache.cpu_side
+
+[system.cpu.workload]
+type=LiveProcess
+cmd=insttest
+cwd=
+egid=100
+env=
+euid=100
+executable=tests/test-progs/insttest/bin/sparc/linux/insttest
+gid=100
+input=cin
+output=cout
+pid=100
+ppid=99
+system=system
+uid=100
+
+[system.membus]
+type=Bus
+bus_id=0
+clock=1000
+responder_set=false
+width=64
+port=system.physmem.port system.cpu.l2cache.mem_side
+
+[system.physmem]
+type=PhysicalMemory
+file=
+latency=1
+range=0:134217727
+zero=false
+port=system.membus.port[0]
+
diff --git a/tests/quick/02.insttest/ref/sparc/linux/simple-timing/config.out b/tests/quick/02.insttest/ref/sparc/linux/simple-timing/config.out
new file mode 100644
index 000000000..ec2d1886a
--- /dev/null
+++ b/tests/quick/02.insttest/ref/sparc/linux/simple-timing/config.out
@@ -0,0 +1,178 @@
+[root]
+type=Root
+dummy=0
+
+[system.physmem]
+type=PhysicalMemory
+file=
+range=[0,134217727]
+latency=1
+zero=false
+
+[system]
+type=System
+physmem=system.physmem
+mem_mode=atomic
+
+[system.membus]
+type=Bus
+bus_id=0
+clock=1000
+width=64
+responder_set=false
+
+[system.cpu.workload]
+type=LiveProcess
+cmd=insttest
+executable=tests/test-progs/insttest/bin/sparc/linux/insttest
+input=cin
+output=cout
+env=
+cwd=
+system=system
+uid=100
+euid=100
+gid=100
+egid=100
+pid=100
+ppid=99
+
+[system.cpu]
+type=TimingSimpleCPU
+max_insts_any_thread=0
+max_insts_all_threads=0
+max_loads_any_thread=0
+max_loads_all_threads=0
+progress_interval=0
+system=system
+cpu_id=0
+workload=system.cpu.workload
+clock=1
+phase=0
+defer_registration=false
+// width not specified
+function_trace=false
+function_trace_start=0
+// simulate_stalls not specified
+
+[system.cpu.toL2Bus]
+type=Bus
+bus_id=0
+clock=1000
+width=64
+responder_set=false
+
+[system.cpu.icache]
+type=BaseCache
+size=131072
+assoc=2
+block_size=64
+latency=1
+mshrs=10
+tgts_per_mshr=5
+write_buffers=8
+prioritizeRequests=false
+protocol=null
+trace_addr=0
+hash_delay=1
+repl=null
+compressed_bus=false
+store_compressed=false
+adaptive_compression=false
+compression_latency=0
+block_size=64
+max_miss_count=0
+addr_range=[0,18446744073709551615]
+split=false
+split_size=0
+lifo=false
+two_queue=false
+prefetch_miss=false
+prefetch_access=false
+prefetcher_size=100
+prefetch_past_page=false
+prefetch_serial_squash=false
+prefetch_latency=10
+prefetch_degree=1
+prefetch_policy=none
+prefetch_cache_check_push=true
+prefetch_use_cpu_id=true
+prefetch_data_accesses_only=false
+hit_latency=1
+
+[system.cpu.dcache]
+type=BaseCache
+size=262144
+assoc=2
+block_size=64
+latency=1
+mshrs=10
+tgts_per_mshr=5
+write_buffers=8
+prioritizeRequests=false
+protocol=null
+trace_addr=0
+hash_delay=1
+repl=null
+compressed_bus=false
+store_compressed=false
+adaptive_compression=false
+compression_latency=0
+block_size=64
+max_miss_count=0
+addr_range=[0,18446744073709551615]
+split=false
+split_size=0
+lifo=false
+two_queue=false
+prefetch_miss=false
+prefetch_access=false
+prefetcher_size=100
+prefetch_past_page=false
+prefetch_serial_squash=false
+prefetch_latency=10
+prefetch_degree=1
+prefetch_policy=none
+prefetch_cache_check_push=true
+prefetch_use_cpu_id=true
+prefetch_data_accesses_only=false
+hit_latency=1
+
+[system.cpu.l2cache]
+type=BaseCache
+size=2097152
+assoc=2
+block_size=64
+latency=1
+mshrs=10
+tgts_per_mshr=5
+write_buffers=8
+prioritizeRequests=false
+protocol=null
+trace_addr=0
+hash_delay=1
+repl=null
+compressed_bus=false
+store_compressed=false
+adaptive_compression=false
+compression_latency=0
+block_size=64
+max_miss_count=0
+addr_range=[0,18446744073709551615]
+split=false
+split_size=0
+lifo=false
+two_queue=false
+prefetch_miss=false
+prefetch_access=false
+prefetcher_size=100
+prefetch_past_page=false
+prefetch_serial_squash=false
+prefetch_latency=10
+prefetch_degree=1
+prefetch_policy=none
+prefetch_cache_check_push=true
+prefetch_use_cpu_id=true
+prefetch_data_accesses_only=false
+hit_latency=1
+
diff --git a/tests/quick/02.insttest/ref/sparc/linux/simple-timing/m5stats.txt b/tests/quick/02.insttest/ref/sparc/linux/simple-timing/m5stats.txt
new file mode 100644
index 000000000..a4396b3da
--- /dev/null
+++ b/tests/quick/02.insttest/ref/sparc/linux/simple-timing/m5stats.txt
@@ -0,0 +1,215 @@
+
+---------- Begin Simulation Statistics ----------
+host_inst_rate                                  39129                       # Simulator instruction rate (inst/s)
+host_mem_usage                                 153232                       # Number of bytes of host memory used
+host_seconds                                     0.28                       # Real time elapsed on the host
+host_tick_rate                                6030675                       # Simulator tick rate (ticks/s)
+sim_freq                                 1000000000000                       # Frequency of simulated ticks
+sim_insts                                       11001                       # Number of instructions simulated
+sim_seconds                                  0.000002                       # Number of seconds simulated
+sim_ticks                                     1698003                       # Number of ticks simulated
+system.cpu.dcache.ReadReq_accesses               1462                       # number of ReadReq accesses(hits+misses)
+system.cpu.dcache.ReadReq_avg_miss_latency  3977.759259                       # average ReadReq miss latency
+system.cpu.dcache.ReadReq_avg_mshr_miss_latency  2977.759259                       # average ReadReq mshr miss latency
+system.cpu.dcache.ReadReq_hits                   1408                       # number of ReadReq hits
+system.cpu.dcache.ReadReq_miss_latency         214799                       # number of ReadReq miss cycles
+system.cpu.dcache.ReadReq_miss_rate          0.036936                       # miss rate for ReadReq accesses
+system.cpu.dcache.ReadReq_misses                   54                       # number of ReadReq misses
+system.cpu.dcache.ReadReq_mshr_miss_latency       160799                       # number of ReadReq MSHR miss cycles
+system.cpu.dcache.ReadReq_mshr_miss_rate     0.036936                       # mshr miss rate for ReadReq accesses
+system.cpu.dcache.ReadReq_mshr_misses              54                       # number of ReadReq MSHR misses
+system.cpu.dcache.SwapReq_accesses                  6                       # number of SwapReq accesses(hits+misses)
+system.cpu.dcache.SwapReq_hits                      6                       # number of SwapReq hits
+system.cpu.dcache.WriteReq_accesses              1292                       # number of WriteReq accesses(hits+misses)
+system.cpu.dcache.WriteReq_avg_miss_latency  3963.647727                       # average WriteReq miss latency
+system.cpu.dcache.WriteReq_avg_mshr_miss_latency  2963.647727                       # average WriteReq mshr miss latency
+system.cpu.dcache.WriteReq_hits                  1204                       # number of WriteReq hits
+system.cpu.dcache.WriteReq_miss_latency        348801                       # number of WriteReq miss cycles
+system.cpu.dcache.WriteReq_miss_rate         0.068111                       # miss rate for WriteReq accesses
+system.cpu.dcache.WriteReq_misses                  88                       # number of WriteReq misses
+system.cpu.dcache.WriteReq_mshr_miss_latency       260801                       # number of WriteReq MSHR miss cycles
+system.cpu.dcache.WriteReq_mshr_miss_rate     0.068111                       # mshr miss rate for WriteReq accesses
+system.cpu.dcache.WriteReq_mshr_misses             88                       # number of WriteReq MSHR misses
+system.cpu.dcache.avg_blocked_cycles_no_mshrs <err: div-0>                       # average number of cycles each access was blocked
+system.cpu.dcache.avg_blocked_cycles_no_targets <err: div-0>                       # average number of cycles each access was blocked
+system.cpu.dcache.avg_refs                  18.436620                       # Average number of references to valid blocks.
+system.cpu.dcache.blocked_no_mshrs                  0                       # number of cycles access was blocked
+system.cpu.dcache.blocked_no_targets                0                       # number of cycles access was blocked
+system.cpu.dcache.blocked_cycles_no_mshrs            0                       # number of cycles access was blocked
+system.cpu.dcache.blocked_cycles_no_targets            0                       # number of cycles access was blocked
+system.cpu.dcache.cache_copies                      0                       # number of cache copies performed
+system.cpu.dcache.demand_accesses                2754                       # number of demand (read+write) accesses
+system.cpu.dcache.demand_avg_miss_latency  3969.014085                       # average overall miss latency
+system.cpu.dcache.demand_avg_mshr_miss_latency  2969.014085                       # average overall mshr miss latency
+system.cpu.dcache.demand_hits                    2612                       # number of demand (read+write) hits
+system.cpu.dcache.demand_miss_latency          563600                       # number of demand (read+write) miss cycles
+system.cpu.dcache.demand_miss_rate           0.051561                       # miss rate for demand accesses
+system.cpu.dcache.demand_misses                   142                       # number of demand (read+write) misses
+system.cpu.dcache.demand_mshr_hits                  0                       # number of demand (read+write) MSHR hits
+system.cpu.dcache.demand_mshr_miss_latency       421600                       # number of demand (read+write) MSHR miss cycles
+system.cpu.dcache.demand_mshr_miss_rate      0.051561                       # mshr miss rate for demand accesses
+system.cpu.dcache.demand_mshr_misses              142                       # number of demand (read+write) MSHR misses
+system.cpu.dcache.fast_writes                       0                       # number of fast writes performed
+system.cpu.dcache.mshr_cap_events                   0                       # number of times MSHR cap was activated
+system.cpu.dcache.no_allocate_misses                0                       # Number of misses that were no-allocate
+system.cpu.dcache.overall_accesses               2754                       # number of overall (read+write) accesses
+system.cpu.dcache.overall_avg_miss_latency  3969.014085                       # average overall miss latency
+system.cpu.dcache.overall_avg_mshr_miss_latency  2969.014085                       # average overall mshr miss latency
+system.cpu.dcache.overall_avg_mshr_uncacheable_latency <err: div-0>                       # average overall mshr uncacheable latency
+system.cpu.dcache.overall_hits                   2612                       # number of overall hits
+system.cpu.dcache.overall_miss_latency         563600                       # number of overall miss cycles
+system.cpu.dcache.overall_miss_rate          0.051561                       # miss rate for overall accesses
+system.cpu.dcache.overall_misses                  142                       # number of overall misses
+system.cpu.dcache.overall_mshr_hits                 0                       # number of overall MSHR hits
+system.cpu.dcache.overall_mshr_miss_latency       421600                       # number of overall MSHR miss cycles
+system.cpu.dcache.overall_mshr_miss_rate     0.051561                       # mshr miss rate for overall accesses
+system.cpu.dcache.overall_mshr_misses             142                       # number of overall MSHR misses
+system.cpu.dcache.overall_mshr_uncacheable_latency            0                       # number of overall MSHR uncacheable cycles
+system.cpu.dcache.overall_mshr_uncacheable_misses            0                       # number of overall MSHR uncacheable misses
+system.cpu.dcache.prefetcher.num_hwpf_already_in_cache            0                       # number of hwpf that were already in the cache
+system.cpu.dcache.prefetcher.num_hwpf_already_in_mshr            0                       # number of hwpf that were already in mshr
+system.cpu.dcache.prefetcher.num_hwpf_already_in_prefetcher            0                       # number of hwpf that were already in the prefetch queue
+system.cpu.dcache.prefetcher.num_hwpf_evicted            0                       # number of hwpf removed due to no buffer left
+system.cpu.dcache.prefetcher.num_hwpf_identified            0                       # number of hwpf identified
+system.cpu.dcache.prefetcher.num_hwpf_issued            0                       # number of hwpf issued
+system.cpu.dcache.prefetcher.num_hwpf_removed_MSHR_hit            0                       # number of hwpf removed because MSHR allocated
+system.cpu.dcache.prefetcher.num_hwpf_span_page            0                       # number of hwpf spanning a virtual page
+system.cpu.dcache.prefetcher.num_hwpf_squashed_from_miss            0                       # number of hwpf that got squashed due to a miss aborting calculation time
+system.cpu.dcache.replacements                      0                       # number of replacements
+system.cpu.dcache.sampled_refs                    142                       # Sample count of references to valid blocks.
+system.cpu.dcache.soft_prefetch_mshr_full            0                       # number of mshr full events for SW prefetching instrutions
+system.cpu.dcache.tagsinuse                 86.872921                       # Cycle average of tags in use
+system.cpu.dcache.total_refs                     2618                       # Total number of references to valid blocks.
+system.cpu.dcache.warmup_cycle                      0                       # Cycle when the warmup percentage was hit.
+system.cpu.dcache.writebacks                        0                       # number of writebacks
+system.cpu.icache.ReadReq_accesses              11002                       # number of ReadReq accesses(hits+misses)
+system.cpu.icache.ReadReq_avg_miss_latency  3961.367491                       # average ReadReq miss latency
+system.cpu.icache.ReadReq_avg_mshr_miss_latency  2961.367491                       # average ReadReq mshr miss latency
+system.cpu.icache.ReadReq_hits                  10719                       # number of ReadReq hits
+system.cpu.icache.ReadReq_miss_latency        1121067                       # number of ReadReq miss cycles
+system.cpu.icache.ReadReq_miss_rate          0.025723                       # miss rate for ReadReq accesses
+system.cpu.icache.ReadReq_misses                  283                       # number of ReadReq misses
+system.cpu.icache.ReadReq_mshr_miss_latency       838067                       # number of ReadReq MSHR miss cycles
+system.cpu.icache.ReadReq_mshr_miss_rate     0.025723                       # mshr miss rate for ReadReq accesses
+system.cpu.icache.ReadReq_mshr_misses             283                       # number of ReadReq MSHR misses
+system.cpu.icache.avg_blocked_cycles_no_mshrs <err: div-0>                       # average number of cycles each access was blocked
+system.cpu.icache.avg_blocked_cycles_no_targets <err: div-0>                       # average number of cycles each access was blocked
+system.cpu.icache.avg_refs                  37.876325                       # Average number of references to valid blocks.
+system.cpu.icache.blocked_no_mshrs                  0                       # number of cycles access was blocked
+system.cpu.icache.blocked_no_targets                0                       # number of cycles access was blocked
+system.cpu.icache.blocked_cycles_no_mshrs            0                       # number of cycles access was blocked
+system.cpu.icache.blocked_cycles_no_targets            0                       # number of cycles access was blocked
+system.cpu.icache.cache_copies                      0                       # number of cache copies performed
+system.cpu.icache.demand_accesses               11002                       # number of demand (read+write) accesses
+system.cpu.icache.demand_avg_miss_latency  3961.367491                       # average overall miss latency
+system.cpu.icache.demand_avg_mshr_miss_latency  2961.367491                       # average overall mshr miss latency
+system.cpu.icache.demand_hits                   10719                       # number of demand (read+write) hits
+system.cpu.icache.demand_miss_latency         1121067                       # number of demand (read+write) miss cycles
+system.cpu.icache.demand_miss_rate           0.025723                       # miss rate for demand accesses
+system.cpu.icache.demand_misses                   283                       # number of demand (read+write) misses
+system.cpu.icache.demand_mshr_hits                  0                       # number of demand (read+write) MSHR hits
+system.cpu.icache.demand_mshr_miss_latency       838067                       # number of demand (read+write) MSHR miss cycles
+system.cpu.icache.demand_mshr_miss_rate      0.025723                       # mshr miss rate for demand accesses
+system.cpu.icache.demand_mshr_misses              283                       # number of demand (read+write) MSHR misses
+system.cpu.icache.fast_writes                       0                       # number of fast writes performed
+system.cpu.icache.mshr_cap_events                   0                       # number of times MSHR cap was activated
+system.cpu.icache.no_allocate_misses                0                       # Number of misses that were no-allocate
+system.cpu.icache.overall_accesses              11002                       # number of overall (read+write) accesses
+system.cpu.icache.overall_avg_miss_latency  3961.367491                       # average overall miss latency
+system.cpu.icache.overall_avg_mshr_miss_latency  2961.367491                       # average overall mshr miss latency
+system.cpu.icache.overall_avg_mshr_uncacheable_latency <err: div-0>                       # average overall mshr uncacheable latency
+system.cpu.icache.overall_hits                  10719                       # number of overall hits
+system.cpu.icache.overall_miss_latency        1121067                       # number of overall miss cycles
+system.cpu.icache.overall_miss_rate          0.025723                       # miss rate for overall accesses
+system.cpu.icache.overall_misses                  283                       # number of overall misses
+system.cpu.icache.overall_mshr_hits                 0                       # number of overall MSHR hits
+system.cpu.icache.overall_mshr_miss_latency       838067                       # number of overall MSHR miss cycles
+system.cpu.icache.overall_mshr_miss_rate     0.025723                       # mshr miss rate for overall accesses
+system.cpu.icache.overall_mshr_misses             283                       # number of overall MSHR misses
+system.cpu.icache.overall_mshr_uncacheable_latency            0                       # number of overall MSHR uncacheable cycles
+system.cpu.icache.overall_mshr_uncacheable_misses            0                       # number of overall MSHR uncacheable misses
+system.cpu.icache.prefetcher.num_hwpf_already_in_cache            0                       # number of hwpf that were already in the cache
+system.cpu.icache.prefetcher.num_hwpf_already_in_mshr            0                       # number of hwpf that were already in mshr
+system.cpu.icache.prefetcher.num_hwpf_already_in_prefetcher            0                       # number of hwpf that were already in the prefetch queue
+system.cpu.icache.prefetcher.num_hwpf_evicted            0                       # number of hwpf removed due to no buffer left
+system.cpu.icache.prefetcher.num_hwpf_identified            0                       # number of hwpf identified
+system.cpu.icache.prefetcher.num_hwpf_issued            0                       # number of hwpf issued
+system.cpu.icache.prefetcher.num_hwpf_removed_MSHR_hit            0                       # number of hwpf removed because MSHR allocated
+system.cpu.icache.prefetcher.num_hwpf_span_page            0                       # number of hwpf spanning a virtual page
+system.cpu.icache.prefetcher.num_hwpf_squashed_from_miss            0                       # number of hwpf that got squashed due to a miss aborting calculation time
+system.cpu.icache.replacements                      0                       # number of replacements
+system.cpu.icache.sampled_refs                    283                       # Sample count of references to valid blocks.
+system.cpu.icache.soft_prefetch_mshr_full            0                       # number of mshr full events for SW prefetching instrutions
+system.cpu.icache.tagsinuse                125.297191                       # Cycle average of tags in use
+system.cpu.icache.total_refs                    10719                       # Total number of references to valid blocks.
+system.cpu.icache.warmup_cycle                      0                       # Cycle when the warmup percentage was hit.
+system.cpu.icache.writebacks                        0                       # number of writebacks
+system.cpu.idle_fraction                            0                       # Percentage of idle cycles
+system.cpu.l2cache.ReadReq_accesses               423                       # number of ReadReq accesses(hits+misses)
+system.cpu.l2cache.ReadReq_avg_miss_latency  2968.515366                       # average ReadReq miss latency
+system.cpu.l2cache.ReadReq_avg_mshr_miss_latency  1967.515366                       # average ReadReq mshr miss latency
+system.cpu.l2cache.ReadReq_miss_latency       1255682                       # number of ReadReq miss cycles
+system.cpu.l2cache.ReadReq_miss_rate                1                       # miss rate for ReadReq accesses
+system.cpu.l2cache.ReadReq_misses                 423                       # number of ReadReq misses
+system.cpu.l2cache.ReadReq_mshr_miss_latency       832259                       # number of ReadReq MSHR miss cycles
+system.cpu.l2cache.ReadReq_mshr_miss_rate            1                       # mshr miss rate for ReadReq accesses
+system.cpu.l2cache.ReadReq_mshr_misses            423                       # number of ReadReq MSHR misses
+system.cpu.l2cache.avg_blocked_cycles_no_mshrs <err: div-0>                       # average number of cycles each access was blocked
+system.cpu.l2cache.avg_blocked_cycles_no_targets <err: div-0>                       # average number of cycles each access was blocked
+system.cpu.l2cache.avg_refs                         0                       # Average number of references to valid blocks.
+system.cpu.l2cache.blocked_no_mshrs                 0                       # number of cycles access was blocked
+system.cpu.l2cache.blocked_no_targets               0                       # number of cycles access was blocked
+system.cpu.l2cache.blocked_cycles_no_mshrs            0                       # number of cycles access was blocked
+system.cpu.l2cache.blocked_cycles_no_targets            0                       # number of cycles access was blocked
+system.cpu.l2cache.cache_copies                     0                       # number of cache copies performed
+system.cpu.l2cache.demand_accesses                423                       # number of demand (read+write) accesses
+system.cpu.l2cache.demand_avg_miss_latency  2968.515366                       # average overall miss latency
+system.cpu.l2cache.demand_avg_mshr_miss_latency  1967.515366                       # average overall mshr miss latency
+system.cpu.l2cache.demand_hits                      0                       # number of demand (read+write) hits
+system.cpu.l2cache.demand_miss_latency        1255682                       # number of demand (read+write) miss cycles
+system.cpu.l2cache.demand_miss_rate                 1                       # miss rate for demand accesses
+system.cpu.l2cache.demand_misses                  423                       # number of demand (read+write) misses
+system.cpu.l2cache.demand_mshr_hits                 0                       # number of demand (read+write) MSHR hits
+system.cpu.l2cache.demand_mshr_miss_latency       832259                       # number of demand (read+write) MSHR miss cycles
+system.cpu.l2cache.demand_mshr_miss_rate            1                       # mshr miss rate for demand accesses
+system.cpu.l2cache.demand_mshr_misses             423                       # number of demand (read+write) MSHR misses
+system.cpu.l2cache.fast_writes                      0                       # number of fast writes performed
+system.cpu.l2cache.mshr_cap_events                  0                       # number of times MSHR cap was activated
+system.cpu.l2cache.no_allocate_misses               0                       # Number of misses that were no-allocate
+system.cpu.l2cache.overall_accesses               423                       # number of overall (read+write) accesses
+system.cpu.l2cache.overall_avg_miss_latency  2968.515366                       # average overall miss latency
+system.cpu.l2cache.overall_avg_mshr_miss_latency  1967.515366                       # average overall mshr miss latency
+system.cpu.l2cache.overall_avg_mshr_uncacheable_latency <err: div-0>                       # average overall mshr uncacheable latency
+system.cpu.l2cache.overall_hits                     0                       # number of overall hits
+system.cpu.l2cache.overall_miss_latency       1255682                       # number of overall miss cycles
+system.cpu.l2cache.overall_miss_rate                1                       # miss rate for overall accesses
+system.cpu.l2cache.overall_misses                 423                       # number of overall misses
+system.cpu.l2cache.overall_mshr_hits                0                       # number of overall MSHR hits
+system.cpu.l2cache.overall_mshr_miss_latency       832259                       # number of overall MSHR miss cycles
+system.cpu.l2cache.overall_mshr_miss_rate            1                       # mshr miss rate for overall accesses
+system.cpu.l2cache.overall_mshr_misses            423                       # number of overall MSHR misses
+system.cpu.l2cache.overall_mshr_uncacheable_latency            0                       # number of overall MSHR uncacheable cycles
+system.cpu.l2cache.overall_mshr_uncacheable_misses            0                       # number of overall MSHR uncacheable misses
+system.cpu.l2cache.prefetcher.num_hwpf_already_in_cache            0                       # number of hwpf that were already in the cache
+system.cpu.l2cache.prefetcher.num_hwpf_already_in_mshr            0                       # number of hwpf that were already in mshr
+system.cpu.l2cache.prefetcher.num_hwpf_already_in_prefetcher            0                       # number of hwpf that were already in the prefetch queue
+system.cpu.l2cache.prefetcher.num_hwpf_evicted            0                       # number of hwpf removed due to no buffer left
+system.cpu.l2cache.prefetcher.num_hwpf_identified            0                       # number of hwpf identified
+system.cpu.l2cache.prefetcher.num_hwpf_issued            0                       # number of hwpf issued
+system.cpu.l2cache.prefetcher.num_hwpf_removed_MSHR_hit            0                       # number of hwpf removed because MSHR allocated
+system.cpu.l2cache.prefetcher.num_hwpf_span_page            0                       # number of hwpf spanning a virtual page
+system.cpu.l2cache.prefetcher.num_hwpf_squashed_from_miss            0                       # number of hwpf that got squashed due to a miss aborting calculation time
+system.cpu.l2cache.replacements                     0                       # number of replacements
+system.cpu.l2cache.sampled_refs                   423                       # Sample count of references to valid blocks.
+system.cpu.l2cache.soft_prefetch_mshr_full            0                       # number of mshr full events for SW prefetching instrutions
+system.cpu.l2cache.tagsinuse               211.742547                       # Cycle average of tags in use
+system.cpu.l2cache.total_refs                       0                       # Total number of references to valid blocks.
+system.cpu.l2cache.warmup_cycle                     0                       # Cycle when the warmup percentage was hit.
+system.cpu.l2cache.writebacks                       0                       # number of writebacks
+system.cpu.not_idle_fraction                        1                       # Percentage of non-idle cycles
+system.cpu.numCycles                          1698003                       # number of cpu cycles simulated
+system.cpu.num_insts                            11001                       # Number of instructions executed
+system.cpu.num_refs                              2760                       # Number of memory references
+system.cpu.workload.PROG:num_syscalls               8                       # Number of system calls
+
+---------- End Simulation Statistics   ----------
diff --git a/tests/quick/02.insttest/ref/sparc/linux/simple-timing/stderr b/tests/quick/02.insttest/ref/sparc/linux/simple-timing/stderr
new file mode 100644
index 000000000..fce46c90e
--- /dev/null
+++ b/tests/quick/02.insttest/ref/sparc/linux/simple-timing/stderr
@@ -0,0 +1,4 @@
+warn: More than two loadable segments in ELF object.
+warn: Ignoring segment @ 0x0 length 0x0.
+0: system.remote_gdb.listener: listening for remote gdb on port 7000
+warn: Entering event queue @ 0.  Starting simulation...
diff --git a/tests/quick/02.insttest/ref/sparc/linux/simple-timing/stdout b/tests/quick/02.insttest/ref/sparc/linux/simple-timing/stdout
new file mode 100644
index 000000000..100a1ebce
--- /dev/null
+++ b/tests/quick/02.insttest/ref/sparc/linux/simple-timing/stdout
@@ -0,0 +1,24 @@
+Begining test of difficult SPARC instructions...
+LDSTUB:		Passed
+SWAP:		Passed
+CAS FAIL:	Passed
+CAS WORK:	Passed
+CASX FAIL:	Passed
+CASX WORK:	Passed
+LDTX:		Passed
+LDTW:		Passed
+STTW:		Passed
+Done
+M5 Simulator System
+
+Copyright (c) 2001-2006
+The Regents of The University of Michigan
+All Rights Reserved
+
+
+M5 compiled Apr  8 2007 05:25:15
+M5 started Sun Apr  8 22:54:12 2007
+M5 executing on zizzer.eecs.umich.edu
+command line: build/SPARC_SE/m5.fast -d build/SPARC_SE/tests/fast/quick/02.insttest/sparc/linux/simple-timing tests/run.py quick/02.insttest/sparc/linux/simple-timing
+Global frequency set at 1000000000000 ticks per second
+Exiting @ tick 1698003 because target called exit()