Diffstat (limited to 'src')
29 files changed, 869 insertions, 391 deletions
diff --git a/src/SConscript b/src/SConscript index cad0736c5..0ee144747 100755 --- a/src/SConscript +++ b/src/SConscript @@ -446,7 +446,7 @@ def makeInfoPyFile(target, source, env): # Generate a file that wraps the basic top level files env.Command('python/m5/info.py', - [ '#/AUTHORS', '#/LICENSE', '#/README', '#/RELEASE_NOTES' ], + [ '#/AUTHORS', '#/LICENSE', '#/README', ], MakeAction(makeInfoPyFile, Transform("INFO"))) PySource('m5', 'python/m5/info.py') diff --git a/src/arch/generic/debugfaults.hh b/src/arch/generic/debugfaults.hh new file mode 100644 index 000000000..acffadc34 --- /dev/null +++ b/src/arch/generic/debugfaults.hh @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2010 Advanced Micro Devices + * All rights reserved. + * + * The license below extends only to copyright in the software and shall + * not be construed as granting a license to any other intellectual + * property including but not limited to intellectual property relating + * to a hardware implementation of the functionality of the software + * licensed hereunder. You may use the software subject to the license + * terms below provided that you ensure that this notice is replicated + * unmodified and in its entirety in all distributions of the software, + * modified or unmodified, in source code or in binary form. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * Authors: Gabe Black + */ + +#ifndef __ARCH_GENERIC_DEBUGFAULTS_HH__ +#define __ARCH_GENERIC_DEBUGFAULTS_HH__ + +#include "base/misc.hh" +#include "sim/faults.hh" + +#include <string> + +namespace GenericISA +{ +class M5DebugFault : public FaultBase +{ + public: + enum DebugFunc + { + PanicFunc, + FatalFunc, + WarnFunc, + WarnOnceFunc + }; + + protected: + std::string message; + DebugFunc func; + + public: + M5DebugFault(DebugFunc _func, std::string _message) : + message(_message), func(_func) + {} + + FaultName + name() const + { + switch (func) { + case PanicFunc: + return "panic fault"; + case FatalFunc: + return "fatal fault"; + case WarnFunc: + return "warn fault"; + case WarnOnceFunc: + return "warn_once fault"; + default: + panic("unrecognized debug function number\n"); + } + } + + void + invoke(ThreadContext *tc, + StaticInstPtr inst = StaticInst::nullStaticInstPtr) + { + switch (func) { + case PanicFunc: + panic(message); + break; + case FatalFunc: + fatal(message); + break; + case WarnFunc: + warn(message); + break; + case WarnOnceFunc: + warn_once(message); + break; + default: + panic("unrecognized debug function number\n"); + } + } +}; +} // namespace GenericISA + +#endif // __ARCH_GENERIC_DEBUGFAULTS_HH__ diff --git a/src/arch/x86/SConscript b/src/arch/x86/SConscript index 27de9da11..9cb774647 100644 --- a/src/arch/x86/SConscript +++ b/src/arch/x86/SConscript @@ -46,6 +46,7 @@ if env['TARGET_ISA'] == 'x86': Source('cpuid.cc') Source('emulenv.cc') Source('faults.cc') + Source('insts/badmicroop.cc') Source('insts/microfpop.cc') Source('insts/microldstop.cc') Source('insts/micromediaop.cc') diff --git a/src/arch/x86/insts/badmicroop.cc b/src/arch/x86/insts/badmicroop.cc new file mode 100644 index 000000000..ef493f250 --- /dev/null +++ b/src/arch/x86/insts/badmicroop.cc @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2011 Advanced Micro Devices + * All rights reserved. + * + * The license below extends only to copyright in the software and shall + * not be construed as granting a license to any other intellectual + * property including but not limited to intellectual property relating + * to a hardware implementation of the functionality of the software + * licensed hereunder. You may use the software subject to the license + * terms below provided that you ensure that this notice is replicated + * unmodified and in its entirety in all distributions of the software, + * modified or unmodified, in source code or in binary form. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Authors: Gabe Black + */ + +#include "arch/x86/insts/badmicroop.hh" +#include "arch/x86/isa_traits.hh" +#include "arch/x86/decoder.hh" + +namespace X86ISA +{ + +// This microop needs to be allocated on the heap even though it could +// theoretically be statically allocated. The reference counted pointer would +// try to delete the static memory when it was destructed. +const StaticInstPtr badMicroop = + new X86ISAInst::MicroPanic(NoopMachInst, "BAD", + StaticInst::IsMicroop | StaticInst::IsLastMicroop, + "Invalid microop!", 0); + +} // namespace X86ISA diff --git a/src/arch/x86/insts/badmicroop.hh b/src/arch/x86/insts/badmicroop.hh new file mode 100644 index 000000000..57fe242c4 --- /dev/null +++ b/src/arch/x86/insts/badmicroop.hh @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2011 Advanced Micro Devices + * All rights reserved. + * + * The license below extends only to copyright in the software and shall + * not be construed as granting a license to any other intellectual + * property including but not limited to intellectual property relating + * to a hardware implementation of the functionality of the software + * licensed hereunder. You may use the software subject to the license + * terms below provided that you ensure that this notice is replicated + * unmodified and in its entirety in all distributions of the software, + * modified or unmodified, in source code or in binary form. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
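The heap-allocation comment in badmicroop.cc above is worth unpacking: StaticInstPtr is a reference-counted pointer, and when the last reference goes away it deletes the object it manages, so that object must live on the heap. A toy illustration of the failure mode being avoided (RefCountingPtr and MicroPanic here are simplified stand-ins, not gem5's real classes):

    // Toy refcounting scheme standing in for gem5's RefCounted/StaticInstPtr.
    struct RefCounted {
        mutable int count = 0;
        virtual ~RefCounted() {}
    };

    template <class T>
    class RefCountingPtr {
        T *obj;
      public:
        explicit RefCountingPtr(T *o) : obj(o) { if (obj) obj->count++; }
        RefCountingPtr(const RefCountingPtr &p) : obj(p.obj) {
            if (obj) obj->count++;
        }
        ~RefCountingPtr() {
            if (obj && --obj->count == 0)
                delete obj;                  // assumes a heap object
        }
    };

    struct MicroPanic : RefCounted {};
    static MicroPanic staticOop;             // statically allocated

    void whyHeapMatters()
    {
        RefCountingPtr<MicroPanic> ok(new MicroPanic);   // delete is legal
        // RefCountingPtr<MicroPanic> bad(&staticOop);   // final release
        //                                               // would delete
        //                                               // static memory
    }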
+ * + * Authors: Gabe Black + */ + +#ifndef __ARCH_X86_INSTS_BADMICROOP_HH__ +#define __ARCH_X86_INSTS_BADMICROOP_HH__ + +class StaticInstPtr; + +namespace X86ISA +{ + +extern const StaticInstPtr badMicroop; + +} // namespace X86ISA + +#endif //__ARCH_X86_INSTS_BADMICROOP_HH__ diff --git a/src/arch/x86/insts/macroop.hh b/src/arch/x86/insts/macroop.hh index fcf051a37..4f4176b77 100644 --- a/src/arch/x86/insts/macroop.hh +++ b/src/arch/x86/insts/macroop.hh @@ -41,6 +41,7 @@ #define __ARCH_X86_INSTS_MACROOP_HH__ #include "arch/x86/emulenv.hh" +#include "arch/x86/insts/badmicroop.hh" #include "arch/x86/types.hh" #include "arch/x86/insts/static_inst.hh" @@ -76,8 +77,10 @@ class MacroopBase : public X86StaticInst StaticInstPtr fetchMicroop(MicroPC microPC) const { - assert(microPC < numMicroops); - return microops[microPC]; + if (microPC >= numMicroops) + return badMicroop; + else + return microops[microPC]; } std::string diff --git a/src/arch/x86/insts/microregop.cc b/src/arch/x86/insts/microregop.cc index 6aee87449..dedea0f3d 100644 --- a/src/arch/x86/insts/microregop.cc +++ b/src/arch/x86/insts/microregop.cc @@ -50,9 +50,6 @@ namespace X86ISA bool subtract) const { DPRINTF(X86, "flagMask = %#x\n", flagMask); - if (_destRegIdx[0] & IntFoldBit) { - _dest >>= 8; - } uint64_t flags = oldFlags & ~flagMask; if(flagMask & (ECFBit | CFBit)) { diff --git a/src/arch/x86/isa/includes.isa b/src/arch/x86/isa/includes.isa index 58b1fbc62..674e69e98 100644 --- a/src/arch/x86/isa/includes.isa +++ b/src/arch/x86/isa/includes.isa @@ -53,6 +53,7 @@ output header {{ #include <sstream> #include <iostream> +#include "arch/generic/debugfaults.hh" #include "arch/x86/emulenv.hh" #include "arch/x86/insts/macroop.hh" #include "arch/x86/insts/microfpop.hh" @@ -113,6 +114,7 @@ output exec {{ #include "arch/x86/regs/misc.hh" #include "arch/x86/tlb.hh" #include "base/bigint.hh" +#include "base/compiler.hh" #include "base/condcodes.hh" #include "cpu/base.hh" #include "cpu/exetrace.hh" diff --git a/src/arch/x86/isa/microops/debug.isa b/src/arch/x86/isa/microops/debug.isa index 4b2ecdd5a..220c1af97 100644 --- a/src/arch/x86/isa/microops/debug.isa +++ b/src/arch/x86/isa/microops/debug.isa @@ -45,16 +45,29 @@ output header {{ class MicroDebugBase : public X86ISA::X86MicroopBase { protected: + typedef GenericISA::M5DebugFault::DebugFunc DebugFunc; + DebugFunc func; std::string message; uint8_t cc; public: - MicroDebugBase(ExtMachInst _machInst, const char * mnem, + MicroDebugBase(ExtMachInst machInst, const char * mnem, const char * instMnem, uint64_t setFlags, - std::string _message, uint8_t _cc); + DebugFunc _func, std::string _message, uint8_t _cc) : + X86MicroopBase(machInst, mnem, instMnem, setFlags, No_OpClass), + func(_func), message(_message), cc(_cc) + {} - std::string generateDisassembly(Addr pc, - const SymbolTable *symtab) const; + std::string + generateDisassembly(Addr pc, const SymbolTable *symtab) const + { + std::stringstream response; + + printMnemonic(response, instMnem, mnemonic); + response << "\"" << message << "\""; + + return response.str(); + } }; }}; @@ -70,53 +83,31 @@ def template MicroDebugDeclare {{ }}; def template MicroDebugExecute {{ - Fault %(class_name)s::execute(%(CPU_exec_context)s *xc, + Fault + %(class_name)s::execute(%(CPU_exec_context)s *xc, Trace::InstRecord *traceData) const { %(op_decl)s %(op_rd)s if (%(cond_test)s) { - %(func)s("%s\n", message); + return new GenericISA::M5DebugFault(func, message); + } else { + return NoFault; } - return NoFault; } }}; -output decoder {{ - inline 
MicroDebugBase::MicroDebugBase( - ExtMachInst machInst, const char * mnem, const char * instMnem, - uint64_t setFlags, std::string _message, uint8_t _cc) : - X86MicroopBase(machInst, mnem, instMnem, - setFlags, No_OpClass), - message(_message), cc(_cc) - { - } -}}; - def template MicroDebugConstructor {{ - inline %(class_name)s::%(class_name)s( + %(class_name)s::%(class_name)s( ExtMachInst machInst, const char * instMnem, uint64_t setFlags, std::string _message, uint8_t _cc) : %(base_class)s(machInst, "%(func)s", instMnem, - setFlags, _message, _cc) + setFlags, %(func_num)s, _message, _cc) { %(constructor)s; } }}; -output decoder {{ - std::string MicroDebugBase::generateDisassembly(Addr pc, - const SymbolTable *symtab) const - { - std::stringstream response; - - printMnemonic(response, instMnem, mnemonic); - response << "\"" << message << "\""; - - return response.str(); - } -}}; - let {{ class MicroDebug(X86Microop): def __init__(self, message, flags=None): @@ -142,13 +133,14 @@ let {{ header_output = "" decoder_output = "" - def buildDebugMicro(func): + def buildDebugMicro(func, func_num): global exec_output, header_output, decoder_output iop = InstObjParams(func, "Micro%sFlags" % func.capitalize(), "MicroDebugBase", {"code": "", "func": func, + "func_num": "GenericISA::M5DebugFault::%s" % func_num, "cond_test": "checkCondition(ccFlagBits, cc)"}) exec_output += MicroDebugExecute.subst(iop) header_output += MicroDebugDeclare.subst(iop) @@ -158,6 +150,7 @@ let {{ "MicroDebugBase", {"code": "", "func": func, + "func_num": "GenericISA::M5DebugFault::%s" % func_num, "cond_test": "true"}) exec_output += MicroDebugExecute.subst(iop) header_output += MicroDebugDeclare.subst(iop) @@ -169,8 +162,8 @@ let {{ global microopClasses microopClasses[func] = MicroDebugChild - buildDebugMicro("panic") - buildDebugMicro("fatal") - buildDebugMicro("warn") - buildDebugMicro("warn_once") + buildDebugMicro("panic", "PanicFunc") + buildDebugMicro("fatal", "FatalFunc") + buildDebugMicro("warn", "WarnFunc") + buildDebugMicro("warn_once", "WarnOnceFunc") }}; diff --git a/src/arch/x86/isa/microops/ldstop.isa b/src/arch/x86/isa/microops/ldstop.isa index 216a74c6c..cd649d644 100644 --- a/src/arch/x86/isa/microops/ldstop.isa +++ b/src/arch/x86/isa/microops/ldstop.isa @@ -301,6 +301,46 @@ let {{ "dataSize" : self.dataSize, "addressSize" : self.addressSize, "memFlags" : self.memFlags} return allocator + + class BigLdStOp(X86Microop): + def __init__(self, data, segment, addr, disp, + dataSize, addressSize, baseFlags, atCPL0, prefetch): + self.data = data + [self.scale, self.index, self.base] = addr + self.disp = disp + self.segment = segment + self.dataSize = dataSize + self.addressSize = addressSize + self.memFlags = baseFlags + if atCPL0: + self.memFlags += " | (CPL0FlagBit << FlagShift)" + if prefetch: + self.memFlags += " | Request::PREFETCH" + self.memFlags += " | (machInst.legacy.addr ? " + \ + "(AddrSizeFlagBit << FlagShift) : 0)" + + def getAllocator(self, microFlags): + allocString = ''' + (%(dataSize)s >= 4) ? 
+ (StaticInstPtr)(new %(class_name)sBig(machInst, + macrocodeBlock, %(flags)s, %(scale)s, %(index)s, + %(base)s, %(disp)s, %(segment)s, %(data)s, + %(dataSize)s, %(addressSize)s, %(memFlags)s)) : + (StaticInstPtr)(new %(class_name)s(machInst, + macrocodeBlock, %(flags)s, %(scale)s, %(index)s, + %(base)s, %(disp)s, %(segment)s, %(data)s, + %(dataSize)s, %(addressSize)s, %(memFlags)s)) + ''' + allocator = allocString % { + "class_name" : self.className, + "flags" : self.microFlagsText(microFlags), + "scale" : self.scale, "index" : self.index, + "base" : self.base, + "disp" : self.disp, + "segment" : self.segment, "data" : self.data, + "dataSize" : self.dataSize, "addressSize" : self.addressSize, + "memFlags" : self.memFlags} + return allocator }}; let {{ @@ -315,7 +355,8 @@ let {{ EA = bits(SegBase + scale * Index + Base + disp, addressSize * 8 - 1, 0); ''' - def defineMicroLoadOp(mnemonic, code, mem_flags="0"): + def defineMicroLoadOp(mnemonic, code, bigCode='', + mem_flags="0", big=True): global header_output global decoder_output global exec_output @@ -324,16 +365,22 @@ let {{ name = mnemonic.lower() # Build up the all register version of this micro op - iop = InstObjParams(name, Name, 'X86ISA::LdStOp', - {"code": code, - "ea_code": calculateEA}) - header_output += MicroLdStOpDeclare.subst(iop) - decoder_output += MicroLdStOpConstructor.subst(iop) - exec_output += MicroLoadExecute.subst(iop) - exec_output += MicroLoadInitiateAcc.subst(iop) - exec_output += MicroLoadCompleteAcc.subst(iop) - - class LoadOp(LdStOp): + iops = [InstObjParams(name, Name, 'X86ISA::LdStOp', + {"code": code, "ea_code": calculateEA})] + if big: + iops += [InstObjParams(name, Name + "Big", 'X86ISA::LdStOp', + {"code": bigCode, "ea_code": calculateEA})] + for iop in iops: + header_output += MicroLdStOpDeclare.subst(iop) + decoder_output += MicroLdStOpConstructor.subst(iop) + exec_output += MicroLoadExecute.subst(iop) + exec_output += MicroLoadInitiateAcc.subst(iop) + exec_output += MicroLoadCompleteAcc.subst(iop) + + base = LdStOp + if big: + base = BigLdStOp + class LoadOp(base): def __init__(self, data, segment, addr, disp = 0, dataSize="env.dataSize", addressSize="env.addressSize", @@ -346,12 +393,15 @@ let {{ microopClasses[name] = LoadOp - defineMicroLoadOp('Ld', 'Data = merge(Data, Mem, dataSize);') + defineMicroLoadOp('Ld', 'Data = merge(Data, Mem, dataSize);', + 'Data = Mem & mask(dataSize * 8);') defineMicroLoadOp('Ldst', 'Data = merge(Data, Mem, dataSize);', - '(StoreCheck << FlagShift)') + 'Data = Mem & mask(dataSize * 8);', + '(StoreCheck << FlagShift)') defineMicroLoadOp('Ldstl', 'Data = merge(Data, Mem, dataSize);', - '(StoreCheck << FlagShift) | Request::LOCKED') - defineMicroLoadOp('Ldfp', 'FpData.uqw = Mem;') + 'Data = Mem & mask(dataSize * 8);', + '(StoreCheck << FlagShift) | Request::LOCKED') + defineMicroLoadOp('Ldfp', 'FpData.uqw = Mem;', big = False) def defineMicroStoreOp(mnemonic, code, \ postCode="", completeCode="", mem_flags="0"): diff --git a/src/arch/x86/isa/microops/limmop.isa b/src/arch/x86/isa/microops/limmop.isa index 2871d5a89..ac78b090d 100644 --- a/src/arch/x86/isa/microops/limmop.isa +++ b/src/arch/x86/isa/microops/limmop.isa @@ -114,8 +114,16 @@ let {{ self.dataSize = dataSize def getAllocator(self, microFlags): - allocator = '''new %(class_name)s(machInst, macrocodeBlock, - %(flags)s, %(dest)s, %(imm)s, %(dataSize)s)''' % { + allocString = ''' + (%(dataSize)s >= 4) ? 
+ (StaticInstPtr)(new %(class_name)sBig(machInst, + macrocodeBlock, %(flags)s, %(dest)s, %(imm)s, + %(dataSize)s)) : + (StaticInstPtr)(new %(class_name)s(machInst, + macrocodeBlock, %(flags)s, %(dest)s, %(imm)s, + %(dataSize)s)) + ''' + allocator = allocString % { "class_name" : self.className, "mnemonic" : self.mnemonic, "flags" : self.microFlagsText(microFlags), @@ -152,12 +160,15 @@ let {{ let {{ # Build up the all register version of this micro op - iop = InstObjParams("limm", "Limm", 'X86MicroopBase', - {"code" : "DestReg = merge(DestReg, imm, dataSize);"}) - header_output += MicroLimmOpDeclare.subst(iop) - decoder_output += MicroLimmOpConstructor.subst(iop) - decoder_output += MicroLimmOpDisassembly.subst(iop) - exec_output += MicroLimmOpExecute.subst(iop) + iops = [InstObjParams("limm", "Limm", 'X86MicroopBase', + {"code" : "DestReg = merge(DestReg, imm, dataSize);"}), + InstObjParams("limm", "LimmBig", 'X86MicroopBase', + {"code" : "DestReg = imm & mask(dataSize * 8);"})] + for iop in iops: + header_output += MicroLimmOpDeclare.subst(iop) + decoder_output += MicroLimmOpConstructor.subst(iop) + decoder_output += MicroLimmOpDisassembly.subst(iop) + exec_output += MicroLimmOpExecute.subst(iop) iop = InstObjParams("lfpimm", "Lfpimm", 'X86MicroopBase', {"code" : "FpDestReg.uqw = imm"}) diff --git a/src/arch/x86/isa/microops/regop.isa b/src/arch/x86/isa/microops/regop.isa index ccfcb3a69..e2a51c127 100644 --- a/src/arch/x86/isa/microops/regop.isa +++ b/src/arch/x86/isa/microops/regop.isa @@ -51,6 +51,8 @@ def template MicroRegOpExecute {{ %(op_decl)s; %(op_rd)s; + IntReg result M5_VAR_USED; + if(%(cond_check)s) { %(code)s; @@ -79,6 +81,8 @@ def template MicroRegOpImmExecute {{ %(op_decl)s; %(op_rd)s; + IntReg result M5_VAR_USED; + if(%(cond_check)s) { %(code)s; @@ -224,8 +228,8 @@ let {{ MicroRegOpExecute) class RegOpMeta(type): - def buildCppClasses(self, name, Name, suffix, \ - code, flag_code, cond_check, else_code, cond_control_flag_init): + def buildCppClasses(self, name, Name, suffix, code, big_code, \ + flag_code, cond_check, else_code, cond_control_flag_init): # Globals to stick the output in global header_output @@ -235,11 +239,13 @@ let {{ # Stick all the code together so it can be searched at once allCode = "|".join((code, flag_code, cond_check, else_code, cond_control_flag_init)) + allBigCode = "|".join((big_code, flag_code, cond_check, else_code, + cond_control_flag_init)) # If op2 is used anywhere, make register and immediate versions # of this code. matcher = re.compile("(?<!\\w)(?P<prefix>s?)op2(?P<typeQual>\\.\\w+)?") - match = matcher.search(allCode) + match = matcher.search(allCode + allBigCode) if match: typeQual = "" if match.group("typeQual"): @@ -247,6 +253,7 @@ let {{ src2_name = "%spsrc2%s" % (match.group("prefix"), typeQual) self.buildCppClasses(name, Name, suffix, matcher.sub(src2_name, code), + matcher.sub(src2_name, big_code), matcher.sub(src2_name, flag_code), matcher.sub(src2_name, cond_check), matcher.sub(src2_name, else_code), @@ -254,6 +261,7 @@ let {{ imm_name = "%simm8" % match.group("prefix") self.buildCppClasses(name + "i", Name, suffix + "Imm", matcher.sub(imm_name, code), + matcher.sub(imm_name, big_code), matcher.sub(imm_name, flag_code), matcher.sub(imm_name, cond_check), matcher.sub(imm_name, else_code), @@ -264,27 +272,32 @@ let {{ # a version without it and fix up this version to use it. 
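All of these getAllocator() strings expand to the same shape of C++: a ternary on dataSize that news either the generated "Big" class or the plain one, with both branches cast to a common StaticInstPtr. Since dataSize is often env.dataSize and only known at decode time, the choice has to happen at allocation rather than in the template system. A minimal sketch of the emitted shape (class names are placeholders, and the bare pointer stands in for the real refcounted StaticInstPtr):

    struct StaticInst { virtual ~StaticInst() {} };
    typedef StaticInst *StaticInstPtr;      // refcounted in real gem5

    struct Limm    : StaticInst { /* merge-based execute */ };
    struct LimmBig : StaticInst { /* overwrite-based execute */ };

    StaticInstPtr
    allocateLimm(int dataSize)
    {
        // 32- and 64-bit writes replace the whole register, so the
        // cheaper "Big" variant is safe for dataSize >= 4; smaller
        // operands must preserve the register's untouched bytes.
        return (dataSize >= 4)
            ? (StaticInstPtr) new LimmBig
            : (StaticInstPtr) new Limm;
    }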
if flag_code != "" or cond_check != "true": self.buildCppClasses(name, Name, suffix, - code, "", "true", else_code, "") + code, big_code, "", "true", else_code, "") suffix = "Flags" + suffix # If psrc1 or psrc2 is used, we need to actually insert code to # compute it. - matcher = re.compile("(?<!\w)psrc1(?!\w)") - if matcher.search(allCode): - code = "uint64_t psrc1 = pick(SrcReg1, 0, dataSize);" + code - matcher = re.compile("(?<!\w)psrc2(?!\w)") - if matcher.search(allCode): - code = "uint64_t psrc2 = pick(SrcReg2, 1, dataSize);" + code - # Also make available versions which do sign extension - matcher = re.compile("(?<!\w)spsrc1(?!\w)") - if matcher.search(allCode): - code = "int64_t spsrc1 = signedPick(SrcReg1, 0, dataSize);" + code - matcher = re.compile("(?<!\w)spsrc2(?!\w)") - if matcher.search(allCode): - code = "int64_t spsrc2 = signedPick(SrcReg2, 1, dataSize);" + code - matcher = re.compile("(?<!\w)simm8(?!\w)") - if matcher.search(allCode): - code = "int8_t simm8 = imm8;" + code + for (big, all) in ((False, allCode), (True, allBigCode)): + prefix = "" + for (rex, decl) in ( + ("(?<!\w)psrc1(?!\w)", + "uint64_t psrc1 = pick(SrcReg1, 0, dataSize);"), + ("(?<!\w)psrc2(?!\w)", + "uint64_t psrc2 = pick(SrcReg2, 1, dataSize);"), + ("(?<!\w)spsrc1(?!\w)", + "int64_t spsrc1 = signedPick(SrcReg1, 0, dataSize);"), + ("(?<!\w)spsrc2(?!\w)", + "int64_t spsrc2 = signedPick(SrcReg2, 1, dataSize);"), + ("(?<!\w)simm8(?!\w)", + "int8_t simm8 = imm8;")): + matcher = re.compile(rex) + if matcher.search(all): + prefix += decl + "\n" + if big: + if big_code != "": + big_code = prefix + big_code + else: + code = prefix + code base = "X86ISA::RegOp" @@ -297,17 +310,26 @@ let {{ templates = immTemplates # Get everything ready for the substitution - iop = InstObjParams(name, Name + suffix, base, + iops = [InstObjParams(name, Name + suffix, base, {"code" : code, "flag_code" : flag_code, "cond_check" : cond_check, "else_code" : else_code, - "cond_control_flag_init": cond_control_flag_init}) + "cond_control_flag_init" : cond_control_flag_init})] + if big_code != "": + iops += [InstObjParams(name, Name + suffix + "Big", base, + {"code" : big_code, + "flag_code" : flag_code, + "cond_check" : cond_check, + "else_code" : else_code, + "cond_control_flag_init" : + cond_control_flag_init})] # Generate the actual code (finally!) 
- header_output += templates[0].subst(iop) - decoder_output += templates[1].subst(iop) - exec_output += templates[2].subst(iop) + for iop in iops: + header_output += templates[0].subst(iop) + decoder_output += templates[1].subst(iop) + exec_output += templates[2].subst(iop) def __new__(mcls, Name, bases, dict): @@ -322,14 +344,16 @@ let {{ cls.className = Name cls.base_mnemonic = name code = cls.code + big_code = cls.big_code flag_code = cls.flag_code cond_check = cls.cond_check else_code = cls.else_code cond_control_flag_init = cls.cond_control_flag_init # Set up the C++ classes - mcls.buildCppClasses(cls, name, Name, "", code, flag_code, - cond_check, else_code, cond_control_flag_init) + mcls.buildCppClasses(cls, name, Name, "", code, big_code, + flag_code, cond_check, else_code, + cond_control_flag_init) # Hook into the microassembler dict global microopClasses @@ -352,6 +376,7 @@ let {{ abstract = True # Default template parameter values + big_code = "" flag_code = "" cond_check = "true" else_code = ";" @@ -372,26 +397,48 @@ let {{ self.className += "Flags" def getAllocator(self, microFlags): - className = self.className - if self.mnemonic == self.base_mnemonic + 'i': - className += "Imm" - allocator = '''new %(class_name)s(machInst, macrocodeBlock, - %(flags)s, %(src1)s, %(op2)s, %(dest)s, - %(dataSize)s, %(ext)s)''' % { - "class_name" : className, - "flags" : self.microFlagsText(microFlags), - "src1" : self.src1, "op2" : self.op2, - "dest" : self.dest, - "dataSize" : self.dataSize, - "ext" : self.ext} - return allocator + if self.big_code != "": + className = self.className + if self.mnemonic == self.base_mnemonic + 'i': + className += "Imm" + allocString = ''' + (%(dataSize)s >= 4) ? + (StaticInstPtr)(new %(class_name)sBig(machInst, + macrocodeBlock, %(flags)s, %(src1)s, %(op2)s, + %(dest)s, %(dataSize)s, %(ext)s)) : + (StaticInstPtr)(new %(class_name)s(machInst, + macrocodeBlock, %(flags)s, %(src1)s, %(op2)s, + %(dest)s, %(dataSize)s, %(ext)s)) + ''' + allocator = allocString % { + "class_name" : className, + "flags" : self.microFlagsText(microFlags), + "src1" : self.src1, "op2" : self.op2, + "dest" : self.dest, + "dataSize" : self.dataSize, + "ext" : self.ext} + return allocator + else: + className = self.className + if self.mnemonic == self.base_mnemonic + 'i': + className += "Imm" + allocator = '''new %(class_name)s(machInst, macrocodeBlock, + %(flags)s, %(src1)s, %(op2)s, %(dest)s, + %(dataSize)s, %(ext)s)''' % { + "class_name" : className, + "flags" : self.microFlagsText(microFlags), + "src1" : self.src1, "op2" : self.op2, + "dest" : self.dest, + "dataSize" : self.dataSize, + "ext" : self.ext} + return allocator class LogicRegOp(RegOp): abstract = True flag_code = ''' //Don't have genFlags handle the OF or CF bits uint64_t mask = CFBit | ECFBit | OFBit; - ccFlagBits = genFlags(ccFlagBits, ext & ~mask, DestReg, psrc1, op2); + ccFlagBits = genFlags(ccFlagBits, ext & ~mask, result, psrc1, op2); //If a logic microop wants to set these, it wants to set them to 0. 
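The new IntReg result variable in the execute templates is what lets a single flag_code string serve both variants: code assigns result inside the merge() call, big_code assigns the masked value directly, and genFlags() always receives the value the microop computed rather than whatever landed in DestReg. A compressed, self-contained sketch of the pattern (mask/merge follow the semantics used in the ISA code; the toy genFlags computes only a zero flag):

    #include <cstdint>

    typedef uint64_t IntReg;
    const IntReg ZFBit = 0x40;              // illustrative encoding only

    static inline IntReg mask(int nbits) {
        return nbits >= 64 ? ~0ULL : (1ULL << nbits) - 1;
    }

    // Sub-word writes keep the destination's untouched high bytes.
    static inline IntReg merge(IntReg old, IntReg val, int dataSize) {
        IntReg m = mask(dataSize * 8);
        return (old & ~m) | (val & m);
    }

    // Toy genFlags(): note it reads `result`, never DestReg.
    static inline IntReg genFlags(IntReg cc, IntReg result, int dataSize) {
        return (result & mask(dataSize * 8)) == 0 ? (cc | ZFBit)
                                                  : (cc & ~ZFBit);
    }

    IntReg addMicroop(IntReg &DestReg, IntReg psrc1, IntReg op2,
                      IntReg ccFlagBits, int dataSize)
    {
        IntReg result;
        if (dataSize >= 4)      // "Big": overwrite, zero-extending
            DestReg = result = (psrc1 + op2) & mask(dataSize * 8);
        else                    // small operand: merge into old value
            DestReg = merge(DestReg, result = psrc1 + op2, dataSize);
        return genFlags(ccFlagBits, result, dataSize);   // one flag path
    }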
ccFlagBits &= ~(CFBit & ext); ccFlagBits &= ~(ECFBit & ext); @@ -401,12 +448,12 @@ let {{ class FlagRegOp(RegOp): abstract = True flag_code = \ - "ccFlagBits = genFlags(ccFlagBits, ext, DestReg, psrc1, op2);" + "ccFlagBits = genFlags(ccFlagBits, ext, result, psrc1, op2);" class SubRegOp(RegOp): abstract = True flag_code = \ - "ccFlagBits = genFlags(ccFlagBits, ext, DestReg, psrc1, ~op2, true);" + "ccFlagBits = genFlags(ccFlagBits, ext, result, psrc1, ~op2, true);" class CondRegOp(RegOp): abstract = True @@ -428,31 +475,44 @@ let {{ src1, src2, flags, dataSize) class Add(FlagRegOp): - code = 'DestReg = merge(DestReg, psrc1 + op2, dataSize);' + code = 'DestReg = merge(DestReg, result = (psrc1 + op2), dataSize);' + big_code = 'DestReg = result = (psrc1 + op2) & mask(dataSize * 8);' class Or(LogicRegOp): - code = 'DestReg = merge(DestReg, psrc1 | op2, dataSize);' + code = 'DestReg = merge(DestReg, result = (psrc1 | op2), dataSize);' + big_code = 'DestReg = result = (psrc1 | op2) & mask(dataSize * 8);' class Adc(FlagRegOp): code = ''' CCFlagBits flags = ccFlagBits; - DestReg = merge(DestReg, psrc1 + op2 + flags.cf, dataSize); + DestReg = merge(DestReg, result = (psrc1 + op2 + flags.cf), dataSize); + ''' + big_code = ''' + CCFlagBits flags = ccFlagBits; + DestReg = result = (psrc1 + op2 + flags.cf) & mask(dataSize * 8); ''' class Sbb(SubRegOp): code = ''' CCFlagBits flags = ccFlagBits; - DestReg = merge(DestReg, psrc1 - op2 - flags.cf, dataSize); + DestReg = merge(DestReg, result = (psrc1 - op2 - flags.cf), dataSize); + ''' + big_code = ''' + CCFlagBits flags = ccFlagBits; + DestReg = result = (psrc1 - op2 - flags.cf) & mask(dataSize * 8); ''' class And(LogicRegOp): - code = 'DestReg = merge(DestReg, psrc1 & op2, dataSize)' + code = 'DestReg = merge(DestReg, result = (psrc1 & op2), dataSize)' + big_code = 'DestReg = result = (psrc1 & op2) & mask(dataSize * 8)' class Sub(SubRegOp): - code = 'DestReg = merge(DestReg, psrc1 - op2, dataSize)' + code = 'DestReg = merge(DestReg, result = (psrc1 - op2), dataSize)' + big_code = 'DestReg = result = (psrc1 - op2) & mask(dataSize * 8)' class Xor(LogicRegOp): - code = 'DestReg = merge(DestReg, psrc1 ^ op2, dataSize)' + code = 'DestReg = merge(DestReg, result = (psrc1 ^ op2), dataSize)' + big_code = 'DestReg = result = (psrc1 ^ op2) & mask(dataSize * 8)' class Mul1s(WrRegOp): code = ''' @@ -505,6 +565,7 @@ let {{ class Mulel(RdRegOp): code = 'DestReg = merge(SrcReg1, ProdLow, dataSize);' + big_code = 'DestReg = ProdLow & mask(dataSize * 8);' class Muleh(RdRegOp): def __init__(self, dest, src1=None, flags=None, dataSize="env.dataSize"): @@ -513,6 +574,7 @@ let {{ super(RdRegOp, self).__init__(dest, src1, \ "InstRegIndex(NUM_INTREGS)", flags, dataSize) code = 'DestReg = merge(SrcReg1, ProdHi, dataSize);' + big_code = 'DestReg = ProdHi & mask(dataSize * 8);' # One or two bit divide class Div1(WrRegOp): @@ -540,7 +602,7 @@ let {{ # Step divide class Div2(RegOp): - code = ''' + divCode = ''' uint64_t dividend = Remainder; uint64_t divisor = Divisor; uint64_t quotient = Quotient; @@ -587,11 +649,13 @@ let {{ } } //Keep track of how many bits there are still to pull in. 
- DestReg = merge(DestReg, remaining, dataSize); + %s //Record the final results Remainder = remainder; Quotient = quotient; ''' + code = divCode % "DestReg = merge(DestReg, remaining, dataSize);" + big_code = divCode % "DestReg = remaining & mask(dataSize * 8);" flag_code = ''' if (remaining == 0) ccFlagBits = ccFlagBits | (ext & EZFBit); @@ -601,9 +665,11 @@ let {{ class Divq(RdRegOp): code = 'DestReg = merge(SrcReg1, Quotient, dataSize);' + big_code = 'DestReg = Quotient & mask(dataSize * 8);' class Divr(RdRegOp): code = 'DestReg = merge(SrcReg1, Remainder, dataSize);' + big_code = 'DestReg = Remainder & mask(dataSize * 8);' class Mov(CondRegOp): code = 'DestReg = merge(SrcReg1, op2, dataSize)' @@ -616,6 +682,10 @@ let {{ uint8_t shiftAmt = (op2 & ((dataSize == 8) ? mask(6) : mask(5))); DestReg = merge(DestReg, psrc1 << shiftAmt, dataSize); ''' + big_code = ''' + uint8_t shiftAmt = (op2 & ((dataSize == 8) ? mask(6) : mask(5))); + DestReg = (psrc1 << shiftAmt) & mask(dataSize * 8); + ''' flag_code = ''' // If the shift amount is zero, no flags should be modified. if (shiftAmt) { @@ -641,14 +711,19 @@ let {{ ''' class Srl(RegOp): + # Because what happens to the bits shift -in- on a right shift + # is not defined in the C/C++ standard, we have to mask them out + # to be sure they're zero. code = ''' uint8_t shiftAmt = (op2 & ((dataSize == 8) ? mask(6) : mask(5))); - // Because what happens to the bits shift -in- on a right shift - // is not defined in the C/C++ standard, we have to mask them out - // to be sure they're zero. uint64_t logicalMask = mask(dataSize * 8 - shiftAmt); DestReg = merge(DestReg, (psrc1 >> shiftAmt) & logicalMask, dataSize); ''' + big_code = ''' + uint8_t shiftAmt = (op2 & ((dataSize == 8) ? mask(6) : mask(5))); + uint64_t logicalMask = mask(dataSize * 8 - shiftAmt); + DestReg = (psrc1 >> shiftAmt) & logicalMask; + ''' flag_code = ''' // If the shift amount is zero, no flags should be modified. if (shiftAmt) { @@ -671,15 +746,21 @@ let {{ ''' class Sra(RegOp): + # Because what happens to the bits shift -in- on a right shift + # is not defined in the C/C++ standard, we have to sign extend + # them manually to be sure. code = ''' uint8_t shiftAmt = (op2 & ((dataSize == 8) ? mask(6) : mask(5))); - // Because what happens to the bits shift -in- on a right shift - // is not defined in the C/C++ standard, we have to sign extend - // them manually to be sure. uint64_t arithMask = (shiftAmt == 0) ? 0 : -bits(psrc1, dataSize * 8 - 1) << (dataSize * 8 - shiftAmt); DestReg = merge(DestReg, (psrc1 >> shiftAmt) | arithMask, dataSize); ''' + big_code = ''' + uint8_t shiftAmt = (op2 & ((dataSize == 8) ? mask(6) : mask(5))); + uint64_t arithMask = (shiftAmt == 0) ? 0 : + -bits(psrc1, dataSize * 8 - 1) << (dataSize * 8 - shiftAmt); + DestReg = ((psrc1 >> shiftAmt) | arithMask) & mask(dataSize * 8); + ''' flag_code = ''' // If the shift amount is zero, no flags should be modified. if (shiftAmt) { @@ -704,13 +785,11 @@ let {{ uint8_t shiftAmt = (op2 & ((dataSize == 8) ? mask(6) : mask(5))); uint8_t realShiftAmt = shiftAmt % (dataSize * 8); - if(realShiftAmt) - { + if (realShiftAmt) { uint64_t top = psrc1 << (dataSize * 8 - realShiftAmt); uint64_t bottom = bits(psrc1, dataSize * 8, realShiftAmt); DestReg = merge(DestReg, top | bottom, dataSize); - } - else + } else DestReg = merge(DestReg, DestReg, dataSize); ''' flag_code = ''' @@ -739,16 +818,14 @@ let {{ uint8_t shiftAmt = (op2 & ((dataSize == 8) ? 
mask(6) : mask(5))); uint8_t realShiftAmt = shiftAmt % (dataSize * 8 + 1); - if(realShiftAmt) - { + if (realShiftAmt) { CCFlagBits flags = ccFlagBits; uint64_t top = flags.cf << (dataSize * 8 - realShiftAmt); if (realShiftAmt > 1) top |= psrc1 << (dataSize * 8 - realShiftAmt + 1); uint64_t bottom = bits(psrc1, dataSize * 8 - 1, realShiftAmt); DestReg = merge(DestReg, top | bottom, dataSize); - } - else + } else DestReg = merge(DestReg, DestReg, dataSize); ''' flag_code = ''' @@ -780,14 +857,12 @@ let {{ uint8_t shiftAmt = (op2 & ((dataSize == 8) ? mask(6) : mask(5))); uint8_t realShiftAmt = shiftAmt % (dataSize * 8); - if(realShiftAmt) - { + if (realShiftAmt) { uint64_t top = psrc1 << realShiftAmt; uint64_t bottom = bits(psrc1, dataSize * 8 - 1, dataSize * 8 - realShiftAmt); DestReg = merge(DestReg, top | bottom, dataSize); - } - else + } else DestReg = merge(DestReg, DestReg, dataSize); ''' flag_code = ''' @@ -816,8 +891,7 @@ let {{ uint8_t shiftAmt = (op2 & ((dataSize == 8) ? mask(6) : mask(5))); uint8_t realShiftAmt = shiftAmt % (dataSize * 8 + 1); - if(realShiftAmt) - { + if (realShiftAmt) { CCFlagBits flags = ccFlagBits; uint64_t top = psrc1 << realShiftAmt; uint64_t bottom = flags.cf << (realShiftAmt - 1); @@ -826,8 +900,7 @@ let {{ bits(psrc1, dataSize * 8 - 1, dataSize * 8 - realShiftAmt + 1); DestReg = merge(DestReg, top | bottom, dataSize); - } - else + } else DestReg = merge(DestReg, DestReg, dataSize); ''' flag_code = ''' @@ -853,10 +926,10 @@ let {{ ''' class Sld(RegOp): - code = ''' + sldCode = ''' uint8_t shiftAmt = (op2 & ((dataSize == 8) ? mask(6) : mask(5))); uint8_t dataBits = dataSize * 8; - uint8_t realShiftAmt = shiftAmt % (2 * dataBits); + uint8_t realShiftAmt = shiftAmt %% (2 * dataBits); uint64_t result; if (realShiftAmt == 0) { result = psrc1; @@ -867,8 +940,10 @@ let {{ result = (DoubleBits << (realShiftAmt - dataBits)) | (psrc1 >> (2 * dataBits - realShiftAmt)); } - DestReg = merge(DestReg, result, dataSize); + %s ''' + code = sldCode % "DestReg = merge(DestReg, result, dataSize);" + big_code = sldCode % "DestReg = result & mask(dataSize * 8);" flag_code = ''' // If the shift amount is zero, no flags should be modified. if (shiftAmt) { @@ -899,10 +974,10 @@ let {{ ''' class Srd(RegOp): - code = ''' + srdCode = ''' uint8_t shiftAmt = (op2 & ((dataSize == 8) ? mask(6) : mask(5))); uint8_t dataBits = dataSize * 8; - uint8_t realShiftAmt = shiftAmt % (2 * dataBits); + uint8_t realShiftAmt = shiftAmt %% (2 * dataBits); uint64_t result; if (realShiftAmt == 0) { result = psrc1; @@ -919,8 +994,10 @@ let {{ logicalMask) | (psrc1 << (2 * dataBits - realShiftAmt)); } - DestReg = merge(DestReg, result, dataSize); + %s ''' + code = srdCode % "DestReg = merge(DestReg, result, dataSize);" + big_code = srdCode % "DestReg = result & mask(dataSize * 8);" flag_code = ''' // If the shift amount is zero, no flags should be modified. if (shiftAmt) { @@ -986,6 +1063,12 @@ let {{ ccFlagBits = (flag == 0) ? (ccFlagBits | EZFBit) : (ccFlagBits & ~EZFBit); ''' + big_code = ''' + int flag = bits(ccFlagBits, imm8); + DestReg = flag & mask(dataSize * 8); + ccFlagBits = (flag == 0) ? (ccFlagBits | EZFBit) : + (ccFlagBits & ~EZFBit); + ''' def __init__(self, dest, imm, flags=None, \ dataSize="env.dataSize"): super(Ruflag, self).__init__(dest, \ @@ -1000,6 +1083,14 @@ let {{ ccFlagBits = (flag == 0) ? 
(ccFlagBits | EZFBit) : (ccFlagBits & ~EZFBit); ''' + big_code = ''' + MiscReg flagMask = 0x3F7FDD5; + MiscReg flags = (nccFlagBits | ccFlagBits) & flagMask; + int flag = bits(flags, imm8); + DestReg = flag & mask(dataSize * 8); + ccFlagBits = (flag == 0) ? (ccFlagBits | EZFBit) : + (ccFlagBits & ~EZFBit); + ''' def __init__(self, dest, imm, flags=None, \ dataSize="env.dataSize"): super(Rflag, self).__init__(dest, \ @@ -1015,6 +1106,15 @@ let {{ val = sign_bit ? (val | ~maskVal) : (val & maskVal); DestReg = merge(DestReg, val, dataSize); ''' + big_code = ''' + IntReg val = psrc1; + // Mask the bit position so that it wraps. + int bitPos = op2 & (dataSize * 8 - 1); + int sign_bit = bits(val, bitPos, bitPos); + uint64_t maskVal = mask(bitPos+1); + val = sign_bit ? (val | ~maskVal) : (val & maskVal); + DestReg = val & mask(dataSize * 8); + ''' flag_code = ''' if (!sign_bit) ccFlagBits = ccFlagBits & @@ -1026,12 +1126,13 @@ let {{ class Zext(RegOp): code = 'DestReg = merge(DestReg, bits(psrc1, op2, 0), dataSize);' + big_code = 'DestReg = bits(psrc1, op2, 0) & mask(dataSize * 8);' class Rddr(RegOp): def __init__(self, dest, src1, flags=None, dataSize="env.dataSize"): super(Rddr, self).__init__(dest, \ src1, "InstRegIndex(NUM_INTREGS)", flags, dataSize) - code = ''' + rdrCode = ''' CR4 cr4 = CR4Op; DR7 dr7 = DR7Op; if ((cr4.de == 1 && (src1 == 4 || src1 == 5)) || src1 >= 8) { @@ -1039,9 +1140,11 @@ let {{ } else if (dr7.gd) { fault = new DebugException(); } else { - DestReg = merge(DestReg, DebugSrc1, dataSize); + %s } ''' + code = rdrCode % "DestReg = merge(DestReg, DebugSrc1, dataSize);" + big_code = rdrCode % "DestReg = DebugSrc1 & mask(dataSize * 8);" class Wrdr(RegOp): def __init__(self, dest, src1, flags=None, dataSize="env.dataSize"): @@ -1066,13 +1169,15 @@ let {{ def __init__(self, dest, src1, flags=None, dataSize="env.dataSize"): super(Rdcr, self).__init__(dest, \ src1, "InstRegIndex(NUM_INTREGS)", flags, dataSize) - code = ''' + rdcrCode = ''' if (src1 == 1 || (src1 > 4 && src1 < 8) || (src1 > 8)) { fault = new InvalidOpcode(); } else { - DestReg = merge(DestReg, ControlSrc1, dataSize); + %s } ''' + code = rdcrCode % "DestReg = merge(DestReg, ControlSrc1, dataSize);" + big_code = rdcrCode % "DestReg = ControlSrc1 & mask(dataSize * 8);" class Wrcr(RegOp): def __init__(self, dest, src1, flags=None, dataSize="env.dataSize"): @@ -1154,24 +1259,20 @@ let {{ ''' class Rdbase(SegOp): - code = ''' - DestReg = merge(DestReg, SegBaseSrc1, dataSize); - ''' + code = 'DestReg = merge(DestReg, SegBaseSrc1, dataSize);' + big_code = 'DestReg = SegBaseSrc1 & mask(dataSize * 8);' class Rdlimit(SegOp): - code = ''' - DestReg = merge(DestReg, SegLimitSrc1, dataSize); - ''' + code = 'DestReg = merge(DestReg, SegLimitSrc1, dataSize);' + big_code = 'DestReg = SegLimitSrc1 & mask(dataSize * 8);' class RdAttr(SegOp): - code = ''' - DestReg = merge(DestReg, SegAttrSrc1, dataSize); - ''' + code = 'DestReg = merge(DestReg, SegAttrSrc1, dataSize);' + big_code = 'DestReg = SegAttrSrc1 & mask(dataSize * 8);' class Rdsel(SegOp): - code = ''' - DestReg = merge(DestReg, SegSelSrc1, dataSize); - ''' + code = 'DestReg = merge(DestReg, SegSelSrc1, dataSize);' + big_code = 'DestReg = SegSelSrc1 & mask(dataSize * 8);' class Rdval(RegOp): def __init__(self, dest, src1, flags=None, dataSize="env.dataSize"): diff --git a/src/arch/x86/microcode_rom.hh b/src/arch/x86/microcode_rom.hh index f8ad410ce..84c503bb9 100644 --- a/src/arch/x86/microcode_rom.hh +++ b/src/arch/x86/microcode_rom.hh @@ -32,6 +32,7 @@ #define 
__ARCH_X86_MICROCODE_ROM_HH__ #include "arch/x86/emulenv.hh" +#include "arch/x86/insts/badmicroop.hh" #include "cpu/static_inst.hh" namespace X86ISAInst @@ -60,8 +61,10 @@ namespace X86ISAInst fetchMicroop(MicroPC microPC, StaticInstPtr curMacroop) { microPC = normalMicroPC(microPC); - assert(microPC < numMicroops); - return genFuncs[microPC](curMacroop); + if (microPC >= numMicroops) + return X86ISA::badMicroop; + else + return genFuncs[microPC](curMacroop); } }; } diff --git a/src/arch/x86/predecoder.hh b/src/arch/x86/predecoder.hh index c06ec18bc..5c67e28e1 100644 --- a/src/arch/x86/predecoder.hh +++ b/src/arch/x86/predecoder.hh @@ -225,7 +225,11 @@ namespace X86ISA { assert(emiIsReady); emiIsReady = false; - nextPC.npc(nextPC.pc() + getInstSize()); + if (!nextPC.size()) { + Addr size = getInstSize(); + nextPC.size(size); + nextPC.npc(nextPC.pc() + size); + } return emi; } }; diff --git a/src/arch/x86/types.hh b/src/arch/x86/types.hh index 5a208446a..4641141d3 100644 --- a/src/arch/x86/types.hh +++ b/src/arch/x86/types.hh @@ -222,7 +222,61 @@ namespace X86ISA return true; } - typedef GenericISA::UPCState<MachInst> PCState; + class PCState : public GenericISA::UPCState<MachInst> + { + protected: + typedef GenericISA::UPCState<MachInst> Base; + + uint8_t _size; + + public: + void + set(Addr val) + { + Base::set(val); + _size = 0; + } + + PCState() {} + PCState(Addr val) { set(val); } + + uint8_t size() const { return _size; } + void size(uint8_t newSize) { _size = newSize; } + + bool + branching() const + { + return this->npc() != this->pc() + size(); + } + + void + advance() + { + Base::advance(); + _size = 0; + } + + void + uEnd() + { + Base::uEnd(); + _size = 0; + } + + void + serialize(std::ostream &os) + { + Base::serialize(os); + SERIALIZE_SCALAR(_size); + } + + void + unserialize(Checkpoint *cp, const std::string §ion) + { + Base::unserialize(cp, section); + UNSERIALIZE_SCALAR(_size); + } + }; struct CoreSpecific { int core_type; diff --git a/src/cpu/o3/fetch_impl.hh b/src/cpu/o3/fetch_impl.hh index 2e4e4819e..d2cde496e 100644 --- a/src/cpu/o3/fetch_impl.hh +++ b/src/cpu/o3/fetch_impl.hh @@ -1070,6 +1070,8 @@ DefaultFetch<Impl>::fetch(bool &status_change) Addr pcOffset = fetchOffset[tid]; Addr fetchAddr = (thisPC.instAddr() + pcOffset) & BaseCPU::PCMask; + bool inRom = isRomMicroPC(thisPC.microPC()); + // If returning from the delay of a cache miss, then update the status // to running, otherwise do the cache access. Possibly move this up // to tick() function. @@ -1083,7 +1085,7 @@ DefaultFetch<Impl>::fetch(bool &status_change) Addr block_PC = icacheBlockAlignPC(fetchAddr); // Unless buffer already got the block, fetch it from icache. - if (!cacheDataValid[tid] || block_PC != cacheDataPC[tid]) { + if (!(cacheDataValid[tid] && block_PC == cacheDataPC[tid]) && !inRom) { DPRINTF(Fetch, "[tid:%i]: Attempting to translate and read " "instruction, starting at PC %s.\n", tid, thisPC); @@ -1155,7 +1157,7 @@ DefaultFetch<Impl>::fetch(bool &status_change) !predictedBranch) { // If we need to process more memory, do it now. - if (!curMacroop && !predecoder.extMachInstReady()) { + if (!(curMacroop || inRom) && !predecoder.extMachInstReady()) { if (ISA_HAS_DELAY_SLOT && pcOffset == 0) { // Walk past any annulled delay slot instructions. Addr pcAddr = thisPC.instAddr() & BaseCPU::PCMask; @@ -1181,7 +1183,7 @@ DefaultFetch<Impl>::fetch(bool &status_change) // Extract as many instructions and/or microops as we can from // the memory we've processed so far. 
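The PCState subclass added in types.hh threads the decoded instruction length through the pipeline so that branching() can be answered without re-consulting the predecoder, which itself now fills the size in exactly once per instruction. A condensed, self-contained version of the idea (the real class layers on GenericISA::UPCState and also resets _size in advance()/uEnd() and on serialization):

    #include <cstdint>
    typedef uint64_t Addr;

    class PCState {
        Addr _pc = 0, _npc = 0;
        uint8_t _size = 0;
      public:
        Addr pc() const { return _pc; }
        Addr npc() const { return _npc; }
        void npc(Addr a) { _npc = a; }
        uint8_t size() const { return _size; }
        void size(uint8_t s) { _size = s; }

        // A fresh target has no length yet; the predecoder supplies
        // it later via size() and npc(pc() + size), as in the
        // predecoder.hh hunk above.
        void set(Addr val) { _pc = val; _npc = val; _size = 0; }

        // An instruction branches iff its next PC is not simply this
        // PC advanced by its own length.
        bool branching() const { return npc() != pc() + size(); }
    };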
do { - if (!curMacroop) { + if (!(curMacroop || inRom)) { if (predecoder.extMachInstReady()) { ExtMachInst extMachInst; @@ -1202,8 +1204,13 @@ DefaultFetch<Impl>::fetch(bool &status_change) break; } } - if (curMacroop) { - staticInst = curMacroop->fetchMicroop(thisPC.microPC()); + if (curMacroop || inRom) { + if (inRom) { + staticInst = cpu->microcodeRom.fetchMicroop( + thisPC.microPC(), curMacroop); + } else { + staticInst = curMacroop->fetchMicroop(thisPC.microPC()); + } if (staticInst->isLastMicroop()) { curMacroop = NULL; pcOffset = 0; diff --git a/src/cpu/o3/inst_queue_impl.hh b/src/cpu/o3/inst_queue_impl.hh index d6da4b818..aa21a0edc 100644 --- a/src/cpu/o3/inst_queue_impl.hh +++ b/src/cpu/o3/inst_queue_impl.hh @@ -749,7 +749,7 @@ InstructionQueue<Impl>::scheduleReadyInsts() DynInstPtr deferred_mem_inst; int total_deferred_mem_issued = 0; while (total_deferred_mem_issued < totalWidth && - (deferred_mem_inst = getDeferredMemInstToExecute()) != NULL) { + (deferred_mem_inst = getDeferredMemInstToExecute()) != 0) { issueToExecuteQueue->access(0)->size++; instsToExecute.push_back(deferred_mem_inst); total_deferred_mem_issued++; diff --git a/src/mem/protocol/MESI_CMP_directory-L1cache.sm b/src/mem/protocol/MESI_CMP_directory-L1cache.sm index 8744a7122..4442cee41 100644 --- a/src/mem/protocol/MESI_CMP_directory-L1cache.sm +++ b/src/mem/protocol/MESI_CMP_directory-L1cache.sm @@ -287,20 +287,21 @@ machine(L1Cache, "MSI Directory L1 Cache CMP") if (in_msg.Type == CacheRequestType:IFETCH) { // ** INSTRUCTION ACCESS *** - // Check to see if it is in the OTHER L1 - Entry L1Dcache_entry := getL1DCacheEntry(in_msg.LineAddress); - if (is_valid(L1Dcache_entry)) { - // The block is in the wrong L1, put the request on the queue to the shared L2 - trigger(Event:L1_Replacement, in_msg.LineAddress, - L1Dcache_entry, L1_TBEs[in_msg.LineAddress]); - } - Entry L1Icache_entry := getL1ICacheEntry(in_msg.LineAddress); if (is_valid(L1Icache_entry)) { // The tag matches for the L1, so the L1 asks the L2 for it. 
trigger(mandatory_request_type_to_event(in_msg.Type), in_msg.LineAddress, L1Icache_entry, L1_TBEs[in_msg.LineAddress]); } else { + + // Check to see if it is in the OTHER L1 + Entry L1Dcache_entry := getL1DCacheEntry(in_msg.LineAddress); + if (is_valid(L1Dcache_entry)) { + // The block is in the wrong L1, put the request on the queue to the shared L2 + trigger(Event:L1_Replacement, in_msg.LineAddress, + L1Dcache_entry, L1_TBEs[in_msg.LineAddress]); + } + if (L1IcacheMemory.cacheAvail(in_msg.LineAddress)) { // L1 does't have the line, but we have space for it in the L1 so let's see if the L2 has it trigger(mandatory_request_type_to_event(in_msg.Type), in_msg.LineAddress, @@ -313,21 +314,23 @@ machine(L1Cache, "MSI Directory L1 Cache CMP") } } } else { - // *** DATA ACCESS *** - // Check to see if it is in the OTHER L1 - Entry L1Icache_entry := getL1ICacheEntry(in_msg.LineAddress); - if (is_valid(L1Icache_entry)) { - // The block is in the wrong L1, put the request on the queue to the shared L2 - trigger(Event:L1_Replacement, in_msg.LineAddress, - L1Icache_entry, L1_TBEs[in_msg.LineAddress]); - } + // *** DATA ACCESS *** Entry L1Dcache_entry := getL1DCacheEntry(in_msg.LineAddress); if (is_valid(L1Dcache_entry)) { // The tag matches for the L1, so the L1 ask the L2 for it trigger(mandatory_request_type_to_event(in_msg.Type), in_msg.LineAddress, L1Dcache_entry, L1_TBEs[in_msg.LineAddress]); } else { + + // Check to see if it is in the OTHER L1 + Entry L1Icache_entry := getL1ICacheEntry(in_msg.LineAddress); + if (is_valid(L1Icache_entry)) { + // The block is in the wrong L1, put the request on the queue to the shared L2 + trigger(Event:L1_Replacement, in_msg.LineAddress, + L1Icache_entry, L1_TBEs[in_msg.LineAddress]); + } + if (L1DcacheMemory.cacheAvail(in_msg.LineAddress)) { // L1 does't have the line, but we have space for it in the L1 let's see if the L2 has it trigger(mandatory_request_type_to_event(in_msg.Type), in_msg.LineAddress, diff --git a/src/mem/protocol/MOESI_CMP_directory-L1cache.sm b/src/mem/protocol/MOESI_CMP_directory-L1cache.sm index 4082f23c9..7f0ab62a8 100644 --- a/src/mem/protocol/MOESI_CMP_directory-L1cache.sm +++ b/src/mem/protocol/MOESI_CMP_directory-L1cache.sm @@ -338,14 +338,6 @@ machine(L1Cache, "Directory protocol") if (in_msg.Type == CacheRequestType:IFETCH) { // ** INSTRUCTION ACCESS *** - Entry L1Dcache_entry := getL1DCacheEntry(in_msg.LineAddress); - // Check to see if it is in the OTHER L1 - if (is_valid(L1Dcache_entry)) { - // The block is in the wrong L1, put the request on the queue to the shared L2 - trigger(Event:L1_Replacement, in_msg.LineAddress, L1Dcache_entry, - TBEs[in_msg.LineAddress]); - } - Entry L1Icache_entry := getL1ICacheEntry(in_msg.LineAddress); if (is_valid(L1Icache_entry)) { // The tag matches for the L1, so the L1 asks the L2 for it. 
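All four protocol files in this patch get the same restructuring, shown above for MESI_CMP_directory and continuing below for the others: the check for the block sitting in the other L1 moves inside the miss branch of the correct L1's lookup, so it only runs once the correct cache has actually missed. The control flow, reduced to a C++ sketch with stand-ins for the SLICC entities (the data-access path is symmetric):

    struct Entry { bool valid = false; };

    static void triggerMandatoryRequest() { /* ask the L2 */ }
    static void triggerL1Replacement()    { /* evict toward L2 */ }
    static void triggerL1IReplacement()   { /* make room in L1I */ }
    static bool l1iHasSpace()             { return true; }

    void handleIfetch(const Entry &l1i, const Entry &l1d)
    {
        if (l1i.valid) {
            // Hit in the correct L1: issue the request immediately.
            triggerMandatoryRequest();
        } else {
            if (l1d.valid)
                triggerL1Replacement();      // block is in the other L1
            if (l1iHasSpace())
                triggerMandatoryRequest();   // miss, room available
            else
                triggerL1IReplacement();     // free a line first
        }
    }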
@@ -353,6 +345,14 @@ machine(L1Cache, "Directory protocol") in_msg.LineAddress, L1Icache_entry, TBEs[in_msg.LineAddress]); } else { + + Entry L1Dcache_entry := getL1DCacheEntry(in_msg.LineAddress); + // Check to see if it is in the OTHER L1 + if (is_valid(L1Dcache_entry)) { + // The block is in the wrong L1, put the request on the queue to the shared L2 + trigger(Event:L1_Replacement, in_msg.LineAddress, L1Dcache_entry, + TBEs[in_msg.LineAddress]); + } if (L1IcacheMemory.cacheAvail(in_msg.LineAddress)) { // L1 does't have the line, but we have space for it in the L1 so let's see if the L2 has it trigger(mandatory_request_type_to_event(in_msg.Type), @@ -369,14 +369,6 @@ machine(L1Cache, "Directory protocol") } else { // *** DATA ACCESS *** - Entry L1Icache_entry := getL1ICacheEntry(in_msg.LineAddress); - // Check to see if it is in the OTHER L1 - if (is_valid(L1Icache_entry)) { - // The block is in the wrong L1, put the request on the queue to the shared L2 - trigger(Event:L1_Replacement, in_msg.LineAddress, - L1Icache_entry, TBEs[in_msg.LineAddress]); - } - Entry L1Dcache_entry := getL1DCacheEntry(in_msg.LineAddress); if (is_valid(L1Dcache_entry)) { // The tag matches for the L1, so the L1 ask the L2 for it @@ -384,6 +376,14 @@ machine(L1Cache, "Directory protocol") in_msg.LineAddress, L1Dcache_entry, TBEs[in_msg.LineAddress]); } else { + + Entry L1Icache_entry := getL1ICacheEntry(in_msg.LineAddress); + // Check to see if it is in the OTHER L1 + if (is_valid(L1Icache_entry)) { + // The block is in the wrong L1, put the request on the queue to the shared L2 + trigger(Event:L1_Replacement, in_msg.LineAddress, + L1Icache_entry, TBEs[in_msg.LineAddress]); + } if (L1DcacheMemory.cacheAvail(in_msg.LineAddress)) { // L1 does't have the line, but we have space for it in the L1 let's see if the L2 has it trigger(mandatory_request_type_to_event(in_msg.Type), diff --git a/src/mem/protocol/MOESI_CMP_token-L1cache.sm b/src/mem/protocol/MOESI_CMP_token-L1cache.sm index 00e9404c9..226f21374 100644 --- a/src/mem/protocol/MOESI_CMP_token-L1cache.sm +++ b/src/mem/protocol/MOESI_CMP_token-L1cache.sm @@ -647,20 +647,21 @@ machine(L1Cache, "Token protocol") if (in_msg.Type == CacheRequestType:IFETCH) { // ** INSTRUCTION ACCESS *** - // Check to see if it is in the OTHER L1 - Entry L1Dcache_entry := getL1DCacheEntry(in_msg.LineAddress); - if (is_valid(L1Dcache_entry)) { - // The block is in the wrong L1, try to write it to the L2 - trigger(Event:L1_Replacement, in_msg.LineAddress, - L1Dcache_entry, tbe); - } - Entry L1Icache_entry := getL1ICacheEntry(in_msg.LineAddress); if (is_valid(L1Icache_entry)) { // The tag matches for the L1, so the L1 fetches the line. 
We know it can't be in the L2 due to exclusion trigger(mandatory_request_type_to_event(in_msg.Type), in_msg.LineAddress, L1Icache_entry, tbe); } else { + + // Check to see if it is in the OTHER L1 + Entry L1Dcache_entry := getL1DCacheEntry(in_msg.LineAddress); + if (is_valid(L1Dcache_entry)) { + // The block is in the wrong L1, try to write it to the L2 + trigger(Event:L1_Replacement, in_msg.LineAddress, + L1Dcache_entry, tbe); + } + if (L1IcacheMemory.cacheAvail(in_msg.LineAddress)) { // L1 does't have the line, but we have space for it in the L1 trigger(mandatory_request_type_to_event(in_msg.Type), @@ -676,21 +677,21 @@ machine(L1Cache, "Token protocol") } else { // *** DATA ACCESS *** - // Check to see if it is in the OTHER L1 - Entry L1Icache_entry := getL1ICacheEntry(in_msg.LineAddress); - - if (is_valid(L1Icache_entry)) { - // The block is in the wrong L1, try to write it to the L2 - trigger(Event:L1_Replacement, in_msg.LineAddress, - L1Icache_entry, tbe); - } - Entry L1Dcache_entry := getL1DCacheEntry(in_msg.LineAddress); if (is_valid(L1Dcache_entry)) { // The tag matches for the L1, so the L1 fetches the line. We know it can't be in the L2 due to exclusion trigger(mandatory_request_type_to_event(in_msg.Type), in_msg.LineAddress, L1Dcache_entry, tbe); } else { + + // Check to see if it is in the OTHER L1 + Entry L1Icache_entry := getL1ICacheEntry(in_msg.LineAddress); + if (is_valid(L1Icache_entry)) { + // The block is in the wrong L1, try to write it to the L2 + trigger(Event:L1_Replacement, in_msg.LineAddress, + L1Icache_entry, tbe); + } + if (L1DcacheMemory.cacheAvail(in_msg.LineAddress)) { // L1 does't have the line, but we have space for it in the L1 trigger(mandatory_request_type_to_event(in_msg.Type), diff --git a/src/mem/protocol/MOESI_hammer-cache.sm b/src/mem/protocol/MOESI_hammer-cache.sm index 78bc9e3e7..ab2a6acf4 100644 --- a/src/mem/protocol/MOESI_hammer-cache.sm +++ b/src/mem/protocol/MOESI_hammer-cache.sm @@ -377,26 +377,26 @@ machine(L1Cache, "AMD Hammer-like protocol") if (in_msg.Type == CacheRequestType:IFETCH) { // ** INSTRUCTION ACCESS *** - // Check to see if it is in the OTHER L1 - Entry L1Dcache_entry := getL1DCacheEntry(in_msg.LineAddress); - if (is_valid(L1Dcache_entry)) { - // The block is in the wrong L1, try to write it to the L2 - if (L2cacheMemory.cacheAvail(in_msg.LineAddress)) { - trigger(Event:L1_to_L2, in_msg.LineAddress, L1Dcache_entry, tbe); - } else { - trigger(Event:L2_Replacement, - L2cacheMemory.cacheProbe(in_msg.LineAddress), - getL2CacheEntry(L2cacheMemory.cacheProbe(in_msg.LineAddress)), - TBEs[L2cacheMemory.cacheProbe(in_msg.LineAddress)]); - } - } - Entry L1Icache_entry := getL1ICacheEntry(in_msg.LineAddress); if (is_valid(L1Icache_entry)) { // The tag matches for the L1, so the L1 fetches the line. 
We know it can't be in the L2 due to exclusion trigger(mandatory_request_type_to_event(in_msg.Type), in_msg.LineAddress, L1Icache_entry, tbe); } else { + // Check to see if it is in the OTHER L1 + Entry L1Dcache_entry := getL1DCacheEntry(in_msg.LineAddress); + if (is_valid(L1Dcache_entry)) { + // The block is in the wrong L1, try to write it to the L2 + if (L2cacheMemory.cacheAvail(in_msg.LineAddress)) { + trigger(Event:L1_to_L2, in_msg.LineAddress, L1Dcache_entry, tbe); + } else { + trigger(Event:L2_Replacement, + L2cacheMemory.cacheProbe(in_msg.LineAddress), + getL2CacheEntry(L2cacheMemory.cacheProbe(in_msg.LineAddress)), + TBEs[L2cacheMemory.cacheProbe(in_msg.LineAddress)]); + } + } + if (L1IcacheMemory.cacheAvail(in_msg.LineAddress)) { // L1 does't have the line, but we have space for it in the L1 @@ -430,26 +430,27 @@ machine(L1Cache, "AMD Hammer-like protocol") } else { // *** DATA ACCESS *** - // Check to see if it is in the OTHER L1 - Entry L1Icache_entry := getL1ICacheEntry(in_msg.LineAddress); - if (is_valid(L1Icache_entry)) { - // The block is in the wrong L1, try to write it to the L2 - if (L2cacheMemory.cacheAvail(in_msg.LineAddress)) { - trigger(Event:L1_to_L2, in_msg.LineAddress, L1Icache_entry, tbe); - } else { - trigger(Event:L2_Replacement, - L2cacheMemory.cacheProbe(in_msg.LineAddress), - getL2CacheEntry(L2cacheMemory.cacheProbe(in_msg.LineAddress)), - TBEs[L2cacheMemory.cacheProbe(in_msg.LineAddress)]); - } - } - Entry L1Dcache_entry := getL1DCacheEntry(in_msg.LineAddress); if (is_valid(L1Dcache_entry)) { // The tag matches for the L1, so the L1 fetches the line. We know it can't be in the L2 due to exclusion trigger(mandatory_request_type_to_event(in_msg.Type), in_msg.LineAddress, L1Dcache_entry, tbe); } else { + + // Check to see if it is in the OTHER L1 + Entry L1Icache_entry := getL1ICacheEntry(in_msg.LineAddress); + if (is_valid(L1Icache_entry)) { + // The block is in the wrong L1, try to write it to the L2 + if (L2cacheMemory.cacheAvail(in_msg.LineAddress)) { + trigger(Event:L1_to_L2, in_msg.LineAddress, L1Icache_entry, tbe); + } else { + trigger(Event:L2_Replacement, + L2cacheMemory.cacheProbe(in_msg.LineAddress), + getL2CacheEntry(L2cacheMemory.cacheProbe(in_msg.LineAddress)), + TBEs[L2cacheMemory.cacheProbe(in_msg.LineAddress)]); + } + } + if (L1DcacheMemory.cacheAvail(in_msg.LineAddress)) { // L1 does't have the line, but we have space for it in the L1 Entry L2cache_entry := getL2CacheEntry(in_msg.LineAddress); diff --git a/src/mem/ruby/buffers/MessageBuffer.cc b/src/mem/ruby/buffers/MessageBuffer.cc index f6b79c580..225595005 100644 --- a/src/mem/ruby/buffers/MessageBuffer.cc +++ b/src/mem/ruby/buffers/MessageBuffer.cc @@ -58,6 +58,8 @@ MessageBuffer::MessageBuffer(const string &name) m_name = name; m_stall_msg_map.clear(); + m_input_link_id = 0; + m_vnet_id = 0; } int @@ -228,6 +230,7 @@ MessageBuffer::enqueue(MsgPtr message, Time delta) // Schedule the wakeup if (m_consumer_ptr != NULL) { g_eventQueue_ptr->scheduleEventAbsolute(m_consumer_ptr, arrival_time); + m_consumer_ptr->storeEventInfo(m_vnet_id); } else { panic("No consumer: %s name: %s\n", *this, m_name); } diff --git a/src/mem/ruby/buffers/MessageBuffer.hh b/src/mem/ruby/buffers/MessageBuffer.hh index 62cc65670..88df5b788 100644 --- a/src/mem/ruby/buffers/MessageBuffer.hh +++ b/src/mem/ruby/buffers/MessageBuffer.hh @@ -142,6 +142,9 @@ class MessageBuffer void printStats(std::ostream& out); void clearStats() { m_not_avail_count = 0; m_msg_counter = 0; } + void setIncomingLink(int link_id) { 
diff --git a/src/mem/ruby/common/Consumer.hh b/src/mem/ruby/common/Consumer.hh
index c1f8bc42e..a119abb39 100644
--- a/src/mem/ruby/common/Consumer.hh
+++ b/src/mem/ruby/common/Consumer.hh
@@ -67,6 +67,7 @@ class Consumer

     virtual void wakeup() = 0;
     virtual void print(std::ostream& out) const = 0;
+    virtual void storeEventInfo(int info) {}

     const Time&
     getLastScheduledWakeup() const
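Note that storeEventInfo() is a virtual with an empty default body rather than a pure virtual: every existing Consumer keeps compiling unchanged, and only subclasses that care about per-vnet arrival counts override it. A compact, compilable sketch of the pattern (ToyConsumer and CountingSwitch are made-up names):

    #include <vector>

    struct ToyConsumer {
        virtual ~ToyConsumer() {}
        virtual void storeEventInfo(int) {}        // no-op by default
    };

    struct CountingSwitch : ToyConsumer {
        std::vector<int> pending;                  // one counter per vnet
        explicit CountingSwitch(int vnets) : pending(vnets, 0) {}
        void storeEventInfo(int vnet) { ++pending[vnet]; }
    };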
diff --git a/src/mem/ruby/network/simple/PerfectSwitch.cc b/src/mem/ruby/network/simple/PerfectSwitch.cc
index 7229c724f..5c461c63f 100644
--- a/src/mem/ruby/network/simple/PerfectSwitch.cc
+++ b/src/mem/ruby/network/simple/PerfectSwitch.cc
@@ -54,6 +54,11 @@ PerfectSwitch::PerfectSwitch(SwitchID sid, SimpleNetwork* network_ptr)
     m_round_robin_start = 0;
     m_network_ptr = network_ptr;
     m_wakeups_wo_switch = 0;
+
+    for (int i = 0; i < m_virtual_networks; ++i)
+    {
+        m_pending_message_count.push_back(0);
+    }
 }

 void
@@ -62,12 +67,15 @@ PerfectSwitch::addInPort(const vector<MessageBuffer*>& in)
     assert(in.size() == m_virtual_networks);
     NodeID port = m_in.size();
     m_in.push_back(in);
+
     for (int j = 0; j < m_virtual_networks; j++) {
         m_in[port][j]->setConsumer(this);
         string desc = csprintf("[Queue from port %s %s %s to PerfectSwitch]",
             NodeIDToString(m_switch_id), NodeIDToString(port),
             NodeIDToString(j));
         m_in[port][j]->setDescription(desc);
+        m_in[port][j]->setIncomingLink(port);
+        m_in[port][j]->setVnet(j);
     }
 }
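In addInPort() every incoming buffer is now stamped with coordinates the switch already knows, its physical link (port) and its virtual network (j), so later notifications need no reverse lookup. A self-contained sketch of the same wiring under assumed toy types; the wakeup changes that consume these counters follow in the next hunk:

    #include <cstddef>
    #include <vector>

    struct ToyBuffer {
        int link, vnet;
        ToyBuffer() : link(-1), vnet(-1) {}
        void setIncomingLink(int l) { link = l; }
        void setVnet(int v) { vnet = v; }
    };

    // Attach one port's worth of per-vnet buffers and label each one.
    void addInPort(std::vector<std::vector<ToyBuffer*> > &in,
                   const std::vector<ToyBuffer*> &port)
    {
        int portId = static_cast<int>(in.size());
        in.push_back(port);
        for (std::size_t j = 0; j < port.size(); ++j) {
            in[portId][j]->setIncomingLink(portId);       // which link
            in[portId][j]->setVnet(static_cast<int>(j));  // which vnet
        }
    }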
@@ -154,161 +162,170 @@ PerfectSwitch::wakeup()
         m_round_robin_start = 0;
     }

-    // for all input ports, use round robin scheduling
-    for (int counter = 0; counter < m_in.size(); counter++) {
-        // Round robin scheduling
-        incoming++;
-        if (incoming >= m_in.size()) {
-            incoming = 0;
-        }
+    if (m_pending_message_count[vnet] > 0) {
+        // for all input ports, use round robin scheduling
+        for (int counter = 0; counter < m_in.size(); counter++) {
+            // Round robin scheduling
+            incoming++;
+            if (incoming >= m_in.size()) {
+                incoming = 0;
+            }

-        // temporary vectors to store the routing results
-        vector<LinkID> output_links;
-        vector<NetDest> output_link_destinations;
-
-        // Is there a message waiting?
-        while (m_in[incoming][vnet]->isReady()) {
-            DPRINTF(RubyNetwork, "incoming: %d\n", incoming);
-
-            // Peek at message
-            msg_ptr = m_in[incoming][vnet]->peekMsgPtr();
-            net_msg_ptr = safe_cast<NetworkMessage*>(msg_ptr.get());
-            DPRINTF(RubyNetwork, "Message: %s\n", (*net_msg_ptr));
-
-            output_links.clear();
-            output_link_destinations.clear();
-            NetDest msg_dsts =
-                net_msg_ptr->getInternalDestination();
-
-            // Unfortunately, the token-protocol sends some
-            // zero-destination messages, so this assert isn't valid
-            // assert(msg_dsts.count() > 0);
-
-            assert(m_link_order.size() == m_routing_table.size());
-            assert(m_link_order.size() == m_out.size());
-
-            if (m_network_ptr->getAdaptiveRouting()) {
-                if (m_network_ptr->isVNetOrdered(vnet)) {
-                    // Don't adaptively route
-                    for (int out = 0; out < m_out.size(); out++) {
-                        m_link_order[out].m_link = out;
-                        m_link_order[out].m_value = 0;
-                    }
-                } else {
-                    // Find how clogged each link is
-                    for (int out = 0; out < m_out.size(); out++) {
-                        int out_queue_length = 0;
-                        for (int v = 0; v < m_virtual_networks; v++) {
-                            out_queue_length += m_out[out][v]->getSize();
-                        }
-                        int value =
-                            (out_queue_length << 8) | (random() & 0xff);
-                        m_link_order[out].m_link = out;
-                        m_link_order[out].m_value = value;
-                    }
-
-                    // Look at the most empty link first
-                    sort(m_link_order.begin(), m_link_order.end());
-                }
-            }
+            // temporary vectors to store the routing results
+            vector<LinkID> output_links;
+            vector<NetDest> output_link_destinations;
+
+            // Is there a message waiting?
+            while (m_in[incoming][vnet]->isReady()) {
+                DPRINTF(RubyNetwork, "incoming: %d\n", incoming);
+
+                // Peek at message
+                msg_ptr = m_in[incoming][vnet]->peekMsgPtr();
+                net_msg_ptr = safe_cast<NetworkMessage*>(msg_ptr.get());
+                DPRINTF(RubyNetwork, "Message: %s\n", (*net_msg_ptr));
+
+                output_links.clear();
+                output_link_destinations.clear();
+                NetDest msg_dsts =
+                    net_msg_ptr->getInternalDestination();
+
+                // Unfortunately, the token-protocol sends some
+                // zero-destination messages, so this assert isn't valid
+                // assert(msg_dsts.count() > 0);
+
+                assert(m_link_order.size() == m_routing_table.size());
+                assert(m_link_order.size() == m_out.size());
+
+                if (m_network_ptr->getAdaptiveRouting()) {
+                    if (m_network_ptr->isVNetOrdered(vnet)) {
+                        // Don't adaptively route
+                        for (int out = 0; out < m_out.size(); out++) {
+                            m_link_order[out].m_link = out;
+                            m_link_order[out].m_value = 0;
+                        }
+                    } else {
+                        // Find how clogged each link is
+                        for (int out = 0; out < m_out.size(); out++) {
+                            int out_queue_length = 0;
+                            for (int v = 0; v < m_virtual_networks; v++) {
+                                out_queue_length += m_out[out][v]->getSize();
+                            }
+                            int value =
+                                (out_queue_length << 8) | (random() & 0xff);
+                            m_link_order[out].m_link = out;
+                            m_link_order[out].m_value = value;
+                        }
+
+                        // Look at the most empty link first
+                        sort(m_link_order.begin(), m_link_order.end());
+                    }
+                }

-            for (int i = 0; i < m_routing_table.size(); i++) {
-                // pick the next link to look at
-                int link = m_link_order[i].m_link;
-                NetDest dst = m_routing_table[link];
-                DPRINTF(RubyNetwork, "dst: %s\n", dst);
-
-                if (!msg_dsts.intersectionIsNotEmpty(dst))
-                    continue;
-
-                // Remember what link we're using
-                output_links.push_back(link);
-
-                // Need to remember which destinations need this
-                // message in another vector. This Set is the
-                // intersection of the routing_table entry and the
-                // current destination set. The intersection must
-                // not be empty, since we are inside "if"
-                output_link_destinations.push_back(msg_dsts.AND(dst));
-
-                // Next, we update the msg_destination not to
-                // include those nodes that were already handled
-                // by this link
-                msg_dsts.removeNetDest(dst);
-            }
+                for (int i = 0; i < m_routing_table.size(); i++) {
+                    // pick the next link to look at
+                    int link = m_link_order[i].m_link;
+                    NetDest dst = m_routing_table[link];
+                    DPRINTF(RubyNetwork, "dst: %s\n", dst);
+
+                    if (!msg_dsts.intersectionIsNotEmpty(dst))
+                        continue;
+
+                    // Remember what link we're using
+                    output_links.push_back(link);
+
+                    // Need to remember which destinations need this
+                    // message in another vector. This Set is the
+                    // intersection of the routing_table entry and the
+                    // current destination set. The intersection must
+                    // not be empty, since we are inside "if"
+                    output_link_destinations.push_back(msg_dsts.AND(dst));
+
+                    // Next, we update the msg_destination not to
+                    // include those nodes that were already handled
+                    // by this link
+                    msg_dsts.removeNetDest(dst);
+                }

-            assert(msg_dsts.count() == 0);
-            //assert(output_links.size() > 0);
-
-            // Check for resources - for all outgoing queues
-            bool enough = true;
-            for (int i = 0; i < output_links.size(); i++) {
-                int outgoing = output_links[i];
-                if (!m_out[outgoing][vnet]->areNSlotsAvailable(1))
-                    enough = false;
-                DPRINTF(RubyNetwork, "Checking if node is blocked\n"
-                        "outgoing: %d, vnet: %d, enough: %d\n",
-                        outgoing, vnet, enough);
-            }
+                assert(msg_dsts.count() == 0);
+                //assert(output_links.size() > 0);
+
+                // Check for resources - for all outgoing queues
+                bool enough = true;
+                for (int i = 0; i < output_links.size(); i++) {
+                    int outgoing = output_links[i];
+                    if (!m_out[outgoing][vnet]->areNSlotsAvailable(1))
+                        enough = false;
+                    DPRINTF(RubyNetwork, "Checking if node is blocked\n"
+                            "outgoing: %d, vnet: %d, enough: %d\n",
+                            outgoing, vnet, enough);
+                }

-            // There were not enough resources
-            if (!enough) {
-                g_eventQueue_ptr->scheduleEvent(this, 1);
-                DPRINTF(RubyNetwork, "Can't deliver message since a node "
-                        "is blocked\n"
-                        "Message: %s\n", (*net_msg_ptr));
-                break; // go to next incoming port
-            }
+                // There were not enough resources
+                if (!enough) {
+                    g_eventQueue_ptr->scheduleEvent(this, 1);
+                    DPRINTF(RubyNetwork, "Can't deliver message since a node "
+                            "is blocked\n"
+                            "Message: %s\n", (*net_msg_ptr));
+                    break; // go to next incoming port
+                }

-            MsgPtr unmodified_msg_ptr;
+                MsgPtr unmodified_msg_ptr;

-            if (output_links.size() > 1) {
-                // If we are sending this message down more than
-                // one link (size>1), we need to make a copy of
-                // the message so each branch can have a different
-                // internal destination we need to create an
-                // unmodified MsgPtr because the MessageBuffer
-                // enqueue func will modify the message
-
-                // This magic line creates a private copy of the
-                // message
-                unmodified_msg_ptr = msg_ptr->clone();
-            }
+                if (output_links.size() > 1) {
+                    // If we are sending this message down more than
+                    // one link (size>1), we need to make a copy of
+                    // the message so each branch can have a different
+                    // internal destination we need to create an
+                    // unmodified MsgPtr because the MessageBuffer
+                    // enqueue func will modify the message
+
+                    // This magic line creates a private copy of the
+                    // message
+                    unmodified_msg_ptr = msg_ptr->clone();
+                }

-            // Enqueue it - for all outgoing queues
-            for (int i=0; i<output_links.size(); i++) {
-                int outgoing = output_links[i];
+                // Enqueue it - for all outgoing queues
+                for (int i=0; i<output_links.size(); i++) {
+                    int outgoing = output_links[i];

-                if (i > 0) {
-                    // create a private copy of the unmodified
-                    // message
-                    msg_ptr = unmodified_msg_ptr->clone();
-                }
+                    if (i > 0) {
+                        // create a private copy of the unmodified
+                        // message
+                        msg_ptr = unmodified_msg_ptr->clone();
+                    }

-                // Change the internal destination set of the
-                // message so it knows which destinations this
-                // link is responsible for.
-                net_msg_ptr = safe_cast<NetworkMessage*>(msg_ptr.get());
-                net_msg_ptr->getInternalDestination() =
-                    output_link_destinations[i];
+                    // Change the internal destination set of the
+                    // message so it knows which destinations this
+                    // link is responsible for.
+                    net_msg_ptr = safe_cast<NetworkMessage*>(msg_ptr.get());
+                    net_msg_ptr->getInternalDestination() =
+                        output_link_destinations[i];

-                // Enqueue msg
-                DPRINTF(RubyNetwork, "Switch: %d enqueuing net msg from "
-                        "inport[%d][%d] to outport [%d][%d] time: %lld.\n",
-                        m_switch_id, incoming, vnet, outgoing, vnet,
-                        g_eventQueue_ptr->getTime());
+                    // Enqueue msg
+                    DPRINTF(RubyNetwork, "Switch: %d enqueuing net msg from "
+                            "inport[%d][%d] to outport [%d][%d] time: %lld.\n",
+                            m_switch_id, incoming, vnet, outgoing, vnet,
+                            g_eventQueue_ptr->getTime());

-                m_out[outgoing][vnet]->enqueue(msg_ptr);
+                    m_out[outgoing][vnet]->enqueue(msg_ptr);
+                }

-            }
-
-            // Dequeue msg
-            m_in[incoming][vnet]->pop();
+                // Dequeue msg
+                m_in[incoming][vnet]->pop();
+                m_pending_message_count[vnet]--;
+            }
         }
     }
 }

 void
+PerfectSwitch::storeEventInfo(int info)
+{
+    m_pending_message_count[info]++;
+}
+
+void
 PerfectSwitch::printStats(std::ostream& out) const
 {
     out << "PerfectSwitch printStats" << endl;
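The wakeup() rewrite wraps the whole round-robin scan in if (m_pending_message_count[vnet] > 0): the counter is bumped by storeEventInfo() on every enqueue and decremented on every pop(), so a vnet with nothing pending now costs one integer compare instead of a walk over every input port. A runnable toy model of that bookkeeping (ToySwitch is illustrative, not the PerfectSwitch API):

    #include <cstddef>
    #include <deque>
    #include <vector>

    struct ToySwitch {
        std::vector<std::deque<int> > queues;  // one toy queue per vnet
        std::vector<int> pending;              // pending messages per vnet

        explicit ToySwitch(int vnets) : queues(vnets), pending(vnets, 0) {}

        // called from the buffer on enqueue, like storeEventInfo() above
        void storeEventInfo(int vnet) { ++pending[vnet]; }

        void wakeup()
        {
            for (std::size_t v = 0; v < queues.size(); ++v) {
                if (pending[v] == 0)
                    continue;              // fast path: skip the empty vnet
                while (!queues[v].empty()) {
                    // routing and delivery would happen here
                    queues[v].pop_front();
                    --pending[v];
                }
            }
        }
    };

The trade-off is that every enqueue and dequeue must keep the counter exact, which is why the pop() path in the hunk above decrements it in lock step.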
diff --git a/src/mem/ruby/network/simple/PerfectSwitch.hh b/src/mem/ruby/network/simple/PerfectSwitch.hh
index a7e577df0..cd0219fd9 100644
--- a/src/mem/ruby/network/simple/PerfectSwitch.hh
+++ b/src/mem/ruby/network/simple/PerfectSwitch.hh
@@ -69,6 +69,7 @@ class PerfectSwitch : public Consumer
     int getOutLinks() const { return m_out.size(); }

     void wakeup();
+    void storeEventInfo(int info);

     void printStats(std::ostream& out) const;
     void clearStats();
@@ -92,6 +93,7 @@ class PerfectSwitch : public Consumer
     int m_round_robin_start;
     int m_wakeups_wo_switch;
     SimpleNetwork* m_network_ptr;
+    std::vector<int> m_pending_message_count;
 };

 inline std::ostream&
diff --git a/src/mem/ruby/slicc_interface/Message.hh b/src/mem/ruby/slicc_interface/Message.hh
index ff94fdd40..7fcfabe9c 100644
--- a/src/mem/ruby/slicc_interface/Message.hh
+++ b/src/mem/ruby/slicc_interface/Message.hh
@@ -57,6 +57,8 @@ class Message : public RefCounted
     virtual Message* clone() const = 0;
     virtual void print(std::ostream& out) const = 0;
+    virtual void setIncomingLink(int) {}
+    virtual void setVnet(int) {}

     void setDelayedCycles(const int& cycles) { m_DelayedCycles = cycles; }
     const int& getDelayedCycles() const {return m_DelayedCycles;}
diff --git a/src/mem/ruby/slicc_interface/NetworkMessage.hh b/src/mem/ruby/slicc_interface/NetworkMessage.hh
index 082481e05..a8f9c625b 100644
--- a/src/mem/ruby/slicc_interface/NetworkMessage.hh
+++ b/src/mem/ruby/slicc_interface/NetworkMessage.hh
@@ -82,9 +82,16 @@ class NetworkMessage : public Message

     virtual void print(std::ostream& out) const = 0;

+    int getIncomingLink() const { return incoming_link; }
+    void setIncomingLink(int link) { incoming_link = link; }
+    int getVnet() const { return vnet; }
+    void setVnet(int net) { vnet = net; }
+
   private:
     NetDest m_internal_dest;
     bool m_internal_dest_valid;
+    int incoming_link;
+    int vnet;
 };

 inline std::ostream&
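Message gains no-op virtual setters so generic code can annotate any message it holds by base pointer, while NetworkMessage actually stores the incoming link and vnet. One subtlety worth noting: the new fields are plain ints with no initializer, so they are only meaningful once the buffer has called the setters. A condensed, hypothetical view of the same split (not the full gem5 classes):

    struct ToyMessage {
        virtual ~ToyMessage() {}
        virtual void setIncomingLink(int) {}   // ignored by plain messages
        virtual void setVnet(int) {}
    };

    struct ToyNetworkMessage : ToyMessage {
        int incoming_link;   // meaningful only after setIncomingLink()
        int vnet;            // meaningful only after setVnet()
        void setIncomingLink(int link) { incoming_link = link; }
        void setVnet(int net) { vnet = net; }
        int getIncomingLink() const { return incoming_link; }
        int getVnet() const { return vnet; }
    };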
diff --git a/src/python/m5/main.py b/src/python/m5/main.py
index cd139ccb3..23a012166 100644
--- a/src/python/m5/main.py
+++ b/src/python/m5/main.py
@@ -61,8 +61,6 @@ add_option('-C', "--copyright", action="store_true", default=False,
     help="Show full copyright information")
 add_option('-R', "--readme", action="store_true", default=False,
     help="Show the readme")
-add_option('-N', "--release-notes", action="store_true", default=False,
-    help="Show the release notes")

 # Options for configuring the base simulator
 add_option('-d', "--outdir", metavar="DIR", default="m5out",
@@ -207,13 +205,6 @@ def main():
         print info.README
         print

-    if options.release_notes:
-        done = True
-        print 'Release Notes:'
-        print
-        print info.RELEASE_NOTES
-        print
-
     if options.trace_help:
         done = True
         check_tracing()