| | | |
|---|---|---|
| author | Tony Gutierrez <anthony.gutierrez@amd.com> | 2016-01-19 14:28:22 -0500 |
| committer | Tony Gutierrez <anthony.gutierrez@amd.com> | 2016-01-19 14:28:22 -0500 |
| commit | 1a7d3f9fcb76a68540dd948f91413533a383bfde (patch) | |
| tree | 867510a147cd095f19499d26b7c02d27de4cae9d /src | |
| parent | 28e353e0403ea379d244a418e8dc8ee0b48187cf (diff) | |
| download | gem5-1a7d3f9fcb76a68540dd948f91413533a383bfde.tar.xz | |
gpu-compute: AMD's baseline GPU model
Diffstat (limited to 'src')
148 files changed, 52249 insertions, 80 deletions
diff --git a/src/SConscript b/src/SConscript index 322212cb7..2bac0bff3 100755 --- a/src/SConscript +++ b/src/SConscript @@ -78,7 +78,7 @@ class SourceMeta(type): def __init__(cls, name, bases, dict): super(SourceMeta, cls).__init__(name, bases, dict) cls.all = [] - + def get(cls, **guards): '''Find all files that match the specified guards. If a source file does not specify a flag, the default is False''' @@ -367,9 +367,9 @@ def makeTheISA(source, target, env): target_isa = env['TARGET_ISA'] def define(isa): return isa.upper() + '_ISA' - + def namespace(isa): - return isa[0].upper() + isa[1:].lower() + 'ISA' + return isa[0].upper() + isa[1:].lower() + 'ISA' code = code_formatter() @@ -407,6 +407,51 @@ def makeTheISA(source, target, env): env.Command('config/the_isa.hh', map(Value, all_isa_list), MakeAction(makeTheISA, Transform("CFG ISA", 0))) +def makeTheGPUISA(source, target, env): + isas = [ src.get_contents() for src in source ] + target_gpu_isa = env['TARGET_GPU_ISA'] + def define(isa): + return isa.upper() + '_ISA' + + def namespace(isa): + return isa[0].upper() + isa[1:].lower() + 'ISA' + + + code = code_formatter() + code('''\ +#ifndef __CONFIG_THE_GPU_ISA_HH__ +#define __CONFIG_THE_GPU_ISA_HH__ + +''') + + # create defines for the preprocessing and compile-time determination + for i,isa in enumerate(isas): + code('#define $0 $1', define(isa), i + 1) + code() + + # create an enum for any run-time determination of the ISA, we + # reuse the same name as the namespaces + code('enum class GPUArch {') + for i,isa in enumerate(isas): + if i + 1 == len(isas): + code(' $0 = $1', namespace(isa), define(isa)) + else: + code(' $0 = $1,', namespace(isa), define(isa)) + code('};') + + code(''' + +#define THE_GPU_ISA ${{define(target_gpu_isa)}} +#define TheGpuISA ${{namespace(target_gpu_isa)}} +#define THE_GPU_ISA_STR "${{target_gpu_isa}}" + +#endif // __CONFIG_THE_GPU_ISA_HH__''') + + code.write(str(target[0])) + +env.Command('config/the_gpu_isa.hh', map(Value, all_gpu_isa_list), + MakeAction(makeTheGPUISA, Transform("CFG ISA", 0))) + ######################################################################## # # Prevent any SimObjects from being added after this point, they @@ -784,7 +829,7 @@ extern "C" { EmbeddedSwig embed_swig_${module}(init_${module}); ''') code.write(str(target[0])) - + # Build all swig modules for swig in SwigSource.all: env.Command([swig.cc_source.tnode, swig.py_source.tnode], swig.tnode, @@ -959,7 +1004,7 @@ const uint8_t data_${sym}[] = { x = array.array('B', data[i:i+step]) code(''.join('%d,' % d for d in x)) code.dedent() - + code('''}; EmbeddedPython embedded_${sym}( diff --git a/src/arch/SConscript b/src/arch/SConscript index e0d6845f5..b022cb01f 100644 --- a/src/arch/SConscript +++ b/src/arch/SConscript @@ -68,6 +68,14 @@ isa_switch_hdrs = Split(''' # Set up this directory to support switching headers make_switching_dir('arch', isa_switch_hdrs, env) +if env['BUILD_GPU']: + gpu_isa_switch_hdrs = Split(''' + gpu_decoder.hh + gpu_types.hh + ''') + + make_gpu_switching_dir('arch', gpu_isa_switch_hdrs, env) + ################################################################# # # Include architecture-specific files. diff --git a/src/arch/hsail/Brig.h b/src/arch/hsail/Brig.h new file mode 100644 index 000000000..b260157ab --- /dev/null +++ b/src/arch/hsail/Brig.h @@ -0,0 +1,67 @@ +// University of Illinois/NCSA +// Open Source License +// +// Copyright (c) 2013, Advanced Micro Devices, Inc. +// All rights reserved. 
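The new makeTheGPUISA() builder above mirrors makeTheISA(): it turns all_gpu_isa_list into config/the_gpu_isa.hh with a compile-time define per ISA, a GPUArch enum for run-time checks, and THE_GPU_ISA/TheGpuISA/THE_GPU_ISA_STR aliases for the configured target. As a reference point, here is a sketch of what the generated header should look like for a build whose only GPU ISA is hsail (the value appended by src/arch/hsail/SConsopts later in this patch); it is reconstructed from the builder, not copied from a real build, and exact whitespace may differ:

```cpp
// Reconstructed config/the_gpu_isa.hh for all_gpu_isa_list == ['hsail'];
// illustrative only, not the literal generated file.
#ifndef __CONFIG_THE_GPU_ISA_HH__
#define __CONFIG_THE_GPU_ISA_HH__

#define HSAIL_ISA 1

// Run-time identification reuses the namespace-style names.
enum class GPUArch {
    HsailISA = HSAIL_ISA
};

#define THE_GPU_ISA HSAIL_ISA
#define TheGpuISA HsailISA
#define THE_GPU_ISA_STR "hsail"

#endif // __CONFIG_THE_GPU_ISA_HH__
```

GPU-model code can then guard ISA-specific paths with checks such as `#if THE_GPU_ISA == HSAIL_ISA`, in the same spirit as the existing config/the_isa.hh handling for CPU ISAs.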
+// +// Developed by: +// +// HSA Team +// +// Advanced Micro Devices, Inc +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of +// this software and associated documentation files (the "Software"), to deal with +// the Software without restriction, including without limitation the rights to +// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +// of the Software, and to permit persons to whom the Software is furnished to do +// so, subject to the following conditions: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimers in the +// documentation and/or other materials provided with the distribution. +// +// * Neither the names of the LLVM Team, University of Illinois at +// Urbana-Champaign, nor the names of its contributors may be used to +// endorse or promote products derived from this Software without specific +// prior written permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +// FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE +// SOFTWARE. +#ifndef INTERNAL_BRIG_H +#define INTERNAL_BRIG_H + +#include <stdint.h> + +namespace Brig { +#include "Brig_new.hpp" + +// These typedefs provide some backward compatibility with earlier versions +// of Brig.h, reducing the number of code changes. The distinct names also +// increase legibility by showing the code's intent. +typedef BrigBase BrigDirective; +typedef BrigBase BrigOperand; + +enum BrigMemoryFenceSegments { // for internal use only + //.mnemo={ s/^BRIG_MEMORY_FENCE_SEGMENT_//;lc } + //.mnemo_token=_EMMemoryFenceSegments + //.mnemo_context=EInstModifierInstFenceContext + BRIG_MEMORY_FENCE_SEGMENT_GLOBAL = 0, + BRIG_MEMORY_FENCE_SEGMENT_GROUP = 1, + BRIG_MEMORY_FENCE_SEGMENT_IMAGE = 2, + BRIG_MEMORY_FENCE_SEGMENT_LAST = 3 //.skip +}; + +} + +#endif // defined(INTERNAL_BRIG_H) diff --git a/src/arch/hsail/Brig_new.hpp b/src/arch/hsail/Brig_new.hpp new file mode 100644 index 000000000..60e6f4dea --- /dev/null +++ b/src/arch/hsail/Brig_new.hpp @@ -0,0 +1,1587 @@ +// University of Illinois/NCSA +// Open Source License +// +// Copyright (c) 2013-2015, Advanced Micro Devices, Inc. +// All rights reserved. +// +// Developed by: +// +// HSA Team +// +// Advanced Micro Devices, Inc +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of +// this software and associated documentation files (the "Software"), to deal with +// the Software without restriction, including without limitation the rights to +// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +// of the Software, and to permit persons to whom the Software is furnished to do +// so, subject to the following conditions: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. 
+// +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimers in the +// documentation and/or other materials provided with the distribution. +// +// * Neither the names of the LLVM Team, University of Illinois at +// Urbana-Champaign, nor the names of its contributors may be used to +// endorse or promote products derived from this Software without specific +// prior written permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +// FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE +// SOFTWARE. + +//.ignore{ + +#ifndef INCLUDED_BRIG_H +#define INCLUDED_BRIG_H + +#include <stdint.h> + +enum BrigAuxDefs { + MAX_OPERANDS_NUM = 6 +}; + +//} + +typedef uint32_t BrigVersion32_t; + +enum BrigVersion { + + //.nowrap + //.nodump + //.nollvm + + BRIG_VERSION_HSAIL_MAJOR = 1, + BRIG_VERSION_HSAIL_MINOR = 0, + BRIG_VERSION_BRIG_MAJOR = 1, + BRIG_VERSION_BRIG_MINOR = 0 +}; + +typedef uint8_t BrigAlignment8_t; //.defValue=BRIG_ALIGNMENT_NONE + +typedef uint8_t BrigAllocation8_t; //.defValue=BRIG_ALLOCATION_NONE + +typedef uint8_t BrigAluModifier8_t; + +typedef uint8_t BrigAtomicOperation8_t; + +typedef uint32_t BrigCodeOffset32_t; //.defValue=0 //.wtype=ItemRef<Code> + +typedef uint8_t BrigCompareOperation8_t; + +typedef uint16_t BrigControlDirective16_t; + +typedef uint32_t BrigDataOffset32_t; + +typedef BrigDataOffset32_t BrigDataOffsetCodeList32_t; //.wtype=ListRef<Code> //.defValue=0 + +typedef BrigDataOffset32_t BrigDataOffsetOperandList32_t; //.wtype=ListRef<Operand> //.defValue=0 + +typedef BrigDataOffset32_t BrigDataOffsetString32_t; //.wtype=StrRef //.defValue=0 + +typedef uint8_t BrigExecutableModifier8_t; + +typedef uint8_t BrigImageChannelOrder8_t; //.defValue=BRIG_CHANNEL_ORDER_UNKNOWN + +typedef uint8_t BrigImageChannelType8_t; //.defValue=BRIG_CHANNEL_TYPE_UNKNOWN + +typedef uint8_t BrigImageGeometry8_t; //.defValue=BRIG_GEOMETRY_UNKNOWN + +typedef uint8_t BrigImageQuery8_t; + +typedef uint16_t BrigKind16_t; + +typedef uint8_t BrigLinkage8_t; //.defValue=BRIG_LINKAGE_NONE + +typedef uint8_t BrigMachineModel8_t; //.defValue=BRIG_MACHINE_LARGE + +typedef uint8_t BrigMemoryModifier8_t; + +typedef uint8_t BrigMemoryOrder8_t; //.defValue=BRIG_MEMORY_ORDER_RELAXED + +typedef uint8_t BrigMemoryScope8_t; //.defValue=BRIG_MEMORY_SCOPE_SYSTEM + +typedef uint16_t BrigOpcode16_t; + +typedef uint32_t BrigOperandOffset32_t; //.defValue=0 //.wtype=ItemRef<Operand> + +typedef uint8_t BrigPack8_t; //.defValue=BRIG_PACK_NONE + +typedef uint8_t BrigProfile8_t; //.defValue=BRIG_PROFILE_FULL + +typedef uint16_t BrigRegisterKind16_t; + +typedef uint8_t BrigRound8_t; //.defValue=BRIG_ROUND_NONE + +typedef uint8_t BrigSamplerAddressing8_t; //.defValue=BRIG_ADDRESSING_CLAMP_TO_EDGE + +typedef uint8_t BrigSamplerCoordNormalization8_t; + +typedef uint8_t BrigSamplerFilter8_t; + +typedef uint8_t BrigSamplerQuery8_t; + +typedef uint32_t BrigSectionIndex32_t; + +typedef uint8_t BrigSegCvtModifier8_t; + +typedef uint8_t BrigSegment8_t; //.defValue=BRIG_SEGMENT_NONE + +typedef uint32_t BrigStringOffset32_t; //.defValue=0 //.wtype=StrRef + +typedef uint16_t 
BrigType16_t; + +typedef uint8_t BrigVariableModifier8_t; + +typedef uint8_t BrigWidth8_t; + +typedef uint32_t BrigExceptions32_t; + +enum BrigKind { + + //.nollvm + // + //.wname={ s/^BRIG_KIND//; MACRO2Name($_) } + //.mnemo=$wname{ $wname } + // + //.sizeof=$wname{ "sizeof(".$structs->{"Brig".$wname}->{rawbrig}.")" } + //.sizeof_switch //.sizeof_proto="int size_of_brig_record(unsigned arg)" //.sizeof_default="return -1" + // + //.isBodyOnly={ "false" } + //.isBodyOnly_switch //.isBodyOnly_proto="bool isBodyOnly(Directive d)" //.isBodyOnly_arg="d.kind()" + //.isBodyOnly_default="assert(false); return false" + // + //.isToplevelOnly={ "false" } + //.isToplevelOnly_switch //.isToplevelOnly_proto="bool isToplevelOnly(Directive d)" //.isToplevelOnly_arg="d.kind()" + //.isToplevelOnly_default="assert(false); return false" + + BRIG_KIND_NONE = 0x0000, //.skip + + BRIG_KIND_DIRECTIVE_BEGIN = 0x1000, //.skip + BRIG_KIND_DIRECTIVE_ARG_BLOCK_END = 0x1000, //.isBodyOnly=true + BRIG_KIND_DIRECTIVE_ARG_BLOCK_START = 0x1001, //.isBodyOnly=true + BRIG_KIND_DIRECTIVE_COMMENT = 0x1002, + BRIG_KIND_DIRECTIVE_CONTROL = 0x1003, //.isBodyOnly=true + BRIG_KIND_DIRECTIVE_EXTENSION = 0x1004, //.isToplevelOnly=true + BRIG_KIND_DIRECTIVE_FBARRIER = 0x1005, + BRIG_KIND_DIRECTIVE_FUNCTION = 0x1006, //.isToplevelOnly=true + BRIG_KIND_DIRECTIVE_INDIRECT_FUNCTION = 0x1007, //.isToplevelOnly=true + BRIG_KIND_DIRECTIVE_KERNEL = 0x1008, //.isToplevelOnly=true + BRIG_KIND_DIRECTIVE_LABEL = 0x1009, //.isBodyOnly=true + BRIG_KIND_DIRECTIVE_LOC = 0x100a, + BRIG_KIND_DIRECTIVE_MODULE = 0x100b, //.isToplevelOnly=true + BRIG_KIND_DIRECTIVE_PRAGMA = 0x100c, + BRIG_KIND_DIRECTIVE_SIGNATURE = 0x100d, //.isToplevelOnly=true + BRIG_KIND_DIRECTIVE_VARIABLE = 0x100e, + BRIG_KIND_DIRECTIVE_END = 0x100f, //.skip + + BRIG_KIND_INST_BEGIN = 0x2000, //.skip + BRIG_KIND_INST_ADDR = 0x2000, + BRIG_KIND_INST_ATOMIC = 0x2001, + BRIG_KIND_INST_BASIC = 0x2002, + BRIG_KIND_INST_BR = 0x2003, + BRIG_KIND_INST_CMP = 0x2004, + BRIG_KIND_INST_CVT = 0x2005, + BRIG_KIND_INST_IMAGE = 0x2006, + BRIG_KIND_INST_LANE = 0x2007, + BRIG_KIND_INST_MEM = 0x2008, + BRIG_KIND_INST_MEM_FENCE = 0x2009, + BRIG_KIND_INST_MOD = 0x200a, + BRIG_KIND_INST_QUERY_IMAGE = 0x200b, + BRIG_KIND_INST_QUERY_SAMPLER = 0x200c, + BRIG_KIND_INST_QUEUE = 0x200d, + BRIG_KIND_INST_SEG = 0x200e, + BRIG_KIND_INST_SEG_CVT = 0x200f, + BRIG_KIND_INST_SIGNAL = 0x2010, + BRIG_KIND_INST_SOURCE_TYPE = 0x2011, + BRIG_KIND_INST_END = 0x2012, //.skip + + BRIG_KIND_OPERAND_BEGIN = 0x3000, //.skip + BRIG_KIND_OPERAND_ADDRESS = 0x3000, + BRIG_KIND_OPERAND_ALIGN = 0x3001, + BRIG_KIND_OPERAND_CODE_LIST = 0x3002, + BRIG_KIND_OPERAND_CODE_REF = 0x3003, + BRIG_KIND_OPERAND_CONSTANT_BYTES = 0x3004, + BRIG_KIND_OPERAND_RESERVED = 0x3005, //.skip + BRIG_KIND_OPERAND_CONSTANT_IMAGE = 0x3006, + BRIG_KIND_OPERAND_CONSTANT_OPERAND_LIST = 0x3007, + BRIG_KIND_OPERAND_CONSTANT_SAMPLER = 0x3008, + BRIG_KIND_OPERAND_OPERAND_LIST = 0x3009, + BRIG_KIND_OPERAND_REGISTER = 0x300a, + BRIG_KIND_OPERAND_STRING = 0x300b, + BRIG_KIND_OPERAND_WAVESIZE = 0x300c, + BRIG_KIND_OPERAND_END = 0x300d //.skip +}; + +enum BrigAlignment { + + //.mnemo={ s/^BRIG_ALIGNMENT_//; lc } + //.mnemo_proto="const char* align2str(unsigned arg)" + // + //.bytes={ /(\d+)/ ? 
$1 : undef } + //.bytes_switch //.bytes_proto="unsigned align2num(unsigned arg)" //.bytes_default="assert(false); return -1" + // + //.rbytes=$bytes{ $bytes } + //.rbytes_switch //.rbytes_reverse //.rbytes_proto="BrigAlignment num2align(uint64_t arg)" + //.rbytes_default="return BRIG_ALIGNMENT_LAST" + // + //.print=$bytes{ $bytes>1 ? "_align($bytes)" : "" } + + BRIG_ALIGNMENT_NONE = 0, //.no_mnemo + BRIG_ALIGNMENT_1 = 1, //.mnemo="" + BRIG_ALIGNMENT_2 = 2, + BRIG_ALIGNMENT_4 = 3, + BRIG_ALIGNMENT_8 = 4, + BRIG_ALIGNMENT_16 = 5, + BRIG_ALIGNMENT_32 = 6, + BRIG_ALIGNMENT_64 = 7, + BRIG_ALIGNMENT_128 = 8, + BRIG_ALIGNMENT_256 = 9, + + BRIG_ALIGNMENT_LAST, //.skip + BRIG_ALIGNMENT_MAX = BRIG_ALIGNMENT_LAST - 1 //.skip +}; + +enum BrigAllocation { + + //.mnemo={ s/^BRIG_ALLOCATION_//;lc } + //.mnemo_token=EAllocKind + + BRIG_ALLOCATION_NONE = 0, //.mnemo="" + BRIG_ALLOCATION_PROGRAM = 1, + BRIG_ALLOCATION_AGENT = 2, + BRIG_ALLOCATION_AUTOMATIC = 3 +}; + +enum BrigAluModifierMask { + BRIG_ALU_FTZ = 1 +}; + +enum BrigAtomicOperation { + + //.tdcaption="Atomic Operations" + // + //.mnemo={ s/^BRIG_ATOMIC_//;lc } + //.mnemo_token=_EMAtomicOp + //.mnemo_context=EInstModifierInstAtomicContext + // + //.print=$mnemo{ "_$mnemo" } + + BRIG_ATOMIC_ADD = 0, + BRIG_ATOMIC_AND = 1, + BRIG_ATOMIC_CAS = 2, + BRIG_ATOMIC_EXCH = 3, + BRIG_ATOMIC_LD = 4, + BRIG_ATOMIC_MAX = 5, + BRIG_ATOMIC_MIN = 6, + BRIG_ATOMIC_OR = 7, + BRIG_ATOMIC_ST = 8, + BRIG_ATOMIC_SUB = 9, + BRIG_ATOMIC_WRAPDEC = 10, + BRIG_ATOMIC_WRAPINC = 11, + BRIG_ATOMIC_XOR = 12, + BRIG_ATOMIC_WAIT_EQ = 13, + BRIG_ATOMIC_WAIT_NE = 14, + BRIG_ATOMIC_WAIT_LT = 15, + BRIG_ATOMIC_WAIT_GTE = 16, + BRIG_ATOMIC_WAITTIMEOUT_EQ = 17, + BRIG_ATOMIC_WAITTIMEOUT_NE = 18, + BRIG_ATOMIC_WAITTIMEOUT_LT = 19, + BRIG_ATOMIC_WAITTIMEOUT_GTE = 20 +}; + +enum BrigCompareOperation { + + //.tdcaption="Comparison Operators" + // + //.mnemo={ s/^BRIG_COMPARE_//;lc } + //.mnemo_token=_EMCompare + // + //.print=$mnemo{ "_$mnemo" } + + BRIG_COMPARE_EQ = 0, + BRIG_COMPARE_NE = 1, + BRIG_COMPARE_LT = 2, + BRIG_COMPARE_LE = 3, + BRIG_COMPARE_GT = 4, + BRIG_COMPARE_GE = 5, + BRIG_COMPARE_EQU = 6, + BRIG_COMPARE_NEU = 7, + BRIG_COMPARE_LTU = 8, + BRIG_COMPARE_LEU = 9, + BRIG_COMPARE_GTU = 10, + BRIG_COMPARE_GEU = 11, + BRIG_COMPARE_NUM = 12, + BRIG_COMPARE_NAN = 13, + BRIG_COMPARE_SEQ = 14, + BRIG_COMPARE_SNE = 15, + BRIG_COMPARE_SLT = 16, + BRIG_COMPARE_SLE = 17, + BRIG_COMPARE_SGT = 18, + BRIG_COMPARE_SGE = 19, + BRIG_COMPARE_SGEU = 20, + BRIG_COMPARE_SEQU = 21, + BRIG_COMPARE_SNEU = 22, + BRIG_COMPARE_SLTU = 23, + BRIG_COMPARE_SLEU = 24, + BRIG_COMPARE_SNUM = 25, + BRIG_COMPARE_SNAN = 26, + BRIG_COMPARE_SGTU = 27 +}; + +enum BrigControlDirective { + + //.mnemo={ s/^BRIG_CONTROL_//;lc } + //.mnemo_token=EControl + // + //.print=$mnemo{ $mnemo } + + BRIG_CONTROL_NONE = 0, //.skip + BRIG_CONTROL_ENABLEBREAKEXCEPTIONS = 1, + BRIG_CONTROL_ENABLEDETECTEXCEPTIONS = 2, + BRIG_CONTROL_MAXDYNAMICGROUPSIZE = 3, + BRIG_CONTROL_MAXFLATGRIDSIZE = 4, + BRIG_CONTROL_MAXFLATWORKGROUPSIZE = 5, + BRIG_CONTROL_REQUIREDDIM = 6, + BRIG_CONTROL_REQUIREDGRIDSIZE = 7, + BRIG_CONTROL_REQUIREDWORKGROUPSIZE = 8, + BRIG_CONTROL_REQUIRENOPARTIALWORKGROUPS = 9 +}; + +enum BrigExecutableModifierMask { + //.nodump + BRIG_EXECUTABLE_DEFINITION = 1 +}; + +enum BrigImageChannelOrder { + + //.mnemo={ s/^BRIG_CHANNEL_ORDER_?//;lc } + //.mnemo_token=EImageOrder + //.mnemo_context=EImageOrderContext + // + //.print=$mnemo{ $mnemo } + + BRIG_CHANNEL_ORDER_A = 0, + BRIG_CHANNEL_ORDER_R = 1, + BRIG_CHANNEL_ORDER_RX = 
2, + BRIG_CHANNEL_ORDER_RG = 3, + BRIG_CHANNEL_ORDER_RGX = 4, + BRIG_CHANNEL_ORDER_RA = 5, + BRIG_CHANNEL_ORDER_RGB = 6, + BRIG_CHANNEL_ORDER_RGBX = 7, + BRIG_CHANNEL_ORDER_RGBA = 8, + BRIG_CHANNEL_ORDER_BGRA = 9, + BRIG_CHANNEL_ORDER_ARGB = 10, + BRIG_CHANNEL_ORDER_ABGR = 11, + BRIG_CHANNEL_ORDER_SRGB = 12, + BRIG_CHANNEL_ORDER_SRGBX = 13, + BRIG_CHANNEL_ORDER_SRGBA = 14, + BRIG_CHANNEL_ORDER_SBGRA = 15, + BRIG_CHANNEL_ORDER_INTENSITY = 16, + BRIG_CHANNEL_ORDER_LUMINANCE = 17, + BRIG_CHANNEL_ORDER_DEPTH = 18, + BRIG_CHANNEL_ORDER_DEPTH_STENCIL = 19, + + // used internally + BRIG_CHANNEL_ORDER_UNKNOWN, //.mnemo="" // used when no order is specified + + BRIG_CHANNEL_ORDER_FIRST_USER_DEFINED = 128 //.skip + +}; + +enum BrigImageChannelType { + + //.mnemo={ s/^BRIG_CHANNEL_TYPE_//;lc } + //.mnemo_token=EImageFormat + // + //.print=$mnemo{ $mnemo } + + BRIG_CHANNEL_TYPE_SNORM_INT8 = 0, + BRIG_CHANNEL_TYPE_SNORM_INT16 = 1, + BRIG_CHANNEL_TYPE_UNORM_INT8 = 2, + BRIG_CHANNEL_TYPE_UNORM_INT16 = 3, + BRIG_CHANNEL_TYPE_UNORM_INT24 = 4, + BRIG_CHANNEL_TYPE_UNORM_SHORT_555 = 5, + BRIG_CHANNEL_TYPE_UNORM_SHORT_565 = 6, + BRIG_CHANNEL_TYPE_UNORM_INT_101010 = 7, + BRIG_CHANNEL_TYPE_SIGNED_INT8 = 8, + BRIG_CHANNEL_TYPE_SIGNED_INT16 = 9, + BRIG_CHANNEL_TYPE_SIGNED_INT32 = 10, + BRIG_CHANNEL_TYPE_UNSIGNED_INT8 = 11, + BRIG_CHANNEL_TYPE_UNSIGNED_INT16 = 12, + BRIG_CHANNEL_TYPE_UNSIGNED_INT32 = 13, + BRIG_CHANNEL_TYPE_HALF_FLOAT = 14, + BRIG_CHANNEL_TYPE_FLOAT = 15, + + // used internally + BRIG_CHANNEL_TYPE_UNKNOWN, //.mnemo="" + + BRIG_CHANNEL_TYPE_FIRST_USER_DEFINED = 128 //.skip +}; + +enum BrigImageGeometry { + + //.tdcaption="Geometry" + // + //.mnemo={ s/^BRIG_GEOMETRY_//;lc } + //.mnemo_token=EImageGeometry + // + //.dim={/_([0-9]+D)(A)?/ ? $1+(defined $2?1:0) : undef} + //.dim_switch //.dim_proto="unsigned getBrigGeometryDim(unsigned geo)" //.dim_arg="geo" + //.dim_default="assert(0); return 0" + // + //.depth={/DEPTH$/?"true":"false"} + //.depth_switch //.depth_proto="bool isBrigGeometryDepth(unsigned geo)" //.depth_arg="geo" + //.depth_default="return false" + + BRIG_GEOMETRY_1D = 0, + BRIG_GEOMETRY_2D = 1, + BRIG_GEOMETRY_3D = 2, + BRIG_GEOMETRY_1DA = 3, + BRIG_GEOMETRY_2DA = 4, + BRIG_GEOMETRY_1DB = 5, + BRIG_GEOMETRY_2DDEPTH = 6, + BRIG_GEOMETRY_2DADEPTH = 7, + + // used internally + BRIG_GEOMETRY_UNKNOWN, //.mnemo="" + + BRIG_GEOMETRY_FIRST_USER_DEFINED = 128 //.skip +}; + +enum BrigImageQuery { + + //.mnemo={ s/^BRIG_IMAGE_QUERY_//;lc } + // + //.print=$mnemo{ $mnemo } + + BRIG_IMAGE_QUERY_WIDTH = 0, + BRIG_IMAGE_QUERY_HEIGHT = 1, + BRIG_IMAGE_QUERY_DEPTH = 2, + BRIG_IMAGE_QUERY_ARRAY = 3, + BRIG_IMAGE_QUERY_CHANNELORDER = 4, + BRIG_IMAGE_QUERY_CHANNELTYPE = 5, + BRIG_IMAGE_QUERY_NUMMIPLEVELS = 6 +}; + +enum BrigLinkage { + + //.mnemo={ s/^BRIG_LINKAGE_//;s/NONE//;lc } + + BRIG_LINKAGE_NONE = 0, + BRIG_LINKAGE_PROGRAM = 1, + BRIG_LINKAGE_MODULE = 2, + BRIG_LINKAGE_FUNCTION = 3, + BRIG_LINKAGE_ARG = 4 +}; + +enum BrigMachineModel { + + //.mnemo={ s/^BRIG_MACHINE_//; '$'.lc } + //.mnemo_token=ETargetMachine + // + //.print=$mnemo{ $mnemo } + + BRIG_MACHINE_SMALL = 0, + BRIG_MACHINE_LARGE = 1, + + BRIG_MACHINE_UNDEF = 2 //.skip +}; + +enum BrigMemoryModifierMask { //.tddef=0 + BRIG_MEMORY_CONST = 1 +}; + +enum BrigMemoryOrder { + + //.mnemo={ s/^BRIG_MEMORY_ORDER_//; lc } + //.mnemo_token=_EMMemoryOrder + // + //.print=$mnemo{ "_$mnemo" } + + BRIG_MEMORY_ORDER_NONE = 0, //.mnemo="" + BRIG_MEMORY_ORDER_RELAXED = 1, //.mnemo=rlx + BRIG_MEMORY_ORDER_SC_ACQUIRE = 2, //.mnemo=scacq + 
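An aside on BrigAlignment, defined earlier in this header: the enumerators are log2-scaled, with BRIG_ALIGNMENT_1 through BRIG_ALIGNMENT_256 taking the values 1 through 9, so the byte count is 2^(value - 1). The //.bytes annotations drive a generated align2num() with the same mapping; the helper below only illustrates the encoding and its name is not part of this patch:

```cpp
#include <cassert>
#include "arch/hsail/Brig.h"

// Illustrative only: BrigAlignment values 1..9 encode 1..256 bytes as
// powers of two; BRIG_ALIGNMENT_NONE (0) carries no byte count.
static unsigned
alignmentToBytes(Brig::BrigAlignment8_t align)
{
    assert(align >= Brig::BRIG_ALIGNMENT_1 &&
           align <= Brig::BRIG_ALIGNMENT_256);
    return 1u << (align - 1);   // e.g. BRIG_ALIGNMENT_8 == 4 -> 8 bytes
}
```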
BRIG_MEMORY_ORDER_SC_RELEASE = 3, //.mnemo=screl + BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE = 4, //.mnemo=scar + + BRIG_MEMORY_ORDER_LAST = 5 //.skip +}; + +enum BrigMemoryScope { + + //.mnemo={ s/^BRIG_MEMORY_SCOPE_//; lc } + //.mnemo_token=_EMMemoryScope + // + //.print=$mnemo{ $mnemo } + + BRIG_MEMORY_SCOPE_NONE = 0, //.mnemo="" + BRIG_MEMORY_SCOPE_WORKITEM = 1, //.mnemo="" + BRIG_MEMORY_SCOPE_WAVEFRONT = 2, //.mnemo=wave + BRIG_MEMORY_SCOPE_WORKGROUP = 3, //.mnemo=wg + BRIG_MEMORY_SCOPE_AGENT = 4, //.mnemo=agent + BRIG_MEMORY_SCOPE_SYSTEM = 5, //.mnemo=system + + BRIG_MEMORY_SCOPE_LAST = 6 //.skip +}; + +enum BrigOpcode { + + //.tdcaption="Instruction Opcodes" + // + //.k={ "BASIC" } + //.pscode=$k{ MACRO2Name("_".$k) } + //.opcodeparser=$pscode{ return $pscode && "parseMnemo$pscode" } + //.opcodeparser_incfile=ParserUtilities + //.opcodeparser_switch //.opcodeparser_proto="OpcodeParser getOpcodeParser(BrigOpcode16_t arg)" //.opcodeparser_default="return parseMnemoBasic" + // + //.psopnd={undef} + //.opndparser=$psopnd{ return $psopnd && "&Parser::parse$psopnd" } + //.opndparser_incfile=ParserUtilities + //.opndparser_switch //.opndparser_proto="Parser::OperandParser Parser::getOperandParser(BrigOpcode16_t arg)" //.opndparser_default="return &Parser::parseOperands" + // + //.mnemo={ s/^BRIG_OPCODE_//; s/GCN([^_])/GCN_$1/; lc } + //.mnemo_scanner=Instructions //.mnemo_token=EInstruction + //.mnemo_context=EDefaultContext + // + //.has_memory_order={undef} + //.semsupport=$has_memory_order{ return $has_memory_order && "true" } + // + //.hasType=$k{ return ($k and $k eq "BASIC_NO_TYPE") ? "false" : undef; } + //.hasType_switch //.hasType_proto="bool instHasType(BrigOpcode16_t arg)" //.hasType_default="return true" + // + //.opcodevis=$pscode{ s/^BRIG_OPCODE_//; sprintf("%-47s(","vis.visitOpcode_".$_) . ($pscode =~m/^(BasicOrMod|Nop)$/? "inst" : "HSAIL_ASM::Inst". ($pscode=~m/BasicNoType/? "Basic":$pscode) ."(inst)").")" } + //.opcodevis_switch //.opcodevis_proto="template <typename RetType, typename Visitor> RetType visitOpcode_gen(HSAIL_ASM::Inst inst, Visitor& vis)" + //.opcodevis_arg="inst.opcode()" //.opcodevis_default="return RetType()" + //.opcodevis_incfile=ItemUtils + // + //.ftz=$k{ return ($k eq "BASIC_OR_MOD" or $k eq "CMP" or $k eq "CVT") ? 
"true" : undef } + //.ftz_incfile=ItemUtils //.ftz_switch //.ftz_proto="inline bool instSupportsFtz(BrigOpcode16_t arg)" //.ftz_default="return false" + // + //.vecOpndIndex={undef} + //.vecOpndIndex_switch //.vecOpndIndex_proto="int vecOpndIndex(BrigOpcode16_t arg)" //.vecOpndIndex_default="return -1" + //.vecOpndIndex_incfile=ParserUtilities + // + //.numdst={undef} + //.numdst_switch //.numdst_proto="int instNumDstOperands(BrigOpcode16_t arg)" //.numdst_default="return 1" + // + //.print=$mnemo{ $mnemo } + + BRIG_OPCODE_NOP = 0, //.k=NOP //.hasType=false + BRIG_OPCODE_ABS = 1, //.k=BASIC_OR_MOD + BRIG_OPCODE_ADD = 2, //.k=BASIC_OR_MOD + BRIG_OPCODE_BORROW = 3, + BRIG_OPCODE_CARRY = 4, + BRIG_OPCODE_CEIL = 5, //.k=BASIC_OR_MOD + BRIG_OPCODE_COPYSIGN = 6, //.k=BASIC_OR_MOD + BRIG_OPCODE_DIV = 7, //.k=BASIC_OR_MOD + BRIG_OPCODE_FLOOR = 8, //.k=BASIC_OR_MOD + BRIG_OPCODE_FMA = 9, //.k=BASIC_OR_MOD + BRIG_OPCODE_FRACT = 10, //.k=BASIC_OR_MOD + BRIG_OPCODE_MAD = 11, //.k=BASIC_OR_MOD + BRIG_OPCODE_MAX = 12, //.k=BASIC_OR_MOD + BRIG_OPCODE_MIN = 13, //.k=BASIC_OR_MOD + BRIG_OPCODE_MUL = 14, //.k=BASIC_OR_MOD + BRIG_OPCODE_MULHI = 15, //.k=BASIC_OR_MOD + BRIG_OPCODE_NEG = 16, //.k=BASIC_OR_MOD + BRIG_OPCODE_REM = 17, + BRIG_OPCODE_RINT = 18, //.k=BASIC_OR_MOD + BRIG_OPCODE_SQRT = 19, //.k=BASIC_OR_MOD + BRIG_OPCODE_SUB = 20, //.k=BASIC_OR_MOD + BRIG_OPCODE_TRUNC = 21, //.k=BASIC_OR_MOD + BRIG_OPCODE_MAD24 = 22, + BRIG_OPCODE_MAD24HI = 23, + BRIG_OPCODE_MUL24 = 24, + BRIG_OPCODE_MUL24HI = 25, + BRIG_OPCODE_SHL = 26, + BRIG_OPCODE_SHR = 27, + BRIG_OPCODE_AND = 28, + BRIG_OPCODE_NOT = 29, + BRIG_OPCODE_OR = 30, + BRIG_OPCODE_POPCOUNT = 31, //.k=SOURCE_TYPE + BRIG_OPCODE_XOR = 32, + BRIG_OPCODE_BITEXTRACT = 33, + BRIG_OPCODE_BITINSERT = 34, + BRIG_OPCODE_BITMASK = 35, + BRIG_OPCODE_BITREV = 36, + BRIG_OPCODE_BITSELECT = 37, + BRIG_OPCODE_FIRSTBIT = 38, //.k=SOURCE_TYPE + BRIG_OPCODE_LASTBIT = 39, //.k=SOURCE_TYPE + BRIG_OPCODE_COMBINE = 40, //.k=SOURCE_TYPE //.vecOpndIndex=1 + BRIG_OPCODE_EXPAND = 41, //.k=SOURCE_TYPE //.vecOpndIndex=0 + BRIG_OPCODE_LDA = 42, //.k=ADDR + BRIG_OPCODE_MOV = 43, + BRIG_OPCODE_SHUFFLE = 44, + BRIG_OPCODE_UNPACKHI = 45, + BRIG_OPCODE_UNPACKLO = 46, + BRIG_OPCODE_PACK = 47, //.k=SOURCE_TYPE + BRIG_OPCODE_UNPACK = 48, //.k=SOURCE_TYPE + BRIG_OPCODE_CMOV = 49, + BRIG_OPCODE_CLASS = 50, //.k=SOURCE_TYPE + BRIG_OPCODE_NCOS = 51, + BRIG_OPCODE_NEXP2 = 52, + BRIG_OPCODE_NFMA = 53, + BRIG_OPCODE_NLOG2 = 54, + BRIG_OPCODE_NRCP = 55, + BRIG_OPCODE_NRSQRT = 56, + BRIG_OPCODE_NSIN = 57, + BRIG_OPCODE_NSQRT = 58, + BRIG_OPCODE_BITALIGN = 59, + BRIG_OPCODE_BYTEALIGN = 60, + BRIG_OPCODE_PACKCVT = 61, //.k=SOURCE_TYPE + BRIG_OPCODE_UNPACKCVT = 62, //.k=SOURCE_TYPE + BRIG_OPCODE_LERP = 63, + BRIG_OPCODE_SAD = 64, //.k=SOURCE_TYPE + BRIG_OPCODE_SADHI = 65, //.k=SOURCE_TYPE + BRIG_OPCODE_SEGMENTP = 66, //.k=SEG_CVT + BRIG_OPCODE_FTOS = 67, //.k=SEG_CVT + BRIG_OPCODE_STOF = 68, //.k=SEG_CVT + BRIG_OPCODE_CMP = 69, //.k=CMP + BRIG_OPCODE_CVT = 70, //.k=CVT + BRIG_OPCODE_LD = 71, //.k=MEM //.has_memory_order //.vecOpndIndex=0 + BRIG_OPCODE_ST = 72, //.k=MEM //.has_memory_order //.vecOpndIndex=0 //.numdst=0 + BRIG_OPCODE_ATOMIC = 73, //.k=ATOMIC + BRIG_OPCODE_ATOMICNORET = 74, //.k=ATOMIC //.numdst=0 + BRIG_OPCODE_SIGNAL = 75, //.k=SIGNAL + BRIG_OPCODE_SIGNALNORET = 76, //.k=SIGNAL //.numdst=0 + BRIG_OPCODE_MEMFENCE = 77, //.k=MEM_FENCE //.numdst=0 + BRIG_OPCODE_RDIMAGE = 78, //.k=IMAGE //.vecOpndIndex=0 + BRIG_OPCODE_LDIMAGE = 79, //.k=IMAGE //.vecOpndIndex=0 + BRIG_OPCODE_STIMAGE = 80, 
//.k=IMAGE //.vecOpndIndex=0 //.numdst=0 + BRIG_OPCODE_IMAGEFENCE = 81, //.k=BASIC_NO_TYPE + BRIG_OPCODE_QUERYIMAGE = 82, //.k=QUERY_IMAGE + BRIG_OPCODE_QUERYSAMPLER = 83, //.k=QUERY_SAMPLER + BRIG_OPCODE_CBR = 84, //.k=BR //.numdst=0 + BRIG_OPCODE_BR = 85, //.k=BR //.numdst=0 //.hasType=false + BRIG_OPCODE_SBR = 86, //.k=BR //.numdst=0 //.psopnd=SbrOperands + BRIG_OPCODE_BARRIER = 87, //.k=BR //.numdst=0 //.hasType=false + BRIG_OPCODE_WAVEBARRIER = 88, //.k=BR //.numdst=0 //.hasType=false + BRIG_OPCODE_ARRIVEFBAR = 89, //.k=BR //.numdst=0 //.hasType=false + BRIG_OPCODE_INITFBAR = 90, //.k=BASIC_NO_TYPE //.numdst=0 //.hasType=false + BRIG_OPCODE_JOINFBAR = 91, //.k=BR //.numdst=0 //.hasType=false + BRIG_OPCODE_LEAVEFBAR = 92, //.k=BR //.numdst=0 //.hasType=false + BRIG_OPCODE_RELEASEFBAR = 93, //.k=BASIC_NO_TYPE //.numdst=0 + BRIG_OPCODE_WAITFBAR = 94, //.k=BR //.numdst=0 //.hasType=false + BRIG_OPCODE_LDF = 95, + BRIG_OPCODE_ACTIVELANECOUNT = 96, //.k=LANE + BRIG_OPCODE_ACTIVELANEID = 97, //.k=LANE + BRIG_OPCODE_ACTIVELANEMASK = 98, //.k=LANE //.vecOpndIndex=0 + BRIG_OPCODE_ACTIVELANEPERMUTE = 99, //.k=LANE + BRIG_OPCODE_CALL = 100, //.k=BR //.psopnd=CallOperands //.numdst=0 //.hasType=false + BRIG_OPCODE_SCALL = 101, //.k=BR //.psopnd=CallOperands //.numdst=0 + BRIG_OPCODE_ICALL = 102, //.k=BR //.psopnd=CallOperands //.numdst=0 + BRIG_OPCODE_RET = 103, //.k=BASIC_NO_TYPE + BRIG_OPCODE_ALLOCA = 104, //.k=MEM + BRIG_OPCODE_CURRENTWORKGROUPSIZE = 105, + BRIG_OPCODE_CURRENTWORKITEMFLATID = 106, + BRIG_OPCODE_DIM = 107, + BRIG_OPCODE_GRIDGROUPS = 108, + BRIG_OPCODE_GRIDSIZE = 109, + BRIG_OPCODE_PACKETCOMPLETIONSIG = 110, + BRIG_OPCODE_PACKETID = 111, + BRIG_OPCODE_WORKGROUPID = 112, + BRIG_OPCODE_WORKGROUPSIZE = 113, + BRIG_OPCODE_WORKITEMABSID = 114, + BRIG_OPCODE_WORKITEMFLATABSID = 115, + BRIG_OPCODE_WORKITEMFLATID = 116, + BRIG_OPCODE_WORKITEMID = 117, + BRIG_OPCODE_CLEARDETECTEXCEPT = 118, //.numdst=0 + BRIG_OPCODE_GETDETECTEXCEPT = 119, + BRIG_OPCODE_SETDETECTEXCEPT = 120, //.numdst=0 + BRIG_OPCODE_ADDQUEUEWRITEINDEX = 121, //.k=QUEUE + BRIG_OPCODE_CASQUEUEWRITEINDEX = 122, //.k=QUEUE + BRIG_OPCODE_LDQUEUEREADINDEX = 123, //.k=QUEUE + BRIG_OPCODE_LDQUEUEWRITEINDEX = 124, //.k=QUEUE + BRIG_OPCODE_STQUEUEREADINDEX = 125, //.k=QUEUE //.numdst=0 + BRIG_OPCODE_STQUEUEWRITEINDEX = 126, //.k=QUEUE //.numdst=0 + BRIG_OPCODE_CLOCK = 127, + BRIG_OPCODE_CUID = 128, + BRIG_OPCODE_DEBUGTRAP = 129, //.numdst=0 + BRIG_OPCODE_GROUPBASEPTR = 130, + BRIG_OPCODE_KERNARGBASEPTR = 131, + BRIG_OPCODE_LANEID = 132, + BRIG_OPCODE_MAXCUID = 133, + BRIG_OPCODE_MAXWAVEID = 134, + BRIG_OPCODE_NULLPTR = 135, //.k=SEG + BRIG_OPCODE_WAVEID = 136, + BRIG_OPCODE_FIRST_USER_DEFINED = 32768, //.skip + + BRIG_OPCODE_GCNMADU = (1u << 15) | 0, //.k=BASIC_NO_TYPE + BRIG_OPCODE_GCNMADS = (1u << 15) | 1, //.k=BASIC_NO_TYPE + BRIG_OPCODE_GCNMAX3 = (1u << 15) | 2, + BRIG_OPCODE_GCNMIN3 = (1u << 15) | 3, + BRIG_OPCODE_GCNMED3 = (1u << 15) | 4, + BRIG_OPCODE_GCNFLDEXP = (1u << 15) | 5, //.k=BASIC_OR_MOD + BRIG_OPCODE_GCNFREXP_EXP = (1u << 15) | 6, //.k=BASIC_OR_MOD + BRIG_OPCODE_GCNFREXP_MANT = (1u << 15) | 7, //.k=BASIC_OR_MOD + BRIG_OPCODE_GCNTRIG_PREOP = (1u << 15) | 8, //.k=BASIC_OR_MOD + BRIG_OPCODE_GCNBFM = (1u << 15) | 9, + BRIG_OPCODE_GCNLD = (1u << 15) | 10, //.k=MEM //.has_memory_order //.vecOpndIndex=0 + BRIG_OPCODE_GCNST = (1u << 15) | 11, //.k=MEM //.has_memory_order //.vecOpndIndex=0 + BRIG_OPCODE_GCNATOMIC = (1u << 15) | 12, //.k=ATOMIC + BRIG_OPCODE_GCNATOMICNORET = (1u << 15) | 13, //.k=ATOMIC 
//.mnemo=gcn_atomicNoRet + BRIG_OPCODE_GCNSLEEP = (1u << 15) | 14, + BRIG_OPCODE_GCNPRIORITY = (1u << 15) | 15, + BRIG_OPCODE_GCNREGIONALLOC = (1u << 15) | 16, //.k=BASIC_NO_TYPE //.mnemo=gcn_region_alloc + BRIG_OPCODE_GCNMSAD = (1u << 15) | 17, + BRIG_OPCODE_GCNQSAD = (1u << 15) | 18, + BRIG_OPCODE_GCNMQSAD = (1u << 15) | 19, + BRIG_OPCODE_GCNMQSAD4 = (1u << 15) | 20, //.k=BASIC_NO_TYPE + BRIG_OPCODE_GCNSADW = (1u << 15) | 21, + BRIG_OPCODE_GCNSADD = (1u << 15) | 22, + BRIG_OPCODE_GCNCONSUME = (1u << 15) | 23, //.k=ADDR //.mnemo=gcn_atomic_consume + BRIG_OPCODE_GCNAPPEND = (1u << 15) | 24, //.k=ADDR //.mnemo=gcn_atomic_append + BRIG_OPCODE_GCNB4XCHG = (1u << 15) | 25, //.mnemo=gcn_b4xchg + BRIG_OPCODE_GCNB32XCHG = (1u << 15) | 26, //.mnemo=gcn_b32xchg + BRIG_OPCODE_GCNMAX = (1u << 15) | 27, + BRIG_OPCODE_GCNMIN = (1u << 15) | 28, + BRIG_OPCODE_GCNDIVRELAXED = (1u << 15) | 29, //.k=BASIC_OR_MOD + BRIG_OPCODE_GCNDIVRELAXEDNARROW = (1u << 15) | 30, + + BRIG_OPCODE_AMDRDIMAGELOD = (1u << 15) | 31, //.k=IMAGE //.mnemo=amd_rdimagelod //.vecOpndIndex=0 + BRIG_OPCODE_AMDRDIMAGEGRAD = (1u << 15) | 32, //.k=IMAGE //.mnemo=amd_rdimagegrad //.vecOpndIndex=0 + BRIG_OPCODE_AMDLDIMAGEMIP = (1u << 15) | 33, //.k=IMAGE //.mnemo=amd_ldimagemip //.vecOpndIndex=0 + BRIG_OPCODE_AMDSTIMAGEMIP = (1u << 15) | 34, //.k=IMAGE //.mnemo=amd_stimagemip //.vecOpndIndex=0 //.numdst=0 + BRIG_OPCODE_AMDQUERYIMAGE = (1u << 15) | 35 //.k=QUERY_IMAGE //.mnemo=amd_queryimage +}; + +enum BrigPack { + + //.tdcaption="Packing" + // + //.mnemo={ s/^BRIG_PACK_//;s/SAT$/_sat/;lc } + //.mnemo_token=_EMPacking + // + //.print=$mnemo{ "_$mnemo" } + + BRIG_PACK_NONE = 0, //.mnemo="" + BRIG_PACK_PP = 1, + BRIG_PACK_PS = 2, + BRIG_PACK_SP = 3, + BRIG_PACK_SS = 4, + BRIG_PACK_S = 5, + BRIG_PACK_P = 6, + BRIG_PACK_PPSAT = 7, + BRIG_PACK_PSSAT = 8, + BRIG_PACK_SPSAT = 9, + BRIG_PACK_SSSAT = 10, + BRIG_PACK_SSAT = 11, + BRIG_PACK_PSAT = 12 +}; + +enum BrigProfile { + + //.mnemo={ s/^BRIG_PROFILE_//;'$'.lc } + //.mnemo_token=ETargetProfile + // + //.print=$mnemo{ $mnemo } + + BRIG_PROFILE_BASE = 0, + BRIG_PROFILE_FULL = 1, + + BRIG_PROFILE_UNDEF = 2 //.skip +}; + +enum BrigRegisterKind { + + //.mnemo={ s/^BRIG_REGISTER_KIND_//;'$'.lc(substr($_,0,1)) } + // + //.bits={ } + //.bits_switch //.bits_proto="unsigned getRegBits(BrigRegisterKind16_t arg)" //.bits_default="return (unsigned)-1" + // + //.nollvm + + BRIG_REGISTER_KIND_CONTROL = 0, //.bits=1 + BRIG_REGISTER_KIND_SINGLE = 1, //.bits=32 + BRIG_REGISTER_KIND_DOUBLE = 2, //.bits=64 + BRIG_REGISTER_KIND_QUAD = 3 //.bits=128 +}; + +enum BrigRound { + + //.mnemo={} + //.mnemo_fn=round2str //.mnemo_token=_EMRound + // + //.sat={/_SAT$/? "true" : "false"} + //.sat_switch //.sat_proto="bool isSatRounding(unsigned rounding)" //.sat_arg="rounding" + //.sat_default="return false" + // + //.sig={/_SIGNALING_/? "true" : "false"} + //.sig_switch //.sig_proto="bool isSignalingRounding(unsigned rounding)" //.sig_arg="rounding" + //.sig_default="return false" + // + //.int={/_INTEGER_/? "true" : "false"} + //.int_switch //.int_proto="bool isIntRounding(unsigned rounding)" //.int_arg="rounding" + //.int_default="return false" + // + //.flt={/_FLOAT_/? 
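One property of the opcode table above worth calling out: every AMD/GCN extension opcode is encoded as (1u << 15) | n, which places it at or above BRIG_OPCODE_FIRST_USER_DEFINED (32768), so a decoder can distinguish standard HSAIL opcodes from vendor extensions with a single comparison. A hedged sketch (the helper name is illustrative, not something this patch defines):

```cpp
#include "arch/hsail/Brig.h"

// Illustrative helper: vendor (GCN/AMD) opcodes all live at or above
// BRIG_OPCODE_FIRST_USER_DEFINED == 32768 == (1u << 15).
static inline bool
isVendorOpcode(Brig::BrigOpcode16_t op)
{
    return op >= Brig::BRIG_OPCODE_FIRST_USER_DEFINED;
}

// isVendorOpcode(Brig::BRIG_OPCODE_GCNMADU) -> true
// isVendorOpcode(Brig::BRIG_OPCODE_LD)      -> false
```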
"true" : "false"} + //.flt_switch //.flt_proto="bool isFloatRounding(unsigned rounding)" //.flt_arg="rounding" + //.flt_default="return false" + // + //.print=$mnemo{ "_$mnemo" } + + BRIG_ROUND_NONE = 0, //.no_mnemo + BRIG_ROUND_FLOAT_DEFAULT = 1, //.no_mnemo + BRIG_ROUND_FLOAT_NEAR_EVEN = 2, //.mnemo=near + BRIG_ROUND_FLOAT_ZERO = 3, //.mnemo=zero + BRIG_ROUND_FLOAT_PLUS_INFINITY = 4, //.mnemo=up + BRIG_ROUND_FLOAT_MINUS_INFINITY = 5, //.mnemo=down + BRIG_ROUND_INTEGER_NEAR_EVEN = 6, //.mnemo=neari + BRIG_ROUND_INTEGER_ZERO = 7, //.mnemo=zeroi + BRIG_ROUND_INTEGER_PLUS_INFINITY = 8, //.mnemo=upi + BRIG_ROUND_INTEGER_MINUS_INFINITY = 9, //.mnemo=downi + BRIG_ROUND_INTEGER_NEAR_EVEN_SAT = 10, //.mnemo=neari_sat + BRIG_ROUND_INTEGER_ZERO_SAT = 11, //.mnemo=zeroi_sat + BRIG_ROUND_INTEGER_PLUS_INFINITY_SAT = 12, //.mnemo=upi_sat + BRIG_ROUND_INTEGER_MINUS_INFINITY_SAT = 13, //.mnemo=downi_sat + BRIG_ROUND_INTEGER_SIGNALING_NEAR_EVEN = 14, //.mnemo=sneari + BRIG_ROUND_INTEGER_SIGNALING_ZERO = 15, //.mnemo=szeroi + BRIG_ROUND_INTEGER_SIGNALING_PLUS_INFINITY = 16, //.mnemo=supi + BRIG_ROUND_INTEGER_SIGNALING_MINUS_INFINITY = 17, //.mnemo=sdowni + BRIG_ROUND_INTEGER_SIGNALING_NEAR_EVEN_SAT = 18, //.mnemo=sneari_sat + BRIG_ROUND_INTEGER_SIGNALING_ZERO_SAT = 19, //.mnemo=szeroi_sat + BRIG_ROUND_INTEGER_SIGNALING_PLUS_INFINITY_SAT = 20, //.mnemo=supi_sat + BRIG_ROUND_INTEGER_SIGNALING_MINUS_INFINITY_SAT = 21 //.mnemo=sdowni_sat +}; + +enum BrigSamplerAddressing { + + //.mnemo={ s/^BRIG_ADDRESSING_//;lc } + //.mnemo_token=ESamplerAddressingMode + + BRIG_ADDRESSING_UNDEFINED = 0, + BRIG_ADDRESSING_CLAMP_TO_EDGE = 1, + BRIG_ADDRESSING_CLAMP_TO_BORDER = 2, + BRIG_ADDRESSING_REPEAT = 3, + BRIG_ADDRESSING_MIRRORED_REPEAT = 4, + + BRIG_ADDRESSING_FIRST_USER_DEFINED = 128 //.skip +}; + +enum BrigSamplerCoordNormalization { + + //.mnemo={ s/^BRIG_COORD_//;lc } + //.mnemo_token=ESamplerCoord + // + //.print=$mnemo{ $mnemo } + + BRIG_COORD_UNNORMALIZED = 0, + BRIG_COORD_NORMALIZED = 1 +}; + +enum BrigSamplerFilter { + + //.mnemo={ s/^BRIG_FILTER_//;lc } + // + //.print=$mnemo{ $mnemo } + + BRIG_FILTER_NEAREST = 0, + BRIG_FILTER_LINEAR = 1, + + BRIG_FILTER_FIRST_USER_DEFINED = 128 //.skip +}; + +enum BrigSamplerQuery { + + //.mnemo={ s/^BRIG_SAMPLER_QUERY_//;lc } + //.mnemo_token=_EMSamplerQuery + // + //.print=$mnemo{ $mnemo } + + BRIG_SAMPLER_QUERY_ADDRESSING = 0, + BRIG_SAMPLER_QUERY_COORD = 1, + BRIG_SAMPLER_QUERY_FILTER = 2 +}; + +enum BrigSectionIndex { + + //.nollvm + // + //.mnemo={ s/^BRIG_SECTION_INDEX_/HSA_/;lc } + + BRIG_SECTION_INDEX_DATA = 0, + BRIG_SECTION_INDEX_CODE = 1, + BRIG_SECTION_INDEX_OPERAND = 2, + BRIG_SECTION_INDEX_BEGIN_IMPLEMENTATION_DEFINED = 3, + + // used internally + BRIG_SECTION_INDEX_IMPLEMENTATION_DEFINED = BRIG_SECTION_INDEX_BEGIN_IMPLEMENTATION_DEFINED //.skip +}; + +enum BrigSegCvtModifierMask { + BRIG_SEG_CVT_NONULL = 1 //.mnemo="nonull" //.print="_nonull" +}; + +enum BrigSegment { + + //.mnemo={ s/^BRIG_SEGMENT_//;lc} + //.mnemo_token=_EMSegment + //.mnemo_context=EInstModifierContext + // + //.print=$mnemo{ $mnemo ? 
"_$mnemo" : "" } + + BRIG_SEGMENT_NONE = 0, //.mnemo="" + BRIG_SEGMENT_FLAT = 1, //.mnemo="" + BRIG_SEGMENT_GLOBAL = 2, + BRIG_SEGMENT_READONLY = 3, + BRIG_SEGMENT_KERNARG = 4, + BRIG_SEGMENT_GROUP = 5, + BRIG_SEGMENT_PRIVATE = 6, + BRIG_SEGMENT_SPILL = 7, + BRIG_SEGMENT_ARG = 8, + + BRIG_SEGMENT_FIRST_USER_DEFINED = 128, //.skip + + BRIG_SEGMENT_AMD_GCN = 9, //.mnemo="region" +}; + +enum BrigPackedTypeBits { + + //.nodump + // + //.nollvm + + BRIG_TYPE_BASE_SIZE = 5, + BRIG_TYPE_PACK_SIZE = 2, + BRIG_TYPE_ARRAY_SIZE = 1, + + BRIG_TYPE_BASE_SHIFT = 0, + BRIG_TYPE_PACK_SHIFT = BRIG_TYPE_BASE_SHIFT + BRIG_TYPE_BASE_SIZE, + BRIG_TYPE_ARRAY_SHIFT = BRIG_TYPE_PACK_SHIFT + BRIG_TYPE_PACK_SIZE, + + BRIG_TYPE_BASE_MASK = ((1 << BRIG_TYPE_BASE_SIZE) - 1) << BRIG_TYPE_BASE_SHIFT, + BRIG_TYPE_PACK_MASK = ((1 << BRIG_TYPE_PACK_SIZE) - 1) << BRIG_TYPE_PACK_SHIFT, + BRIG_TYPE_ARRAY_MASK = ((1 << BRIG_TYPE_ARRAY_SIZE) - 1) << BRIG_TYPE_ARRAY_SHIFT, + + BRIG_TYPE_PACK_NONE = 0 << BRIG_TYPE_PACK_SHIFT, + BRIG_TYPE_PACK_32 = 1 << BRIG_TYPE_PACK_SHIFT, + BRIG_TYPE_PACK_64 = 2 << BRIG_TYPE_PACK_SHIFT, + BRIG_TYPE_PACK_128 = 3 << BRIG_TYPE_PACK_SHIFT, + + BRIG_TYPE_ARRAY = 1 << BRIG_TYPE_ARRAY_SHIFT +}; + +enum BrigType { + + //.numBits={ /ARRAY$/ ? undef : /([0-9]+)X([0-9]+)/ ? $1*$2 : /([0-9]+)/ ? $1 : undef } + //.numBits_switch //.numBits_proto="unsigned getBrigTypeNumBits(unsigned arg)" //.numBits_default="assert(0); return 0" + //.numBytes=$numBits{ $numBits > 1 ? $numBits/8 : undef } + //.numBytes_switch //.numBytes_proto="unsigned getBrigTypeNumBytes(unsigned arg)" //.numBytes_default="assert(0); return 0" + // + //.mnemo={ s/^BRIG_TYPE_//;lc } + //.mnemo_token=_EMType + // + //.array={/ARRAY$/?"true":"false"} + //.array_switch //.array_proto="bool isArrayType(unsigned type)" //.array_arg="type" + //.array_default="return false" + // + //.a2e={/(.*)_ARRAY$/? $1 : "BRIG_TYPE_NONE"} + //.a2e_switch //.a2e_proto="unsigned arrayType2elementType(unsigned type)" //.a2e_arg="type" + //.a2e_default="return BRIG_TYPE_NONE" + // + //.e2a={/_ARRAY$/? "BRIG_TYPE_NONE" : /_NONE$/ ? "BRIG_TYPE_NONE" : /_B1$/ ? "BRIG_TYPE_NONE" : $_ . "_ARRAY"} + //.e2a_switch //.e2a_proto="unsigned elementType2arrayType(unsigned type)" //.e2a_arg="type" + //.e2a_default="return BRIG_TYPE_NONE" + // + //.t2s={s/^BRIG_TYPE_//;lc s/_ARRAY$/[]/;lc} + //.t2s_switch //.t2s_proto="const char* type2name(unsigned type)" //.t2s_arg="type" + //.t2s_default="return NULL" + // + //.dispatch_switch //.dispatch_incfile=TemplateUtilities + //.dispatch_proto="template<typename RetType, typename Visitor>\nRetType dispatchByType_gen(unsigned type, Visitor& v)" + //.dispatch={ /ARRAY$/ ? "v.visitNone(type)" : /^BRIG_TYPE_([BUSF]|SIG)[0-9]+/ ? 
"v.template visit< BrigTypeTraits<$_> >()" : "v.visitNone(type)" } + //.dispatch_arg="type" //.dispatch_default="return v.visitNone(type)" + // + //- .tdname=BrigType + // + //.print=$mnemo{ "_$mnemo" } + + BRIG_TYPE_NONE = 0, //.mnemo="" //.print="" + BRIG_TYPE_U8 = 1, //.ctype=uint8_t + BRIG_TYPE_U16 = 2, //.ctype=uint16_t + BRIG_TYPE_U32 = 3, //.ctype=uint32_t + BRIG_TYPE_U64 = 4, //.ctype=uint64_t + BRIG_TYPE_S8 = 5, //.ctype=int8_t + BRIG_TYPE_S16 = 6, //.ctype=int16_t + BRIG_TYPE_S32 = 7, //.ctype=int32_t + BRIG_TYPE_S64 = 8, //.ctype=int64_t + BRIG_TYPE_F16 = 9, //.ctype=f16_t + BRIG_TYPE_F32 = 10, //.ctype=float + BRIG_TYPE_F64 = 11, //.ctype=double + BRIG_TYPE_B1 = 12, //.ctype=bool //.numBytes=1 + BRIG_TYPE_B8 = 13, //.ctype=uint8_t + BRIG_TYPE_B16 = 14, //.ctype=uint16_t + BRIG_TYPE_B32 = 15, //.ctype=uint32_t + BRIG_TYPE_B64 = 16, //.ctype=uint64_t + BRIG_TYPE_B128 = 17, //.ctype=b128_t + BRIG_TYPE_SAMP = 18, //.mnemo=samp //.numBits=64 + BRIG_TYPE_ROIMG = 19, //.mnemo=roimg //.numBits=64 + BRIG_TYPE_WOIMG = 20, //.mnemo=woimg //.numBits=64 + BRIG_TYPE_RWIMG = 21, //.mnemo=rwimg //.numBits=64 + BRIG_TYPE_SIG32 = 22, //.mnemo=sig32 //.numBits=64 + BRIG_TYPE_SIG64 = 23, //.mnemo=sig64 //.numBits=64 + + BRIG_TYPE_U8X4 = BRIG_TYPE_U8 | BRIG_TYPE_PACK_32, //.ctype=uint8_t + BRIG_TYPE_U8X8 = BRIG_TYPE_U8 | BRIG_TYPE_PACK_64, //.ctype=uint8_t + BRIG_TYPE_U8X16 = BRIG_TYPE_U8 | BRIG_TYPE_PACK_128, //.ctype=uint8_t + BRIG_TYPE_U16X2 = BRIG_TYPE_U16 | BRIG_TYPE_PACK_32, //.ctype=uint16_t + BRIG_TYPE_U16X4 = BRIG_TYPE_U16 | BRIG_TYPE_PACK_64, //.ctype=uint16_t + BRIG_TYPE_U16X8 = BRIG_TYPE_U16 | BRIG_TYPE_PACK_128, //.ctype=uint16_t + BRIG_TYPE_U32X2 = BRIG_TYPE_U32 | BRIG_TYPE_PACK_64, //.ctype=uint32_t + BRIG_TYPE_U32X4 = BRIG_TYPE_U32 | BRIG_TYPE_PACK_128, //.ctype=uint32_t + BRIG_TYPE_U64X2 = BRIG_TYPE_U64 | BRIG_TYPE_PACK_128, //.ctype=uint64_t + BRIG_TYPE_S8X4 = BRIG_TYPE_S8 | BRIG_TYPE_PACK_32, //.ctype=int8_t + BRIG_TYPE_S8X8 = BRIG_TYPE_S8 | BRIG_TYPE_PACK_64, //.ctype=int8_t + BRIG_TYPE_S8X16 = BRIG_TYPE_S8 | BRIG_TYPE_PACK_128, //.ctype=int8_t + BRIG_TYPE_S16X2 = BRIG_TYPE_S16 | BRIG_TYPE_PACK_32, //.ctype=int16_t + BRIG_TYPE_S16X4 = BRIG_TYPE_S16 | BRIG_TYPE_PACK_64, //.ctype=int16_t + BRIG_TYPE_S16X8 = BRIG_TYPE_S16 | BRIG_TYPE_PACK_128, //.ctype=int16_t + BRIG_TYPE_S32X2 = BRIG_TYPE_S32 | BRIG_TYPE_PACK_64, //.ctype=int32_t + BRIG_TYPE_S32X4 = BRIG_TYPE_S32 | BRIG_TYPE_PACK_128, //.ctype=int32_t + BRIG_TYPE_S64X2 = BRIG_TYPE_S64 | BRIG_TYPE_PACK_128, //.ctype=int64_t + BRIG_TYPE_F16X2 = BRIG_TYPE_F16 | BRIG_TYPE_PACK_32, //.ctype=f16_t + BRIG_TYPE_F16X4 = BRIG_TYPE_F16 | BRIG_TYPE_PACK_64, //.ctype=f16_t + BRIG_TYPE_F16X8 = BRIG_TYPE_F16 | BRIG_TYPE_PACK_128, //.ctype=f16_t + BRIG_TYPE_F32X2 = BRIG_TYPE_F32 | BRIG_TYPE_PACK_64, //.ctype=float + BRIG_TYPE_F32X4 = BRIG_TYPE_F32 | BRIG_TYPE_PACK_128, //.ctype=float + BRIG_TYPE_F64X2 = BRIG_TYPE_F64 | BRIG_TYPE_PACK_128, //.ctype=double + + BRIG_TYPE_U8_ARRAY = BRIG_TYPE_U8 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_U16_ARRAY = BRIG_TYPE_U16 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_U32_ARRAY = BRIG_TYPE_U32 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_U64_ARRAY = BRIG_TYPE_U64 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S8_ARRAY = BRIG_TYPE_S8 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S16_ARRAY = BRIG_TYPE_S16 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S32_ARRAY = BRIG_TYPE_S32 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S64_ARRAY = BRIG_TYPE_S64 
| BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_F16_ARRAY = BRIG_TYPE_F16 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_F32_ARRAY = BRIG_TYPE_F32 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_F64_ARRAY = BRIG_TYPE_F64 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_B8_ARRAY = BRIG_TYPE_B8 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_B16_ARRAY = BRIG_TYPE_B16 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_B32_ARRAY = BRIG_TYPE_B32 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_B64_ARRAY = BRIG_TYPE_B64 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_B128_ARRAY = BRIG_TYPE_B128 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_SAMP_ARRAY = BRIG_TYPE_SAMP | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_ROIMG_ARRAY = BRIG_TYPE_ROIMG | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_WOIMG_ARRAY = BRIG_TYPE_WOIMG | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_RWIMG_ARRAY = BRIG_TYPE_RWIMG | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_SIG32_ARRAY = BRIG_TYPE_SIG32 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_SIG64_ARRAY = BRIG_TYPE_SIG64 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_U8X4_ARRAY = BRIG_TYPE_U8X4 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_U8X8_ARRAY = BRIG_TYPE_U8X8 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_U8X16_ARRAY = BRIG_TYPE_U8X16 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_U16X2_ARRAY = BRIG_TYPE_U16X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_U16X4_ARRAY = BRIG_TYPE_U16X4 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_U16X8_ARRAY = BRIG_TYPE_U16X8 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_U32X2_ARRAY = BRIG_TYPE_U32X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_U32X4_ARRAY = BRIG_TYPE_U32X4 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_U64X2_ARRAY = BRIG_TYPE_U64X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S8X4_ARRAY = BRIG_TYPE_S8X4 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S8X8_ARRAY = BRIG_TYPE_S8X8 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S8X16_ARRAY = BRIG_TYPE_S8X16 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S16X2_ARRAY = BRIG_TYPE_S16X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S16X4_ARRAY = BRIG_TYPE_S16X4 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S16X8_ARRAY = BRIG_TYPE_S16X8 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S32X2_ARRAY = BRIG_TYPE_S32X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S32X4_ARRAY = BRIG_TYPE_S32X4 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_S64X2_ARRAY = BRIG_TYPE_S64X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_F16X2_ARRAY = BRIG_TYPE_F16X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_F16X4_ARRAY = BRIG_TYPE_F16X4 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_F16X8_ARRAY = BRIG_TYPE_F16X8 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_F32X2_ARRAY = BRIG_TYPE_F32X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_F32X4_ARRAY = BRIG_TYPE_F32X4 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + BRIG_TYPE_F64X2_ARRAY = BRIG_TYPE_F64X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print="" + + // Used internally + BRIG_TYPE_INVALID = (unsigned) -1 //.skip +}; + +enum BrigVariableModifierMask { + + //.nodump + + BRIG_VARIABLE_DEFINITION = 1, + BRIG_VARIABLE_CONST = 2 +}; + +enum BrigWidth { + + //.tddef=1 + // + //.print={ s/^BRIG_WIDTH_//; "_width($_)" } + + BRIG_WIDTH_NONE = 0, + BRIG_WIDTH_1 = 1, + 
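As BrigPackedTypeBits above spells out, a BrigType16_t is a small bit field rather than a flat enumeration: bits 0-4 hold the element type, bits 5-6 the packing (32-, 64-, or 128-bit vectors), and bit 7 flags array types, which is why the packed and _ARRAY values are built by OR-ing a base type with BRIG_TYPE_PACK_* or BRIG_TYPE_ARRAY. A minimal sketch of decomposing a type code with those masks (helper names are mine; the generator annotations also produce equivalents such as isArrayType()):

```cpp
#include "arch/hsail/Brig.h"

// Illustrative decomposition of a BrigType16_t using the BrigPackedTypeBits
// layout; the helper names below are not part of this patch.
static unsigned
baseType(Brig::BrigType16_t t)
{
    return t & Brig::BRIG_TYPE_BASE_MASK;      // e.g. BRIG_TYPE_U8
}

static bool
isPackedType(Brig::BrigType16_t t)
{
    return (t & Brig::BRIG_TYPE_PACK_MASK) != Brig::BRIG_TYPE_PACK_NONE;
}

static bool
isArrayTypeCode(Brig::BrigType16_t t)
{
    return (t & Brig::BRIG_TYPE_ARRAY_MASK) != 0;
}

// Worked example: BRIG_TYPE_U8X4 == BRIG_TYPE_U8 | BRIG_TYPE_PACK_32, so
// baseType() returns BRIG_TYPE_U8, isPackedType() is true, and
// isArrayTypeCode() is false.
```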
BRIG_WIDTH_2 = 2, + BRIG_WIDTH_4 = 3, + BRIG_WIDTH_8 = 4, + BRIG_WIDTH_16 = 5, + BRIG_WIDTH_32 = 6, + BRIG_WIDTH_64 = 7, + BRIG_WIDTH_128 = 8, + BRIG_WIDTH_256 = 9, + BRIG_WIDTH_512 = 10, + BRIG_WIDTH_1024 = 11, + BRIG_WIDTH_2048 = 12, + BRIG_WIDTH_4096 = 13, + BRIG_WIDTH_8192 = 14, + BRIG_WIDTH_16384 = 15, + BRIG_WIDTH_32768 = 16, + BRIG_WIDTH_65536 = 17, + BRIG_WIDTH_131072 = 18, + BRIG_WIDTH_262144 = 19, + BRIG_WIDTH_524288 = 20, + BRIG_WIDTH_1048576 = 21, + BRIG_WIDTH_2097152 = 22, + BRIG_WIDTH_4194304 = 23, + BRIG_WIDTH_8388608 = 24, + BRIG_WIDTH_16777216 = 25, + BRIG_WIDTH_33554432 = 26, + BRIG_WIDTH_67108864 = 27, + BRIG_WIDTH_134217728 = 28, + BRIG_WIDTH_268435456 = 29, + BRIG_WIDTH_536870912 = 30, + BRIG_WIDTH_1073741824 = 31, + BRIG_WIDTH_2147483648 = 32, + BRIG_WIDTH_WAVESIZE = 33, + BRIG_WIDTH_ALL = 34, + + BRIG_WIDTH_LAST //.skip +}; + +struct BrigUInt64 { //.isroot //.standalone + uint32_t lo; //.defValue=0 + uint32_t hi; //.defValue=0 + + //+hcode KLASS& operator=(uint64_t rhs); + //+hcode operator uint64_t(); + //+implcode inline KLASS& KLASS::operator=(uint64_t rhs) { lo() = (uint32_t)rhs; hi() = (uint32_t)(rhs >> 32); return *this; } + //+implcode inline KLASS::operator uint64_t() { return ((uint64_t)hi()) << 32 | lo(); } +}; + +struct BrigAluModifier { //.isroot //.standalone + BrigAluModifier8_t allBits; //.defValue=0 + //^^ bool ftz; //.wtype=BitValRef<0> +}; + +struct BrigBase { //.nowrap + uint16_t byteCount; + BrigKind16_t kind; +}; + +//.alias Code:Base { //.generic //.isroot //.section=BRIG_SECTION_INDEX_CODE }; +//.alias Directive:Code { //.generic }; +//.alias Operand:Base { //.generic //.isroot //.section=BRIG_SECTION_INDEX_OPERAND }; + +struct BrigData { + //.nowrap + uint32_t byteCount; + uint8_t bytes[1]; +}; + +struct BrigExecutableModifier { //.isroot //.standalone + BrigExecutableModifier8_t allBits; //.defValue=0 + //^^ bool isDefinition; //.wtype=BitValRef<0> +}; + +struct BrigMemoryModifier { //.isroot //.standalone + BrigMemoryModifier8_t allBits; //.defValue=0 + //^^ bool isConst; //.wtype=BitValRef<0> +}; + +struct BrigSegCvtModifier { //.isroot //.standalone + BrigSegCvtModifier8_t allBits; //.defValue=0 + //^^ bool isNoNull; //.wtype=BitValRef<0> +}; + +struct BrigVariableModifier { //.isroot //.standalone + BrigVariableModifier8_t allBits; //.defValue=0 + + //^^ bool isDefinition; //.wtype=BitValRef<0> + //^^ bool isConst; //.wtype=BitValRef<1> +}; + +struct BrigDirectiveArgBlockEnd { + BrigBase base; +}; + +struct BrigDirectiveArgBlockStart { + BrigBase base; +}; + +struct BrigDirectiveComment { + BrigBase base; + BrigDataOffsetString32_t name; +}; + +struct BrigDirectiveControl { + BrigBase base; + BrigControlDirective16_t control; + uint16_t reserved; //.defValue=0 + BrigDataOffsetOperandList32_t operands; +}; + +struct BrigDirectiveExecutable { //.generic + BrigBase base; + BrigDataOffsetString32_t name; + uint16_t outArgCount; //.defValue=0 + uint16_t inArgCount; //.defValue=0 + BrigCodeOffset32_t firstInArg; + BrigCodeOffset32_t firstCodeBlockEntry; + BrigCodeOffset32_t nextModuleEntry; + BrigExecutableModifier modifier; //.acc=subItem<ExecutableModifier> //.wtype=ExecutableModifier + BrigLinkage8_t linkage; + uint16_t reserved; //.defValue=0 +}; + +//.alias DirectiveKernel:DirectiveExecutable { }; +//.alias DirectiveFunction:DirectiveExecutable { }; +//.alias DirectiveSignature:DirectiveExecutable { }; +//.alias DirectiveIndirectFunction:DirectiveExecutable { }; + +struct BrigDirectiveExtension { + BrigBase base; + BrigDataOffsetString32_t 
name; +}; + +struct BrigDirectiveFbarrier { + BrigBase base; + BrigDataOffsetString32_t name; + BrigVariableModifier modifier; //.acc=subItem<VariableModifier> //.wtype=VariableModifier + BrigLinkage8_t linkage; + uint16_t reserved; //.defValue=0 +}; + +struct BrigDirectiveLabel { + BrigBase base; + BrigDataOffsetString32_t name; +}; + +struct BrigDirectiveLoc { + BrigBase base; + BrigDataOffsetString32_t filename; + uint32_t line; + uint32_t column; //.defValue=1 +}; + +struct BrigDirectiveNone { //.enum=BRIG_KIND_NONE + BrigBase base; +}; + +struct BrigDirectivePragma { + BrigBase base; + BrigDataOffsetOperandList32_t operands; +}; + +struct BrigDirectiveVariable { + BrigBase base; + BrigDataOffsetString32_t name; + BrigOperandOffset32_t init; + BrigType16_t type; + + //+hcode bool isArray(); + //+implcode inline bool KLASS::isArray() { return isArrayType(type()); } + + //+hcode unsigned elementType(); + //+implcode inline unsigned KLASS::elementType() { return isArray()? arrayType2elementType(type()) : type(); } + + BrigSegment8_t segment; + BrigAlignment8_t align; + BrigUInt64 dim; //.acc=subItem<UInt64> //.wtype=UInt64 + BrigVariableModifier modifier; //.acc=subItem<VariableModifier> //.wtype=VariableModifier + BrigLinkage8_t linkage; + BrigAllocation8_t allocation; + uint8_t reserved; //.defValue=0 +}; + +struct BrigDirectiveModule { + BrigBase base; + BrigDataOffsetString32_t name; + BrigVersion32_t hsailMajor; //.wtype=ValRef<uint32_t> + BrigVersion32_t hsailMinor; //.wtype=ValRef<uint32_t> + BrigProfile8_t profile; + BrigMachineModel8_t machineModel; + BrigRound8_t defaultFloatRound; + uint8_t reserved; //.defValue=0 +}; + +struct BrigInstBase { //.wname=Inst //.generic //.parent=BrigCode + BrigBase base; + BrigOpcode16_t opcode; + BrigType16_t type; + BrigDataOffsetOperandList32_t operands; + + //+hcode Operand operand(int index); + //+implcode inline Operand KLASS::operand(int index) { return operands()[index]; } +}; + +struct BrigInstAddr { + BrigInstBase base; + BrigSegment8_t segment; + uint8_t reserved[3]; //.defValue=0 +}; + +struct BrigInstAtomic { + BrigInstBase base; + BrigSegment8_t segment; + BrigMemoryOrder8_t memoryOrder; + BrigMemoryScope8_t memoryScope; + BrigAtomicOperation8_t atomicOperation; + uint8_t equivClass; + uint8_t reserved[3]; //.defValue=0 +}; + +struct BrigInstBasic { + BrigInstBase base; +}; + +struct BrigInstBr { + BrigInstBase base; + BrigWidth8_t width; + uint8_t reserved[3]; //.defValue=0 +}; + +struct BrigInstCmp { + BrigInstBase base; + BrigType16_t sourceType; + BrigAluModifier modifier; //.acc=subItem<AluModifier> //.wtype=AluModifier + BrigCompareOperation8_t compare; + BrigPack8_t pack; + uint8_t reserved[3]; //.defValue=0 +}; + +struct BrigInstCvt { + BrigInstBase base; + BrigType16_t sourceType; + BrigAluModifier modifier; //.acc=subItem<AluModifier> //.wtype=AluModifier + BrigRound8_t round; +}; + +struct BrigInstImage { + BrigInstBase base; + BrigType16_t imageType; + BrigType16_t coordType; + BrigImageGeometry8_t geometry; + uint8_t equivClass; + uint16_t reserved; //.defValue=0 +}; + +struct BrigInstLane { + BrigInstBase base; + BrigType16_t sourceType; + BrigWidth8_t width; + uint8_t reserved; //.defValue=0 +}; + +struct BrigInstMem { + BrigInstBase base; + BrigSegment8_t segment; + BrigAlignment8_t align; + uint8_t equivClass; + BrigWidth8_t width; + BrigMemoryModifier modifier; //.acc=subItem<MemoryModifier> //.wtype=MemoryModifier + uint8_t reserved[3]; //.defValue=0 +}; + +struct BrigInstMemFence { + BrigInstBase base; + 
BrigMemoryOrder8_t memoryOrder; + BrigMemoryScope8_t globalSegmentMemoryScope; + BrigMemoryScope8_t groupSegmentMemoryScope; + BrigMemoryScope8_t imageSegmentMemoryScope; +}; + +struct BrigInstMod { + BrigInstBase base; + BrigAluModifier modifier; //.acc=subItem<AluModifier> //.wtype=AluModifier + BrigRound8_t round; + BrigPack8_t pack; + uint8_t reserved; //.defValue=0 +}; + +struct BrigInstQueryImage { + BrigInstBase base; + BrigType16_t imageType; + BrigImageGeometry8_t geometry; + BrigImageQuery8_t imageQuery; +}; + +struct BrigInstQuerySampler { + BrigInstBase base; + BrigSamplerQuery8_t samplerQuery; + uint8_t reserved[3]; //.defValue=0 +}; + +struct BrigInstQueue { + BrigInstBase base; + BrigSegment8_t segment; + BrigMemoryOrder8_t memoryOrder; + uint16_t reserved; //.defValue=0 +}; + +struct BrigInstSeg { + BrigInstBase base; + BrigSegment8_t segment; + uint8_t reserved[3]; //.defValue=0 +}; + +struct BrigInstSegCvt { + BrigInstBase base; + BrigType16_t sourceType; + BrigSegment8_t segment; + BrigSegCvtModifier modifier; //.acc=subItem<SegCvtModifier> //.wtype=SegCvtModifier +}; + +struct BrigInstSignal { + BrigInstBase base; + BrigType16_t signalType; + BrigMemoryOrder8_t memoryOrder; + BrigAtomicOperation8_t signalOperation; +}; + +struct BrigInstSourceType { + BrigInstBase base; + BrigType16_t sourceType; + uint16_t reserved; //.defValue=0 +}; + +struct BrigOperandAddress { + BrigBase base; + BrigCodeOffset32_t symbol; //.wtype=ItemRef<DirectiveVariable> + BrigOperandOffset32_t reg; //.wtype=ItemRef<OperandRegister> + BrigUInt64 offset; //.acc=subItem<UInt64> //.wtype=UInt64 +}; + +struct BrigOperandAlign { + BrigBase base; + BrigAlignment8_t align; + uint8_t reserved[3]; //.defValue=0 +}; + +struct BrigOperandCodeList { + BrigBase base; + BrigDataOffsetCodeList32_t elements; + + //+hcode unsigned elementCount(); + //+implcode inline unsigned KLASS::elementCount() { return elements().size(); } + //+hcode Code elements(int index); + //+implcode inline Code KLASS::elements(int index) { return elements()[index]; } +}; + +struct BrigOperandCodeRef { + BrigBase base; + BrigCodeOffset32_t ref; +}; + +struct BrigOperandConstantBytes { + BrigBase base; + BrigType16_t type; //.defValue=0 + uint16_t reserved; //.defValue=0 + BrigDataOffsetString32_t bytes; +}; + +struct BrigOperandConstantOperandList { + BrigBase base; + BrigType16_t type; + uint16_t reserved; //.defValue=0 + BrigDataOffsetOperandList32_t elements; + + //+hcode unsigned elementCount(); + //+implcode inline unsigned KLASS::elementCount() { return elements().size(); } + //+hcode Operand elements(int index); + //+implcode inline Operand KLASS::elements(int index) { return elements()[index]; } +}; + +struct BrigOperandConstantImage { + BrigBase base; + BrigType16_t type; + BrigImageGeometry8_t geometry; + BrigImageChannelOrder8_t channelOrder; + BrigImageChannelType8_t channelType; + uint8_t reserved[3]; //.defValue=0 + BrigUInt64 width; //.acc=subItem<UInt64> //.wtype=UInt64 + BrigUInt64 height; //.acc=subItem<UInt64> //.wtype=UInt64 + BrigUInt64 depth; //.acc=subItem<UInt64> //.wtype=UInt64 + BrigUInt64 array; //.acc=subItem<UInt64> //.wtype=UInt64 +}; + +struct BrigOperandOperandList { + BrigBase base; + BrigDataOffsetOperandList32_t elements; + + //+hcode unsigned elementCount(); + //+implcode inline unsigned KLASS::elementCount() { return elements().size(); } + //+hcode Operand elements(int index); + //+implcode inline Operand KLASS::elements(int index) { return elements()[index]; } +}; + +struct BrigOperandRegister { + 
BrigBase base; + BrigRegisterKind16_t regKind; + uint16_t regNum; +}; + +struct BrigOperandConstantSampler { + BrigBase base; + BrigType16_t type; + BrigSamplerCoordNormalization8_t coord; + BrigSamplerFilter8_t filter; + BrigSamplerAddressing8_t addressing; + uint8_t reserved[3]; //.defValue=0 +}; + +struct BrigOperandString { + BrigBase base; + BrigDataOffsetString32_t string; +}; + +struct BrigOperandWavesize { + BrigBase base; +}; + +//.ignore{ + +enum BrigExceptionsMask { + BRIG_EXCEPTIONS_INVALID_OPERATION = 1 << 0, + BRIG_EXCEPTIONS_DIVIDE_BY_ZERO = 1 << 1, + BRIG_EXCEPTIONS_OVERFLOW = 1 << 2, + BRIG_EXCEPTIONS_UNDERFLOW = 1 << 3, + BRIG_EXCEPTIONS_INEXACT = 1 << 4, + + BRIG_EXCEPTIONS_FIRST_USER_DEFINED = 1 << 16 +}; + +struct BrigSectionHeader { + uint64_t byteCount; + uint32_t headerByteCount; + uint32_t nameLength; + uint8_t name[1]; +}; + +#define MODULE_IDENTIFICATION_LENGTH (8) + +struct BrigModuleHeader { + char identification[MODULE_IDENTIFICATION_LENGTH]; + BrigVersion32_t brigMajor; + BrigVersion32_t brigMinor; + uint64_t byteCount; + uint8_t hash[64]; + uint32_t reserved; + uint32_t sectionCount; + uint64_t sectionIndex; +}; + +typedef BrigModuleHeader* BrigModule_t; + +#endif // defined(INCLUDED_BRIG_H) +//} diff --git a/src/arch/hsail/SConscript b/src/arch/hsail/SConscript new file mode 100644 index 000000000..3455823a6 --- /dev/null +++ b/src/arch/hsail/SConscript @@ -0,0 +1,54 @@ +# -*- mode:python -*- + +# Copyright (c) 2015 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
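The BrigSectionHeader and BrigModuleHeader structs above fix the on-disk layout of a BRIG module that a loader has to walk. As a rough illustration only, here is a minimal Python sketch that unpacks a module header laid out exactly as declared; the tight little-endian packing and the b'HSA BRIG' identification string are assumptions of the sketch, not something this header states:

    # Illustrative only: unpack a BrigModuleHeader with the field order
    # declared above. The packing and the expected magic are assumed.
    import struct

    # char[8] identification, u32 brigMajor, u32 brigMinor, u64 byteCount,
    # u8[64] hash, u32 reserved, u32 sectionCount, u64 sectionIndex
    MODULE_HEADER_FMT = '<8sIIQ64sIIQ'
    MODULE_HEADER_SIZE = struct.calcsize(MODULE_HEADER_FMT)  # 104 bytes

    def read_module_header(blob):
        ident, major, minor, nbytes, sha, _rsvd, nsec, sec_index = \
            struct.unpack_from(MODULE_HEADER_FMT, blob, 0)
        assert ident == b'HSA BRIG', 'not a BRIG module (assumed magic)'
        return {'brigMajor': major, 'brigMinor': minor, 'byteCount': nbytes,
                'sectionCount': nsec, 'sectionIndex': sec_index}

In the model itself this parsing is done in C++ (see the BrigObject references later in this patch), not by a script; the sketch only shows how the declared fields line up.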
+# +# Author: Anthony Gutierrez +# + +Import('*') + +if not env['BUILD_GPU']: + Return() + +if env['TARGET_GPU_ISA'] == 'hsail': + env.Command(['insts/gen_decl.hh', 'gpu_decoder.cc', 'insts/gen_exec.cc'], + 'gen.py', '$SOURCE $TARGETS') + + Source('generic_types.cc') + Source('gpu_decoder.cc') + Source('insts/branch.cc') + Source('insts/gen_exec.cc') + Source('insts/gpu_static_inst.cc') + Source('insts/main.cc') + Source('insts/pseudo_inst.cc') + Source('insts/mem.cc') + Source('operand.cc') diff --git a/src/arch/hsail/SConsopts b/src/arch/hsail/SConsopts new file mode 100644 index 000000000..641963c82 --- /dev/null +++ b/src/arch/hsail/SConsopts @@ -0,0 +1,40 @@ +# -*- mode:python -*- + +# +# Copyright (c) 2015 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +# Author: Anthony Gutierrez +# + +Import('*') + +all_gpu_isa_list.append('hsail') diff --git a/src/arch/hsail/gen.py b/src/arch/hsail/gen.py new file mode 100755 index 000000000..f2996019b --- /dev/null +++ b/src/arch/hsail/gen.py @@ -0,0 +1,806 @@ +#! /usr/bin/python + +# +# Copyright (c) 2015 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +# Author: Steve Reinhardt +# + +import sys, re + +from m5.util import code_formatter + +if len(sys.argv) != 4: + print "Error: need 3 args (file names)" + sys.exit(0) + +header_code = code_formatter() +decoder_code = code_formatter() +exec_code = code_formatter() + +############### +# +# Generate file prologs (includes etc.) +# +############### + +header_code(''' +#include "arch/hsail/insts/decl.hh" +#include "base/bitfield.hh" +#include "gpu-compute/hsail_code.hh" +#include "gpu-compute/wavefront.hh" + +namespace HsailISA +{ +''') +header_code.indent() + +decoder_code(''' +#include "arch/hsail/gpu_decoder.hh" +#include "arch/hsail/insts/branch.hh" +#include "arch/hsail/insts/decl.hh" +#include "arch/hsail/insts/gen_decl.hh" +#include "arch/hsail/insts/mem.hh" +#include "arch/hsail/insts/mem_impl.hh" +#include "gpu-compute/brig_object.hh" + +namespace HsailISA +{ + std::vector<GPUStaticInst*> Decoder::decodedInsts; + + GPUStaticInst* + Decoder::decode(MachInst machInst) + { + using namespace Brig; + + const BrigInstBase *ib = machInst.brigInstBase; + const BrigObject *obj = machInst.brigObj; + + switch(ib->opcode) { +''') +decoder_code.indent() +decoder_code.indent() + +exec_code(''' +#include "arch/hsail/insts/gen_decl.hh" +#include "base/intmath.hh" + +namespace HsailISA +{ +''') +exec_code.indent() + +############### +# +# Define code templates for class declarations (for header file) +# +############### + +# Basic header template for an instruction with no template parameters. +header_template_nodt = ''' +class $class_name : public $base_class +{ + public: + typedef $base_class Base; + + $class_name(const Brig::BrigInstBase *ib, const BrigObject *obj) + : Base(ib, obj, "$opcode") + { + } + + void execute(GPUDynInstPtr gpuDynInst); +}; + +''' + +# Basic header template for an instruction with a single DataType +# template parameter. +header_template_1dt = ''' +template<typename DataType> +class $class_name : public $base_class<DataType> +{ + public: + typedef $base_class<DataType> Base; + typedef typename DataType::CType CType; + + $class_name(const Brig::BrigInstBase *ib, const BrigObject *obj) + : Base(ib, obj, "$opcode") + { + } + + void execute(GPUDynInstPtr gpuDynInst); +}; + +''' + +header_template_1dt_noexec = ''' +template<typename DataType> +class $class_name : public $base_class<DataType> +{ + public: + typedef $base_class<DataType> Base; + typedef typename DataType::CType CType; + + $class_name(const Brig::BrigInstBase *ib, const BrigObject *obj) + : Base(ib, obj, "$opcode") + { + } +}; + +''' + +# Same as header_template_1dt, except the base class has a second +# template parameter NumSrcOperands to allow a variable number of +# source operands. 
Note that since this is implemented with an array, +# it only works for instructions where all sources are of the same +# type (like most arithmetics). +header_template_1dt_varsrcs = ''' +template<typename DataType> +class $class_name : public $base_class<DataType, $num_srcs> +{ + public: + typedef $base_class<DataType, $num_srcs> Base; + typedef typename DataType::CType CType; + + $class_name(const Brig::BrigInstBase *ib, const BrigObject *obj) + : Base(ib, obj, "$opcode") + { + } + + void execute(GPUDynInstPtr gpuDynInst); +}; + +''' + +# Header template for instruction with two DataType template +# parameters, one for the dest and one for the source. This is used +# by compare and convert. +header_template_2dt = ''' +template<typename DestDataType, class SrcDataType> +class $class_name : public $base_class<DestDataType, SrcDataType> +{ + public: + typedef $base_class<DestDataType, SrcDataType> Base; + typedef typename DestDataType::CType DestCType; + typedef typename SrcDataType::CType SrcCType; + + $class_name(const Brig::BrigInstBase *ib, const BrigObject *obj) + : Base(ib, obj, "$opcode") + { + } + + void execute(GPUDynInstPtr gpuDynInst); +}; + +''' + +header_templates = { + 'ArithInst': header_template_1dt_varsrcs, + 'CmovInst': header_template_1dt, + 'ClassInst': header_template_1dt, + 'ShiftInst': header_template_1dt, + 'ExtractInsertInst': header_template_1dt, + 'CmpInst': header_template_2dt, + 'CvtInst': header_template_2dt, + 'LdInst': '', + 'StInst': '', + 'SpecialInstNoSrc': header_template_nodt, + 'SpecialInst1Src': header_template_nodt, + 'SpecialInstNoSrcNoDest': '', +} + +############### +# +# Define code templates for exec functions +# +############### + +# exec function body +exec_template_nodt_nosrc = ''' +void +$class_name::execute(GPUDynInstPtr gpuDynInst) +{ + Wavefront *w = gpuDynInst->wavefront(); + + typedef Base::DestCType DestCType; + + const VectorMask &mask = w->get_pred(); + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + DestCType dest_val = $expr; + this->dest.set(w, lane, dest_val); + } + } +} + +''' + +exec_template_nodt_1src = ''' +void +$class_name::execute(GPUDynInstPtr gpuDynInst) +{ + Wavefront *w = gpuDynInst->wavefront(); + + typedef Base::DestCType DestCType; + typedef Base::SrcCType SrcCType; + + const VectorMask &mask = w->get_pred(); + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + SrcCType src_val0 = this->src0.get<SrcCType>(w, lane); + DestCType dest_val = $expr; + + this->dest.set(w, lane, dest_val); + } + } +} + +''' + +exec_template_1dt_varsrcs = ''' +template<typename DataType> +void +$class_name<DataType>::execute(GPUDynInstPtr gpuDynInst) +{ + Wavefront *w = gpuDynInst->wavefront(); + + const VectorMask &mask = w->get_pred(); + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + CType dest_val; + if ($dest_is_src_flag) { + dest_val = this->dest.template get<CType>(w, lane); + } + + CType src_val[$num_srcs]; + + for (int i = 0; i < $num_srcs; ++i) { + src_val[i] = this->src[i].template get<CType>(w, lane); + } + + dest_val = (CType)($expr); + + this->dest.set(w, lane, dest_val); + } + } +} + +''' + +exec_template_1dt_3srcs = ''' +template<typename DataType> +void +$class_name<DataType>::execute(GPUDynInstPtr gpuDynInst) +{ + Wavefront *w = gpuDynInst->wavefront(); + + typedef typename Base::Src0CType Src0T; + typedef typename Base::Src1CType Src1T; + typedef typename Base::Src2CType Src2T; + + const VectorMask &mask = w->get_pred(); + + for (int lane = 0; lane < VSZ; ++lane) { + if 
(mask[lane]) { + CType dest_val; + + if ($dest_is_src_flag) { + dest_val = this->dest.template get<CType>(w, lane); + } + + Src0T src_val0 = this->src0.template get<Src0T>(w, lane); + Src1T src_val1 = this->src1.template get<Src1T>(w, lane); + Src2T src_val2 = this->src2.template get<Src2T>(w, lane); + + dest_val = $expr; + + this->dest.set(w, lane, dest_val); + } + } +} + +''' + +exec_template_1dt_2src_1dest = ''' +template<typename DataType> +void +$class_name<DataType>::execute(GPUDynInstPtr gpuDynInst) +{ + Wavefront *w = gpuDynInst->wavefront(); + + typedef typename Base::DestCType DestT; + typedef CType Src0T; + typedef typename Base::Src1CType Src1T; + + const VectorMask &mask = w->get_pred(); + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + DestT dest_val; + if ($dest_is_src_flag) { + dest_val = this->dest.template get<DestT>(w, lane); + } + Src0T src_val0 = this->src0.template get<Src0T>(w, lane); + Src1T src_val1 = this->src1.template get<Src1T>(w, lane); + + dest_val = $expr; + + this->dest.set(w, lane, dest_val); + } + } +} + +''' + +exec_template_shift = ''' +template<typename DataType> +void +$class_name<DataType>::execute(GPUDynInstPtr gpuDynInst) +{ + Wavefront *w = gpuDynInst->wavefront(); + + const VectorMask &mask = w->get_pred(); + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + CType dest_val; + + if ($dest_is_src_flag) { + dest_val = this->dest.template get<CType>(w, lane); + } + + CType src_val0 = this->src0.template get<CType>(w, lane); + uint32_t src_val1 = this->src1.template get<uint32_t>(w, lane); + + dest_val = $expr; + + this->dest.set(w, lane, dest_val); + } + } +} + +''' + +exec_template_2dt = ''' +template<typename DestDataType, class SrcDataType> +void +$class_name<DestDataType, SrcDataType>::execute(GPUDynInstPtr gpuDynInst) +{ + Wavefront *w = gpuDynInst->wavefront(); + + const VectorMask &mask = w->get_pred(); + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + DestCType dest_val; + SrcCType src_val[$num_srcs]; + + for (int i = 0; i < $num_srcs; ++i) { + src_val[i] = this->src[i].template get<SrcCType>(w, lane); + } + + dest_val = $expr; + + this->dest.set(w, lane, dest_val); + } + } +} + +''' + +exec_templates = { + 'ArithInst': exec_template_1dt_varsrcs, + 'CmovInst': exec_template_1dt_3srcs, + 'ExtractInsertInst': exec_template_1dt_3srcs, + 'ClassInst': exec_template_1dt_2src_1dest, + 'CmpInst': exec_template_2dt, + 'CvtInst': exec_template_2dt, + 'LdInst': '', + 'StInst': '', + 'SpecialInstNoSrc': exec_template_nodt_nosrc, + 'SpecialInst1Src': exec_template_nodt_1src, + 'SpecialInstNoSrcNoDest': '', +} + +############### +# +# Define code templates for the decoder cases +# +############### + +# decode template for nodt-opcode case +decode_nodt_template = ''' + case BRIG_OPCODE_$brig_opcode_upper: return $constructor(ib, obj);''' + +decode_case_prolog_class_inst = ''' + case BRIG_OPCODE_$brig_opcode_upper: + { + //const BrigOperandBase *baseOp = obj->getOperand(ib->operands[1]); + BrigType16_t type = ((BrigInstSourceType*)ib)->sourceType; + //switch (baseOp->kind) { + // case BRIG_OPERAND_REG: + // type = ((const BrigOperandReg*)baseOp)->type; + // break; + // case BRIG_OPERAND_IMMED: + // type = ((const BrigOperandImmed*)baseOp)->type; + // break; + // default: + // fatal("CLASS unrecognized kind of operand %d\\n", + // baseOp->kind); + //} + switch (type) {''' + +# common prolog for 1dt- or 2dt-opcode case: switch on data type +decode_case_prolog = ''' + case BRIG_OPCODE_$brig_opcode_upper: + { + 
switch (ib->type) {''' + +# single-level decode case entry (for 1dt opcodes) +decode_case_entry = \ +' case BRIG_TYPE_$type_name: return $constructor(ib, obj);' + +decode_store_prolog = \ +' case BRIG_TYPE_$type_name: {' + +decode_store_case_epilog = ''' + }''' + +decode_store_case_entry = \ +' return $constructor(ib, obj);' + +# common epilog for type switch +decode_case_epilog = ''' + default: fatal("$brig_opcode_upper: unrecognized type %d\\n", + ib->type); + } + } + break;''' + +# Additional templates for nested decode on a second type field (for +# compare and convert). These are used in place of the +# decode_case_entry template to create a second-level switch on on the +# second type field inside each case of the first-level type switch. +# Because the name and location of the second type can vary, the Brig +# instruction type must be provided in $brig_type, and the name of the +# second type field must be provided in $type_field. +decode_case2_prolog = ''' + case BRIG_TYPE_$type_name: + switch (((Brig$brig_type*)ib)->$type2_field) {''' + +decode_case2_entry = \ +' case BRIG_TYPE_$type2_name: return $constructor(ib, obj);' + +decode_case2_epilog = ''' + default: fatal("$brig_opcode_upper: unrecognized $type2_field %d\\n", + ((Brig$brig_type*)ib)->$type2_field); + } + break;''' + +# Figure out how many source operands an expr needs by looking for the +# highest-numbered srcN value referenced. Since sources are numbered +# starting at 0, the return value is N+1. +def num_src_operands(expr): + if expr.find('src2') != -1: + return 3 + elif expr.find('src1') != -1: + return 2 + elif expr.find('src0') != -1: + return 1 + else: + return 0 + +############### +# +# Define final code generation methods +# +# The gen_nodt, and gen_1dt, and gen_2dt methods are the interface for +# generating actual instructions. +# +############### + +# Generate class declaration, exec function, and decode switch case +# for an brig_opcode with a single-level type switch. The 'types' +# parameter is a list or tuple of types for which the instruction +# should be instantiated. +def gen(brig_opcode, types=None, expr=None, base_class='ArithInst', + type2_info=None, constructor_prefix='new ', is_store=False): + brig_opcode_upper = brig_opcode.upper() + class_name = brig_opcode + opcode = class_name.lower() + + if base_class == 'ArithInst': + # note that expr must be provided with ArithInst so we can + # derive num_srcs for the template + assert expr + + if expr: + # Derive several bits of info from expr. If expr is not used, + # this info will be irrelevant. 
+ num_srcs = num_src_operands(expr) + # if the RHS expression includes 'dest', then we're doing an RMW + # on the reg and we need to treat it like a source + dest_is_src = expr.find('dest') != -1 + dest_is_src_flag = str(dest_is_src).lower() # for C++ + if base_class in ['ShiftInst']: + expr = re.sub(r'\bsrc(\d)\b', r'src_val\1', expr) + elif base_class in ['ArithInst', 'CmpInst', 'CvtInst']: + expr = re.sub(r'\bsrc(\d)\b', r'src_val[\1]', expr) + else: + expr = re.sub(r'\bsrc(\d)\b', r'src_val\1', expr) + expr = re.sub(r'\bdest\b', r'dest_val', expr) + + # Strip template arguments off of base class before looking up + # appropriate templates + base_class_base = re.sub(r'<.*>$', '', base_class) + header_code(header_templates[base_class_base]) + + if base_class.startswith('SpecialInst'): + exec_code(exec_templates[base_class_base]) + elif base_class.startswith('ShiftInst'): + header_code(exec_template_shift) + else: + header_code(exec_templates[base_class_base]) + + if not types or isinstance(types, str): + # Just a single type + constructor = constructor_prefix + class_name + decoder_code(decode_nodt_template) + else: + # multiple types, need at least one level of decode + if brig_opcode == 'Class': + decoder_code(decode_case_prolog_class_inst) + else: + decoder_code(decode_case_prolog) + if not type2_info: + if is_store == False: + # single list of types, to basic one-level decode + for type_name in types: + full_class_name = '%s<%s>' % (class_name, type_name.upper()) + constructor = constructor_prefix + full_class_name + decoder_code(decode_case_entry) + else: + # single list of types, to basic one-level decode + for type_name in types: + decoder_code(decode_store_prolog) + type_size = int(re.findall(r'[0-9]+', type_name)[0]) + src_size = 32 + type_type = type_name[0] + full_class_name = '%s<%s,%s>' % (class_name, \ + type_name.upper(), \ + '%s%d' % \ + (type_type.upper(), \ + type_size)) + constructor = constructor_prefix + full_class_name + decoder_code(decode_store_case_entry) + decoder_code(decode_store_case_epilog) + else: + # need secondary type switch (convert, compare) + # unpack extra info on second switch + (type2_field, types2) = type2_info + brig_type = 'Inst%s' % brig_opcode + for type_name in types: + decoder_code(decode_case2_prolog) + fmt = '%s<%s,%%s>' % (class_name, type_name.upper()) + for type2_name in types2: + full_class_name = fmt % type2_name.upper() + constructor = constructor_prefix + full_class_name + decoder_code(decode_case2_entry) + + decoder_code(decode_case2_epilog) + + decoder_code(decode_case_epilog) + +############### +# +# Generate instructions +# +############### + +# handy abbreviations for common sets of types + +# arithmetic ops are typically defined only on 32- and 64-bit sizes +arith_int_types = ('S32', 'U32', 'S64', 'U64') +arith_float_types = ('F32', 'F64') +arith_types = arith_int_types + arith_float_types + +bit_types = ('B1', 'B32', 'B64') + +all_int_types = ('S8', 'U8', 'S16', 'U16') + arith_int_types + +# I think you might be able to do 'f16' memory ops too, but we'll +# ignore them for now. 
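To make the generator's flow concrete: gen() above counts source operands from the highest srcN mentioned in the expression, rewrites those names into the src_val forms used by the exec templates, and then splices the class name and expression into the header/exec/decoder templates. A self-contained sketch of that expansion step, using string.Template as a stand-in for gem5's code_formatter; the stub template and names are illustrative, not part of gen.py:

    # Sketch of the gen.py expansion step. string.Template stands in for
    # m5.util.code_formatter; the exec stub below is illustrative only.
    import re
    from string import Template

    exec_stub = Template('''
    template<typename DataType>
    void ${class_name}<DataType>::execute(GPUDynInstPtr gpuDynInst)
    {
        // per-lane evaluation of: ${expr}
    }
    ''')

    def expand(brig_opcode, expr):
        nums = re.findall(r'\bsrc(\d)\b', expr)
        num_srcs = 1 + max(int(n) for n in nums) if nums else 0
        # the rewrite the real generator applies for ArithInst-style bases
        body = re.sub(r'\bsrc(\d)\b', r'src_val[\1]', expr)
        return num_srcs, exec_stub.substitute(class_name=brig_opcode, expr=body)

    n, code = expand('Add', 'src0 + src1')
    print(n)     # 2 source operands
    print(code)  # stub with 'src_val[0] + src_val[1]' spliced in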
+mem_types = all_int_types + arith_float_types +mem_atom_types = all_int_types + ('B32', 'B64') + +##### Arithmetic & logical operations +gen('Add', arith_types, 'src0 + src1') +gen('Sub', arith_types, 'src0 - src1') +gen('Mul', arith_types, 'src0 * src1') +gen('Div', arith_types, 'src0 / src1') +gen('Min', arith_types, 'std::min(src0, src1)') +gen('Max', arith_types, 'std::max(src0, src1)') +gen('Gcnmin', arith_types, 'std::min(src0, src1)') + +gen('CopySign', arith_float_types, + 'src1 < 0 ? -std::abs(src0) : std::abs(src0)') +gen('Sqrt', arith_float_types, 'sqrt(src0)') +gen('Floor', arith_float_types, 'floor(src0)') + +# "fast" sqrt... same as slow for us +gen('Nsqrt', arith_float_types, 'sqrt(src0)') +gen('Nrsqrt', arith_float_types, '1.0/sqrt(src0)') +gen('Nrcp', arith_float_types, '1.0/src0') +gen('Fract', arith_float_types, + '(src0 >= 0.0)?(src0-floor(src0)):(floor(src0)-src0)') + +gen('Ncos', arith_float_types, 'cos(src0)'); +gen('Nsin', arith_float_types, 'sin(src0)'); + +gen('And', bit_types, 'src0 & src1') +gen('Or', bit_types, 'src0 | src1') +gen('Xor', bit_types, 'src0 ^ src1') + +gen('Bitselect', bit_types, '(src1 & src0) | (src2 & ~src0)') +gen('Firstbit',bit_types, 'firstbit(src0)') +gen('Popcount', ('B32', 'B64'), '__builtin_popcount(src0)') + +gen('Shl', arith_int_types, 'src0 << (unsigned)src1', 'ShiftInst') +gen('Shr', arith_int_types, 'src0 >> (unsigned)src1', 'ShiftInst') + +# gen('Mul_hi', types=('s32','u32', '??')) +# gen('Mul24', types=('s32','u32', '??')) +gen('Rem', arith_int_types, 'src0 - ((src0 / src1) * src1)') + +gen('Abs', arith_types, 'std::abs(src0)') +gen('Neg', arith_types, '-src0') + +gen('Mov', bit_types, 'src0') +gen('Not', bit_types, 'heynot(src0)') + +# mad and fma differ only in rounding behavior, which we don't emulate +# also there's an integer form of mad, but not of fma +gen('Mad', arith_types, 'src0 * src1 + src2') +gen('Fma', arith_float_types, 'src0 * src1 + src2') + +#native floating point operations +gen('Nfma', arith_float_types, 'src0 * src1 + src2') + +gen('Cmov', bit_types, 'src0 ? 
src1 : src2', 'CmovInst') +gen('BitAlign', bit_types, '(src0 << src2)|(src1 >> (32 - src2))') +gen('ByteAlign', bit_types, '(src0 << 8 * src2)|(src1 >> (32 - 8 * src2))') + +# see base/bitfield.hh +gen('BitExtract', arith_int_types, 'bits(src0, src1, src1 + src2 - 1)', + 'ExtractInsertInst') + +gen('BitInsert', arith_int_types, 'insertBits(dest, src1, src2, src0)', + 'ExtractInsertInst') + +##### Compare +gen('Cmp', ('B1', 'S32', 'U32', 'F32'), 'compare(src0, src1, this->cmpOp)', + 'CmpInst', ('sourceType', arith_types + bit_types)) +gen('Class', arith_float_types, 'fpclassify(src0,src1)','ClassInst') + +##### Conversion + +# Conversion operations are only defined on B1, not B32 or B64 +cvt_types = ('B1',) + mem_types + +gen('Cvt', cvt_types, 'src0', 'CvtInst', ('sourceType', cvt_types)) + + +##### Load & Store +gen('Lda', mem_types, base_class = 'LdInst', constructor_prefix='decode') +gen('Ld', mem_types, base_class = 'LdInst', constructor_prefix='decode') +gen('St', mem_types, base_class = 'StInst', constructor_prefix='decode', + is_store=True) +gen('Atomic', mem_atom_types, base_class='StInst', constructor_prefix='decode') +gen('AtomicNoRet', mem_atom_types, base_class='StInst', + constructor_prefix='decode') + +gen('Cbr', base_class = 'LdInst', constructor_prefix='decode') +gen('Br', base_class = 'LdInst', constructor_prefix='decode') + +##### Special operations +def gen_special(brig_opcode, expr, dest_type='U32'): + num_srcs = num_src_operands(expr) + if num_srcs == 0: + base_class = 'SpecialInstNoSrc<%s>' % dest_type + elif num_srcs == 1: + base_class = 'SpecialInst1Src<%s>' % dest_type + else: + assert false + + gen(brig_opcode, None, expr, base_class) + +gen_special('WorkItemId', 'w->workitemid[src0][lane]') +gen_special('WorkItemAbsId', + 'w->workitemid[src0][lane] + (w->workgroupid[src0] * w->workgroupsz[src0])') +gen_special('WorkGroupId', 'w->workgroupid[src0]') +gen_special('WorkGroupSize', 'w->workgroupsz[src0]') +gen_special('CurrentWorkGroupSize', 'w->workgroupsz[src0]') +gen_special('GridSize', 'w->gridsz[src0]') +gen_special('GridGroups', + 'divCeil(w->gridsz[src0],w->workgroupsz[src0])') +gen_special('LaneId', 'lane') +gen_special('WaveId', 'w->dynwaveid') +gen_special('Clock', 'w->computeUnit->shader->tick_cnt', 'U64') + +# gen_special('CU'', ') + +gen('Ret', base_class='SpecialInstNoSrcNoDest') +gen('Barrier', base_class='SpecialInstNoSrcNoDest') +gen('MemFence', base_class='SpecialInstNoSrcNoDest') + +# Map magic instructions to the BrigSyscall opcode +# Magic instructions are defined in magic.hh +# +# In the future, real HSA kernel system calls can be implemented and coexist +# with magic instructions. 
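Each gen() call above ultimately produces an execute() body shaped like exec_template_1dt_varsrcs: loop over the VSZ lanes of the wavefront, evaluate the expression only where the predicate mask is set, and write the result back per lane. A toy Python model of that predicated evaluation; VSZ, the list-based registers and the mask handling are simplifications for illustration, not the model's real data structures:

    # Toy model of the generated execute() loops: apply an operation
    # lane-by-lane under an execution mask.
    VSZ = 8  # wavefront width used for this sketch only

    def execute_masked(op, dest, srcs, mask):
        for lane in range(VSZ):
            if mask[lane]:
                dest[lane] = op(*(s[lane] for s in srcs))
        return dest

    src0 = [1, 2, 3, 4, 5, 6, 7, 8]
    src1 = [10] * VSZ
    mask = [True, False, True, True, False, True, False, True]
    dest = [0] * VSZ

    execute_masked(lambda a, b: a + b, dest, [src0, src1], mask)
    print(dest)  # inactive lanes keep their old value:
                 # [11, 0, 13, 14, 0, 16, 0, 18]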
+gen('Call', base_class='SpecialInstNoSrcNoDest') + +############### +# +# Generate file epilogs +# +############### +header_code.dedent() +header_code(''' +} // namespace HsailISA +''') + +# close off main decode switch +decoder_code.dedent() +decoder_code.dedent() +decoder_code(''' + default: fatal("unrecognized Brig opcode %d\\n", ib->opcode); + } // end switch(ib->opcode) + } // end decode() +} // namespace HsailISA +''') + +exec_code.dedent() +exec_code(''' +} // namespace HsailISA +''') + +############### +# +# Output accumulated code to files +# +############### +header_code.write(sys.argv[1]) +decoder_code.write(sys.argv[2]) +exec_code.write(sys.argv[3]) diff --git a/src/arch/hsail/generic_types.cc b/src/arch/hsail/generic_types.cc new file mode 100644 index 000000000..0cd55d1d5 --- /dev/null +++ b/src/arch/hsail/generic_types.cc @@ -0,0 +1,47 @@ +#include "arch/hsail/generic_types.hh" +#include "base/misc.hh" + +using namespace Brig; + +namespace HsailISA +{ + Enums::GenericMemoryOrder + getGenericMemoryOrder(BrigMemoryOrder brig_memory_order) + { + switch(brig_memory_order) { + case BRIG_MEMORY_ORDER_NONE: + return Enums::MEMORY_ORDER_NONE; + case BRIG_MEMORY_ORDER_RELAXED: + return Enums::MEMORY_ORDER_RELAXED; + case BRIG_MEMORY_ORDER_SC_ACQUIRE: + return Enums::MEMORY_ORDER_SC_ACQUIRE; + case BRIG_MEMORY_ORDER_SC_RELEASE: + return Enums::MEMORY_ORDER_SC_RELEASE; + case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE: + return Enums::MEMORY_ORDER_SC_ACQUIRE_RELEASE; + default: + fatal("HsailISA::MemInst::getGenericMemoryOrder -> ", + "bad BrigMemoryOrder\n"); + } + } + + Enums::GenericMemoryScope + getGenericMemoryScope(BrigMemoryScope brig_memory_scope) + { + switch(brig_memory_scope) { + case BRIG_MEMORY_SCOPE_NONE: + return Enums::MEMORY_SCOPE_NONE; + case BRIG_MEMORY_SCOPE_WORKITEM: + return Enums::MEMORY_SCOPE_WORKITEM; + case BRIG_MEMORY_SCOPE_WORKGROUP: + return Enums::MEMORY_SCOPE_WORKGROUP; + case BRIG_MEMORY_SCOPE_AGENT: + return Enums::MEMORY_SCOPE_DEVICE; + case BRIG_MEMORY_SCOPE_SYSTEM: + return Enums::MEMORY_SCOPE_SYSTEM; + default: + fatal("HsailISA::MemInst::getGenericMemoryScope -> ", + "bad BrigMemoryScope\n"); + } + } +} // namespace HsailISA diff --git a/src/arch/hsail/generic_types.hh b/src/arch/hsail/generic_types.hh new file mode 100644 index 000000000..50e430bef --- /dev/null +++ b/src/arch/hsail/generic_types.hh @@ -0,0 +1,16 @@ +#ifndef __ARCH_HSAIL_GENERIC_TYPES_HH__ +#define __ARCH_HSAIL_GENERIC_TYPES_HH__ + +#include "arch/hsail/Brig.h" +#include "enums/GenericMemoryOrder.hh" +#include "enums/GenericMemoryScope.hh" + +namespace HsailISA +{ + Enums::GenericMemoryOrder + getGenericMemoryOrder(Brig::BrigMemoryOrder brig_memory_order); + Enums::GenericMemoryScope + getGenericMemoryScope(Brig::BrigMemoryScope brig_memory_scope); +} // namespace HsailISA + +#endif // __ARCH_HSAIL_GENERIC_TYPES_HH__ diff --git a/src/arch/hsail/gpu_decoder.hh b/src/arch/hsail/gpu_decoder.hh new file mode 100644 index 000000000..98a689664 --- /dev/null +++ b/src/arch/hsail/gpu_decoder.hh @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#ifndef __ARCH_HSAIL_GPU_DECODER_HH__ +#define __ARCH_HSAIL_GPU_DECODER_HH__ + +#include <vector> + +#include "arch/hsail/gpu_types.hh" + +class BrigObject; +class GPUStaticInst; + +namespace Brig +{ + class BrigInstBase; +} + +namespace HsailISA +{ + class Decoder + { + public: + GPUStaticInst* decode(MachInst machInst); + + GPUStaticInst* + decode(RawMachInst inst) + { + return inst < decodedInsts.size() ? decodedInsts.at(inst) : nullptr; + } + + RawMachInst + saveInst(GPUStaticInst *decodedInst) + { + decodedInsts.push_back(decodedInst); + + return decodedInsts.size() - 1; + } + + private: + static std::vector<GPUStaticInst*> decodedInsts; + }; +} // namespace HsailISA + +#endif // __ARCH_HSAIL_GPU_DECODER_HH__ diff --git a/src/arch/hsail/gpu_types.hh b/src/arch/hsail/gpu_types.hh new file mode 100644 index 000000000..4b3a66a9a --- /dev/null +++ b/src/arch/hsail/gpu_types.hh @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#ifndef __ARCH_HSAIL_GPU_TYPES_HH__ +#define __ARCH_HSAIL_GPU_TYPES_HH__ + +#include <cstdint> + +namespace Brig +{ + class BrigInstBase; +} + +class BrigObject; + +namespace HsailISA +{ + // A raw machine instruction represents the raw bits that + // our model uses to represent an actual instruction. In + // the case of HSAIL this is just an index into a list of + // instruction objects. + typedef uint64_t RawMachInst; + + // The MachInst is a representation of an instruction + // that has more information than just the machine code. + // For HSAIL the actual machine code is a BrigInstBase + // and the BrigObject contains more pertinent + // information related to operaands, etc. + + struct MachInst + { + const Brig::BrigInstBase *brigInstBase; + const BrigObject *brigObj; + }; +} + +#endif // __ARCH_HSAIL_GPU_TYPES_HH__ diff --git a/src/arch/hsail/insts/branch.cc b/src/arch/hsail/insts/branch.cc new file mode 100644 index 000000000..d65279cc8 --- /dev/null +++ b/src/arch/hsail/insts/branch.cc @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
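gpu_types.hh and gpu_decoder.hh above encode the model's substitute for real machine code: Decoder::decode(MachInst) builds a GPUStaticInst from the BrigInstBase/BrigObject pair, saveInst() appends it to a static vector, and the returned index is the RawMachInst that later stages hand back to decode() for a plain table lookup. A small Python sketch of that decode-once, look-up-by-index pattern; the string below merely stands in for a GPUStaticInst:

    # Sketch of the decode/saveInst/lookup pattern from gpu_decoder.hh.
    class Decoder:
        decoded_insts = []           # shared cache, like the static vector

        def save_inst(self, decoded):
            # the RawMachInst is just the index of the cached instruction
            self.decoded_insts.append(decoded)
            return len(self.decoded_insts) - 1

        def decode_raw(self, raw):
            # later pipeline stages re-decode by table lookup
            if raw < len(self.decoded_insts):
                return self.decoded_insts[raw]
            return None

    d = Decoder()
    idx = d.save_inst("add_u32 $s0, $s1, $s2")   # stand-in for GPUStaticInst*
    assert d.decode_raw(idx) == "add_u32 $s0, $s1, $s2"
    assert d.decode_raw(999) is None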
+ * + * Author: Anthony Gutierrez + */ + +#include "arch/hsail/insts/branch.hh" + +#include "gpu-compute/hsail_code.hh" + +namespace HsailISA +{ + GPUStaticInst* + decodeBrn(const Brig::BrigInstBase *ib, const BrigObject *obj) + { + // Detect direct vs indirect branch by seeing whether we have a + // register operand. + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + const Brig::BrigOperand *reg = obj->getOperand(op_offs); + + if (reg->kind == Brig::BRIG_KIND_OPERAND_REGISTER) { + return new BrnIndirectInst(ib, obj); + } else { + return new BrnDirectInst(ib, obj); + } + } + + GPUStaticInst* + decodeCbr(const Brig::BrigInstBase *ib, const BrigObject *obj) + { + // Detect direct vs indirect branch by seeing whether we have a + // second register operand (after the condition). + unsigned op_offs = obj->getOperandPtr(ib->operands, 1); + const Brig::BrigOperand *reg = obj->getOperand(op_offs); + + if (reg->kind == Brig::BRIG_KIND_OPERAND_REGISTER) { + return new CbrIndirectInst(ib, obj); + } else { + return new CbrDirectInst(ib, obj); + } + } + + GPUStaticInst* + decodeBr(const Brig::BrigInstBase *ib, const BrigObject *obj) + { + // Detect direct vs indirect branch by seeing whether we have a + // second register operand (after the condition). + unsigned op_offs = obj->getOperandPtr(ib->operands, 1); + const Brig::BrigOperand *reg = obj->getOperand(op_offs); + + if (reg->kind == Brig::BRIG_KIND_OPERAND_REGISTER) { + return new BrIndirectInst(ib, obj); + } else { + return new BrDirectInst(ib, obj); + } + } +} // namespace HsailISA diff --git a/src/arch/hsail/insts/branch.hh b/src/arch/hsail/insts/branch.hh new file mode 100644 index 000000000..54ad9a042 --- /dev/null +++ b/src/arch/hsail/insts/branch.hh @@ -0,0 +1,442 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
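The three decode helpers in branch.cc above share one test: fetch the target operand (index 0 for brn/br, index 1 for cbr, after the condition) and build the indirect variant when it is a register, the direct variant when it is a label. A compact sketch of that selection, with hypothetical stand-ins for the Brig operand kinds and the instruction classes:

    # Sketch of the direct-vs-indirect choice in decodeBrn/decodeCbr/decodeBr.
    # Operand kinds and the *Direct/*Indirect names are stand-ins.
    OPERAND_REGISTER = 'register'
    OPERAND_LABEL = 'label'

    def decode_branch(operands, target_slot, direct_cls, indirect_cls):
        # target_slot is 0 for brn/br and 1 for cbr (condition comes first)
        target = operands[target_slot]
        if target['kind'] == OPERAND_REGISTER:
            return indirect_cls(target)
        return direct_cls(target)

    def CbrDirect(t):   return ('cbr_direct', t['name'])
    def CbrIndirect(t): return ('cbr_indirect', t['name'])

    ops = [{'kind': 'cond', 'name': '$c0'},
           {'kind': OPERAND_LABEL, 'name': '@loop'}]
    print(decode_branch(ops, 1, CbrDirect, CbrIndirect))  # ('cbr_direct', '@loop')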
+ * + * Author: Steve Reinhardt + */ + +#ifndef __ARCH_HSAIL_INSTS_BRANCH_HH__ +#define __ARCH_HSAIL_INSTS_BRANCH_HH__ + +#include "arch/hsail/insts/gpu_static_inst.hh" +#include "arch/hsail/operand.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/wavefront.hh" + +namespace HsailISA +{ + + // The main difference between a direct branch and an indirect branch + // is whether the target is a register or a label, so we can share a + // lot of code if we template the base implementation on that type. + template<typename TargetType> + class BrnInstBase : public HsailGPUStaticInst + { + public: + void generateDisassembly(); + + Brig::BrigWidth8_t width; + TargetType target; + + BrnInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj) + : HsailGPUStaticInst(obj, "brn") + { + o_type = Enums::OT_BRANCH; + width = ((Brig::BrigInstBr*)ib)->width; + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + target.init(op_offs, obj); + o_type = Enums::OT_BRANCH; + } + + uint32_t getTargetPc() override { return target.getTarget(0, 0); } + + bool unconditionalJumpInstruction() override { return true; } + bool isVectorRegister(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return target.isVectorRegister(); + } + bool isCondRegister(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return target.isCondRegister(); + } + bool isScalarRegister(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return target.isScalarRegister(); + } + + bool isSrcOperand(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return true; + } + + bool isDstOperand(int operandIndex) { + return false; + } + + int getOperandSize(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return target.opSize(); + } + + int getRegisterIndex(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return target.regIndex(); + } + + int getNumOperands() { + return 1; + } + + void execute(GPUDynInstPtr gpuDynInst); + }; + + template<typename TargetType> + void + BrnInstBase<TargetType>::generateDisassembly() + { + std::string widthClause; + + if (width != 1) { + widthClause = csprintf("_width(%d)", width); + } + + disassembly = csprintf("%s%s %s", opcode, widthClause, + target.disassemble()); + } + + template<typename TargetType> + void + BrnInstBase<TargetType>::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *w = gpuDynInst->wavefront(); + + if (getTargetPc() == w->rpc()) { + w->popFromReconvergenceStack(); + } else { + // Rpc and execution mask remain the same + w->pc(getTargetPc()); + } + w->discardFetch(); + } + + class BrnDirectInst : public BrnInstBase<LabelOperand> + { + public: + BrnDirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj) + : BrnInstBase<LabelOperand>(ib, obj) + { + } + int numSrcRegOperands() { return 0; } + int numDstRegOperands() { return 0; } + }; + + class BrnIndirectInst : public BrnInstBase<SRegOperand> + { + public: + BrnIndirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj) + : BrnInstBase<SRegOperand>(ib, obj) + { + } + int numSrcRegOperands() { return target.isVectorRegister(); } + int numDstRegOperands() { return 0; } + }; + + GPUStaticInst* decodeBrn(const Brig::BrigInstBase *ib, + const BrigObject *obj); + + template<typename TargetType> + class CbrInstBase : public HsailGPUStaticInst + { + public: + void generateDisassembly(); + + Brig::BrigWidth8_t width; + 
CRegOperand cond; + TargetType target; + + CbrInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj) + : HsailGPUStaticInst(obj, "cbr") + { + o_type = Enums::OT_BRANCH; + width = ((Brig::BrigInstBr *)ib)->width; + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + cond.init(op_offs, obj); + op_offs = obj->getOperandPtr(ib->operands, 1); + target.init(op_offs, obj); + o_type = Enums::OT_BRANCH; + } + + uint32_t getTargetPc() override { return target.getTarget(0, 0); } + + void execute(GPUDynInstPtr gpuDynInst); + // Assumption: Target is operand 0, Condition Register is operand 1 + bool isVectorRegister(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + if (!operandIndex) + return target.isVectorRegister(); + else + return false; + } + bool isCondRegister(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + if (!operandIndex) + return target.isCondRegister(); + else + return true; + } + bool isScalarRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (!operandIndex) + return target.isScalarRegister(); + else + return false; + } + bool isSrcOperand(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex == 0) + return true; + return false; + } + // both Condition Register and Target are source operands + bool isDstOperand(int operandIndex) { + return false; + } + int getOperandSize(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + if (!operandIndex) + return target.opSize(); + else + return 1; + } + int getRegisterIndex(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + if (!operandIndex) + return target.regIndex(); + else + return -1; + } + + // Operands = Target, Condition Register + int getNumOperands() { + return 2; + } + }; + + template<typename TargetType> + void + CbrInstBase<TargetType>::generateDisassembly() + { + std::string widthClause; + + if (width != 1) { + widthClause = csprintf("_width(%d)", width); + } + + disassembly = csprintf("%s%s %s,%s", opcode, widthClause, + cond.disassemble(), target.disassemble()); + } + + template<typename TargetType> + void + CbrInstBase<TargetType>::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *w = gpuDynInst->wavefront(); + + const uint32_t curr_pc = w->pc(); + const uint32_t curr_rpc = w->rpc(); + const VectorMask curr_mask = w->execMask(); + + /** + * TODO: can we move this pop outside the instruction, and + * into the wavefront? 
+ */ + w->popFromReconvergenceStack(); + + // immediate post-dominator instruction + const uint32_t rpc = static_cast<uint32_t>(ipdInstNum()); + if (curr_rpc != rpc) { + w->pushToReconvergenceStack(rpc, curr_rpc, curr_mask); + } + + // taken branch + const uint32_t true_pc = getTargetPc(); + VectorMask true_mask; + for (unsigned int lane = 0; lane < VSZ; ++lane) { + true_mask[lane] = cond.get<bool>(w, lane) & curr_mask[lane]; + } + + // not taken branch + const uint32_t false_pc = curr_pc + 1; + assert(true_pc != false_pc); + if (false_pc != rpc && true_mask.count() < curr_mask.count()) { + VectorMask false_mask = curr_mask & ~true_mask; + w->pushToReconvergenceStack(false_pc, rpc, false_mask); + } + + if (true_pc != rpc && true_mask.count()) { + w->pushToReconvergenceStack(true_pc, rpc, true_mask); + } + assert(w->pc() != curr_pc); + w->discardFetch(); + } + + + class CbrDirectInst : public CbrInstBase<LabelOperand> + { + public: + CbrDirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj) + : CbrInstBase<LabelOperand>(ib, obj) + { + } + // the source operand of a conditional branch is a Condition + // Register which is not stored in the VRF + // so we do not count it as a source-register operand + // even though, formally, it is one. + int numSrcRegOperands() { return 0; } + int numDstRegOperands() { return 0; } + }; + + class CbrIndirectInst : public CbrInstBase<SRegOperand> + { + public: + CbrIndirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj) + : CbrInstBase<SRegOperand>(ib, obj) + { + } + // one source operand of the conditional indirect branch is a Condition + // register which is not stored in the VRF so we do not count it + // as a source-register operand even though, formally, it is one. + int numSrcRegOperands() { return target.isVectorRegister(); } + int numDstRegOperands() { return 0; } + }; + + GPUStaticInst* decodeCbr(const Brig::BrigInstBase *ib, + const BrigObject *obj); + + template<typename TargetType> + class BrInstBase : public HsailGPUStaticInst + { + public: + void generateDisassembly(); + + ImmOperand<uint32_t> width; + TargetType target; + + BrInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj) + : HsailGPUStaticInst(obj, "br") + { + o_type = Enums::OT_BRANCH; + width.init(((Brig::BrigInstBr *)ib)->width, obj); + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + target.init(op_offs, obj); + o_type = Enums::OT_BRANCH; + } + + uint32_t getTargetPc() override { return target.getTarget(0, 0); } + + bool unconditionalJumpInstruction() override { return true; } + + void execute(GPUDynInstPtr gpuDynInst); + bool isVectorRegister(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return target.isVectorRegister(); + } + bool isCondRegister(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return target.isCondRegister(); + } + bool isScalarRegister(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return target.isScalarRegister(); + } + bool isSrcOperand(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return true; + } + bool isDstOperand(int operandIndex) { return false; } + int getOperandSize(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return target.opSize(); + } + int getRegisterIndex(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return target.regIndex(); + } + int getNumOperands() { return 1; } + }; + + 
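CbrInstBase::execute above is where control-flow divergence is managed: the current reconvergence-stack entry is popped, an entry for the immediate post-dominator is pushed when needed, and then the not-taken and taken paths are pushed with their lane masks, so the taken side runs first and both sides eventually meet at the reconvergence PC. A much-simplified Python model of that discipline; entries are plain (pc, rpc, mask) tuples, and post-dominator lookup, discardFetch and the wavefront object are left out:

    # Simplified model of the reconvergence-stack handling in
    # CbrInstBase::execute. The immediate post-dominator pc is passed in
    # rather than computed.
    def exec_cbr(stack, cond_bits, taken_pc, ipd_pc):
        pc, rpc, mask = stack.pop()                  # pop current entry
        if rpc != ipd_pc:
            stack.append((ipd_pc, rpc, mask))        # reconvergence point

        taken = [c and m for c, m in zip(cond_bits, mask)]
        not_taken = [m and not t for m, t in zip(mask, taken)]
        fallthrough_pc = pc + 1

        if fallthrough_pc != ipd_pc and any(not_taken):
            stack.append((fallthrough_pc, ipd_pc, not_taken))
        if taken_pc != ipd_pc and any(taken):
            stack.append((taken_pc, ipd_pc, taken))
        return stack

    stack = [(10, 99, [True] * 4)]       # wavefront at pc 10, rpc 99
    exec_cbr(stack, [True, False, True, False], taken_pc=20, ipd_pc=15)
    # top of stack is the taken path, then the fall-through path, then
    # the reconvergence entry at pc 15
    print(stack)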
template<typename TargetType> + void + BrInstBase<TargetType>::generateDisassembly() + { + std::string widthClause; + + if (width.bits != 1) { + widthClause = csprintf("_width(%d)", width.bits); + } + + disassembly = csprintf("%s%s %s", opcode, widthClause, + target.disassemble()); + } + + template<typename TargetType> + void + BrInstBase<TargetType>::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *w = gpuDynInst->wavefront(); + + if (getTargetPc() == w->rpc()) { + w->popFromReconvergenceStack(); + } else { + // Rpc and execution mask remain the same + w->pc(getTargetPc()); + } + w->discardFetch(); + } + + class BrDirectInst : public BrInstBase<LabelOperand> + { + public: + BrDirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj) + : BrInstBase<LabelOperand>(ib, obj) + { + } + + int numSrcRegOperands() { return 0; } + int numDstRegOperands() { return 0; } + }; + + class BrIndirectInst : public BrInstBase<SRegOperand> + { + public: + BrIndirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj) + : BrInstBase<SRegOperand>(ib, obj) + { + } + int numSrcRegOperands() { return target.isVectorRegister(); } + int numDstRegOperands() { return 0; } + }; + + GPUStaticInst* decodeBr(const Brig::BrigInstBase *ib, + const BrigObject *obj); +} // namespace HsailISA + +#endif // __ARCH_HSAIL_INSTS_BRANCH_HH__ diff --git a/src/arch/hsail/insts/decl.hh b/src/arch/hsail/insts/decl.hh new file mode 100644 index 000000000..e2da501b9 --- /dev/null +++ b/src/arch/hsail/insts/decl.hh @@ -0,0 +1,1106 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
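The unconditional case in BrnInstBase::execute and BrInstBase::execute above is simpler: jumping to the wavefront's reconvergence PC pops the top stack entry, while any other target just redirects the PC and leaves the execution mask untouched. Continuing the simplified stack model from the previous sketch:

    # Unconditional branch against the simplified reconvergence stack:
    # hitting the reconvergence PC pops the entry, anything else only
    # redirects the PC of the current entry.
    def exec_br(stack, target_pc):
        pc, rpc, mask = stack[-1]
        if target_pc == rpc:
            stack.pop()                      # reached the reconvergence point
        else:
            stack[-1] = (target_pc, rpc, mask)
        return stack

    print(exec_br([(11, 15, [False, True])], target_pc=15))  # -> []
    print(exec_br([(11, 15, [False, True])], target_pc=30))
    # -> [(30, 15, [False, True])]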
+ * + * Author: Steve Reinhardt + */ + +#ifndef __ARCH_HSAIL_INSTS_DECL_HH__ +#define __ARCH_HSAIL_INSTS_DECL_HH__ + +#include <cmath> + +#include "arch/hsail/generic_types.hh" +#include "arch/hsail/insts/gpu_static_inst.hh" +#include "arch/hsail/operand.hh" +#include "debug/HSAIL.hh" +#include "enums/OpType.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/shader.hh" + +namespace HsailISA +{ + template<typename _DestOperand, typename _SrcOperand> + class HsailOperandType + { + public: + typedef _DestOperand DestOperand; + typedef _SrcOperand SrcOperand; + }; + + typedef HsailOperandType<CRegOperand, CRegOrImmOperand> CRegOperandType; + typedef HsailOperandType<SRegOperand, SRegOrImmOperand> SRegOperandType; + typedef HsailOperandType<DRegOperand, DRegOrImmOperand> DRegOperandType; + + // The IsBits parameter serves only to disambiguate tbhe B* types from + // the U* types, which otherwise would be identical (and + // indistinguishable). + template<typename _OperandType, typename _CType, Enums::MemType _memType, + vgpr_type _vgprType, int IsBits=0> + class HsailDataType + { + public: + typedef _OperandType OperandType; + typedef _CType CType; + static const Enums::MemType memType = _memType; + static const vgpr_type vgprType = _vgprType; + static const char *label; + }; + + typedef HsailDataType<CRegOperandType, bool, Enums::M_U8, VT_32, 1> B1; + typedef HsailDataType<SRegOperandType, uint8_t, Enums::M_U8, VT_32, 1> B8; + + typedef HsailDataType<SRegOperandType, uint16_t, + Enums::M_U16, VT_32, 1> B16; + + typedef HsailDataType<SRegOperandType, uint32_t, + Enums::M_U32, VT_32, 1> B32; + + typedef HsailDataType<DRegOperandType, uint64_t, + Enums::M_U64, VT_64, 1> B64; + + typedef HsailDataType<SRegOperandType, int8_t, Enums::M_S8, VT_32> S8; + typedef HsailDataType<SRegOperandType, int16_t, Enums::M_S16, VT_32> S16; + typedef HsailDataType<SRegOperandType, int32_t, Enums::M_S32, VT_32> S32; + typedef HsailDataType<DRegOperandType, int64_t, Enums::M_S64, VT_64> S64; + + typedef HsailDataType<SRegOperandType, uint8_t, Enums::M_U8, VT_32> U8; + typedef HsailDataType<SRegOperandType, uint16_t, Enums::M_U16, VT_32> U16; + typedef HsailDataType<SRegOperandType, uint32_t, Enums::M_U32, VT_32> U32; + typedef HsailDataType<DRegOperandType, uint64_t, Enums::M_U64, VT_64> U64; + + typedef HsailDataType<SRegOperandType, float, Enums::M_F32, VT_32> F32; + typedef HsailDataType<DRegOperandType, double, Enums::M_F64, VT_64> F64; + + template<typename DestOperandType, typename SrcOperandType, + int NumSrcOperands> + class CommonInstBase : public HsailGPUStaticInst + { + protected: + typename DestOperandType::DestOperand dest; + typename SrcOperandType::SrcOperand src[NumSrcOperands]; + + void + generateDisassembly() + { + disassembly = csprintf("%s%s %s", opcode, opcode_suffix(), + dest.disassemble()); + + for (int i = 0; i < NumSrcOperands; ++i) { + disassembly += ","; + disassembly += src[i].disassemble(); + } + } + + virtual std::string opcode_suffix() = 0; + + public: + CommonInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *opcode) + : HsailGPUStaticInst(obj, opcode) + { + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + + dest.init(op_offs, obj); + + for (int i = 0; i < NumSrcOperands; ++i) { + op_offs = obj->getOperandPtr(ib->operands, i + 1); + src[i].init(op_offs, obj); + } + } + + bool isVectorRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < NumSrcOperands) + return 
src[operandIndex].isVectorRegister(); + else + return dest.isVectorRegister(); + } + bool isCondRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < NumSrcOperands) + return src[operandIndex].isCondRegister(); + else + return dest.isCondRegister(); + } + bool isScalarRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < NumSrcOperands) + return src[operandIndex].isScalarRegister(); + else + return dest.isScalarRegister(); + } + bool isSrcOperand(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < NumSrcOperands) + return true; + return false; + } + + bool isDstOperand(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex >= NumSrcOperands) + return true; + return false; + } + int getOperandSize(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < NumSrcOperands) + return src[operandIndex].opSize(); + else + return dest.opSize(); + } + int getRegisterIndex(int operandIndex) { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + + if (operandIndex < NumSrcOperands) + return src[operandIndex].regIndex(); + else + return dest.regIndex(); + } + int numSrcRegOperands() { + int operands = 0; + for (int i = 0; i < NumSrcOperands; i++) { + if (src[i].isVectorRegister() == true) { + operands++; + } + } + return operands; + } + int numDstRegOperands() { return dest.isVectorRegister(); } + int getNumOperands() { return NumSrcOperands + 1; } + }; + + template<typename DataType, int NumSrcOperands> + class ArithInst : public CommonInstBase<typename DataType::OperandType, + typename DataType::OperandType, + NumSrcOperands> + { + public: + std::string opcode_suffix() { return csprintf("_%s", DataType::label); } + + ArithInst(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *opcode) + : CommonInstBase<typename DataType::OperandType, + typename DataType::OperandType, + NumSrcOperands>(ib, obj, opcode) + { + } + }; + + template<typename DestOperandType, typename Src0OperandType, + typename Src1OperandType, typename Src2OperandType> + class ThreeNonUniformSourceInstBase : public HsailGPUStaticInst + { + protected: + typename DestOperandType::DestOperand dest; + typename Src0OperandType::SrcOperand src0; + typename Src1OperandType::SrcOperand src1; + typename Src2OperandType::SrcOperand src2; + + void + generateDisassembly() + { + disassembly = csprintf("%s %s,%s,%s,%s", opcode, dest.disassemble(), + src0.disassemble(), src1.disassemble(), + src2.disassemble()); + } + + public: + ThreeNonUniformSourceInstBase(const Brig::BrigInstBase *ib, + const BrigObject *obj, + const char *opcode) + : HsailGPUStaticInst(obj, opcode) + { + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + dest.init(op_offs, obj); + + op_offs = obj->getOperandPtr(ib->operands, 1); + src0.init(op_offs, obj); + + op_offs = obj->getOperandPtr(ib->operands, 2); + src1.init(op_offs, obj); + + op_offs = obj->getOperandPtr(ib->operands, 3); + src2.init(op_offs, obj); + } + + bool isVectorRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (!operandIndex) + return src0.isVectorRegister(); + else if (operandIndex == 1) + return src1.isVectorRegister(); + else if (operandIndex == 2) + return src2.isVectorRegister(); + else + return dest.isVectorRegister(); + } + bool 
isCondRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (!operandIndex) + return src0.isCondRegister(); + else if (operandIndex == 1) + return src1.isCondRegister(); + else if (operandIndex == 2) + return src2.isCondRegister(); + else + return dest.isCondRegister(); + } + bool isScalarRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (!operandIndex) + return src0.isScalarRegister(); + else if (operandIndex == 1) + return src1.isScalarRegister(); + else if (operandIndex == 2) + return src2.isScalarRegister(); + else + return dest.isScalarRegister(); + } + bool isSrcOperand(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < 3) + return true; + else + return false; + } + bool isDstOperand(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex >= 3) + return true; + else + return false; + } + int getOperandSize(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (!operandIndex) + return src0.opSize(); + else if (operandIndex == 1) + return src1.opSize(); + else if (operandIndex == 2) + return src2.opSize(); + else + return dest.opSize(); + } + int getRegisterIndex(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (!operandIndex) + return src0.regIndex(); + else if (operandIndex == 1) + return src1.regIndex(); + else if (operandIndex == 2) + return src2.regIndex(); + else + return dest.regIndex(); + } + + int numSrcRegOperands() { + int operands = 0; + if (src0.isVectorRegister() == true) { + operands++; + } + if (src1.isVectorRegister() == true) { + operands++; + } + if (src2.isVectorRegister() == true) { + operands++; + } + return operands; + } + int numDstRegOperands() { return dest.isVectorRegister(); } + int getNumOperands() { return 4; } + }; + + template<typename DestDataType, typename Src0DataType, + typename Src1DataType, typename Src2DataType> + class ThreeNonUniformSourceInst : + public ThreeNonUniformSourceInstBase<typename DestDataType::OperandType, + typename Src0DataType::OperandType, + typename Src1DataType::OperandType, + typename Src2DataType::OperandType> + { + public: + typedef typename DestDataType::CType DestCType; + typedef typename Src0DataType::CType Src0CType; + typedef typename Src1DataType::CType Src1CType; + typedef typename Src2DataType::CType Src2CType; + + ThreeNonUniformSourceInst(const Brig::BrigInstBase *ib, + const BrigObject *obj, const char *opcode) + : ThreeNonUniformSourceInstBase<typename DestDataType::OperandType, + typename Src0DataType::OperandType, + typename Src1DataType::OperandType, + typename Src2DataType::OperandType>(ib, + obj, opcode) + { + } + }; + + template<typename DataType> + class CmovInst : public ThreeNonUniformSourceInst<DataType, B1, + DataType, DataType> + { + public: + CmovInst(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *opcode) + : ThreeNonUniformSourceInst<DataType, B1, DataType, + DataType>(ib, obj, opcode) + { + } + }; + + template<typename DataType> + class ExtractInsertInst : public ThreeNonUniformSourceInst<DataType, + DataType, U32, + U32> + { + public: + ExtractInsertInst(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *opcode) + : ThreeNonUniformSourceInst<DataType, DataType, U32, + U32>(ib, obj, opcode) + { + } + }; + + template<typename DestOperandType, typename Src0OperandType, + 
typename Src1OperandType> + class TwoNonUniformSourceInstBase : public HsailGPUStaticInst + { + protected: + typename DestOperandType::DestOperand dest; + typename Src0OperandType::SrcOperand src0; + typename Src1OperandType::SrcOperand src1; + + void + generateDisassembly() + { + disassembly = csprintf("%s %s,%s,%s", opcode, dest.disassemble(), + src0.disassemble(), src1.disassemble()); + } + + + public: + TwoNonUniformSourceInstBase(const Brig::BrigInstBase *ib, + const BrigObject *obj, const char *opcode) + : HsailGPUStaticInst(obj, opcode) + { + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + dest.init(op_offs, obj); + + op_offs = obj->getOperandPtr(ib->operands, 1); + src0.init(op_offs, obj); + + op_offs = obj->getOperandPtr(ib->operands, 2); + src1.init(op_offs, obj); + } + bool isVectorRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (!operandIndex) + return src0.isVectorRegister(); + else if (operandIndex == 1) + return src1.isVectorRegister(); + else + return dest.isVectorRegister(); + } + bool isCondRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (!operandIndex) + return src0.isCondRegister(); + else if (operandIndex == 1) + return src1.isCondRegister(); + else + return dest.isCondRegister(); + } + bool isScalarRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (!operandIndex) + return src0.isScalarRegister(); + else if (operandIndex == 1) + return src1.isScalarRegister(); + else + return dest.isScalarRegister(); + } + bool isSrcOperand(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < 2) + return true; + else + return false; + } + bool isDstOperand(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex >= 2) + return true; + else + return false; + } + int getOperandSize(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (!operandIndex) + return src0.opSize(); + else if (operandIndex == 1) + return src1.opSize(); + else + return dest.opSize(); + } + int getRegisterIndex(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (!operandIndex) + return src0.regIndex(); + else if (operandIndex == 1) + return src1.regIndex(); + else + return dest.regIndex(); + } + + int numSrcRegOperands() { + int operands = 0; + if (src0.isVectorRegister() == true) { + operands++; + } + if (src1.isVectorRegister() == true) { + operands++; + } + return operands; + } + int numDstRegOperands() { return dest.isVectorRegister(); } + int getNumOperands() { return 3; } + }; + + template<typename DestDataType, typename Src0DataType, + typename Src1DataType> + class TwoNonUniformSourceInst : + public TwoNonUniformSourceInstBase<typename DestDataType::OperandType, + typename Src0DataType::OperandType, + typename Src1DataType::OperandType> + { + public: + typedef typename DestDataType::CType DestCType; + typedef typename Src0DataType::CType Src0CType; + typedef typename Src1DataType::CType Src1CType; + + TwoNonUniformSourceInst(const Brig::BrigInstBase *ib, + const BrigObject *obj, const char *opcode) + : TwoNonUniformSourceInstBase<typename DestDataType::OperandType, + typename Src0DataType::OperandType, + typename Src1DataType::OperandType>(ib, + obj, opcode) + { + } + }; + + // helper function for ClassInst + template<typename T> + bool + fpclassify(T src0, 
uint32_t src1) + { + int fpclass = std::fpclassify(src0); + + if ((src1 & 0x3) && (fpclass == FP_NAN)) { + return true; + } + + if (src0 <= -0.0) { + if ((src1 & 0x4) && fpclass == FP_INFINITE) + return true; + if ((src1 & 0x8) && fpclass == FP_NORMAL) + return true; + if ((src1 & 0x10) && fpclass == FP_SUBNORMAL) + return true; + if ((src1 & 0x20) && fpclass == FP_ZERO) + return true; + } else { + if ((src1 & 0x40) && fpclass == FP_ZERO) + return true; + if ((src1 & 0x80) && fpclass == FP_SUBNORMAL) + return true; + if ((src1 & 0x100) && fpclass == FP_NORMAL) + return true; + if ((src1 & 0x200) && fpclass == FP_INFINITE) + return true; + } + return false; + } + + template<typename DataType> + class ClassInst : public TwoNonUniformSourceInst<B1, DataType, U32> + { + public: + ClassInst(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *opcode) + : TwoNonUniformSourceInst<B1, DataType, U32>(ib, obj, opcode) + { + } + }; + + template<typename DataType> + class ShiftInst : public TwoNonUniformSourceInst<DataType, DataType, U32> + { + public: + ShiftInst(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *opcode) + : TwoNonUniformSourceInst<DataType, DataType, U32>(ib, obj, opcode) + { + } + }; + + // helper function for CmpInst + template<typename T> + bool + compare(T src0, T src1, Brig::BrigCompareOperation cmpOp) + { + using namespace Brig; + + switch (cmpOp) { + case BRIG_COMPARE_EQ: + case BRIG_COMPARE_EQU: + case BRIG_COMPARE_SEQ: + case BRIG_COMPARE_SEQU: + return (src0 == src1); + + case BRIG_COMPARE_NE: + case BRIG_COMPARE_NEU: + case BRIG_COMPARE_SNE: + case BRIG_COMPARE_SNEU: + return (src0 != src1); + + case BRIG_COMPARE_LT: + case BRIG_COMPARE_LTU: + case BRIG_COMPARE_SLT: + case BRIG_COMPARE_SLTU: + return (src0 < src1); + + case BRIG_COMPARE_LE: + case BRIG_COMPARE_LEU: + case BRIG_COMPARE_SLE: + case BRIG_COMPARE_SLEU: + return (src0 <= src1); + + case BRIG_COMPARE_GT: + case BRIG_COMPARE_GTU: + case BRIG_COMPARE_SGT: + case BRIG_COMPARE_SGTU: + return (src0 > src1); + + case BRIG_COMPARE_GE: + case BRIG_COMPARE_GEU: + case BRIG_COMPARE_SGE: + case BRIG_COMPARE_SGEU: + return (src0 >= src1); + + case BRIG_COMPARE_NUM: + case BRIG_COMPARE_SNUM: + return (src0 == src0) || (src1 == src1); + + case BRIG_COMPARE_NAN: + case BRIG_COMPARE_SNAN: + return (src0 != src0) || (src1 != src1); + + default: + fatal("Bad cmpOp value %d\n", (int)cmpOp); + } + } + + template<typename T> + int32_t + firstbit(T src0) + { + if (!src0) + return -1; + + //handle positive and negative numbers + T tmp = (src0 < 0) ? 
(~src0) : (src0); + + //the starting pos is MSB + int pos = 8 * sizeof(T) - 1; + int cnt = 0; + + //search the first bit set to 1 + while (!(tmp & (1 << pos))) { + ++cnt; + --pos; + } + return cnt; + } + + const char* cmpOpToString(Brig::BrigCompareOperation cmpOp); + + template<typename DestOperandType, typename SrcOperandType> + class CmpInstBase : public CommonInstBase<DestOperandType, SrcOperandType, + 2> + { + protected: + Brig::BrigCompareOperation cmpOp; + + public: + CmpInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : CommonInstBase<DestOperandType, SrcOperandType, 2>(ib, obj, + _opcode) + { + assert(ib->base.kind == Brig::BRIG_KIND_INST_CMP); + Brig::BrigInstCmp *i = (Brig::BrigInstCmp*)ib; + cmpOp = (Brig::BrigCompareOperation)i->compare; + } + }; + + template<typename DestDataType, typename SrcDataType> + class CmpInst : public CmpInstBase<typename DestDataType::OperandType, + typename SrcDataType::OperandType> + { + public: + std::string + opcode_suffix() + { + return csprintf("_%s_%s_%s", cmpOpToString(this->cmpOp), + DestDataType::label, SrcDataType::label); + } + + CmpInst(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : CmpInstBase<typename DestDataType::OperandType, + typename SrcDataType::OperandType>(ib, obj, _opcode) + { + } + }; + + template<typename DestDataType, typename SrcDataType> + class CvtInst : public CommonInstBase<typename DestDataType::OperandType, + typename SrcDataType::OperandType, 1> + { + public: + std::string opcode_suffix() + { + return csprintf("_%s_%s", DestDataType::label, SrcDataType::label); + } + + CvtInst(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : CommonInstBase<typename DestDataType::OperandType, + typename SrcDataType::OperandType, + 1>(ib, obj, _opcode) + { + } + }; + + class SpecialInstNoSrcNoDest : public HsailGPUStaticInst + { + public: + SpecialInstNoSrcNoDest(const Brig::BrigInstBase *ib, + const BrigObject *obj, const char *_opcode) + : HsailGPUStaticInst(obj, _opcode) + { + } + + bool isVectorRegister(int operandIndex) { return false; } + bool isCondRegister(int operandIndex) { return false; } + bool isScalarRegister(int operandIndex) { return false; } + bool isSrcOperand(int operandIndex) { return false; } + bool isDstOperand(int operandIndex) { return false; } + int getOperandSize(int operandIndex) { return 0; } + int getRegisterIndex(int operandIndex) { return -1; } + + int numSrcRegOperands() { return 0; } + int numDstRegOperands() { return 0; } + int getNumOperands() { return 0; } + }; + + template<typename DestOperandType> + class SpecialInstNoSrcBase : public HsailGPUStaticInst + { + protected: + typename DestOperandType::DestOperand dest; + + void generateDisassembly() + { + disassembly = csprintf("%s %s", opcode, dest.disassemble()); + } + + public: + SpecialInstNoSrcBase(const Brig::BrigInstBase *ib, + const BrigObject *obj, const char *_opcode) + : HsailGPUStaticInst(obj, _opcode) + { + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + dest.init(op_offs, obj); + } + + bool isVectorRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return dest.isVectorRegister(); + } + bool isCondRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return dest.isCondRegister(); + } + bool isScalarRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return dest.isScalarRegister(); + } + 
bool isSrcOperand(int operandIndex) { return false; } + bool isDstOperand(int operandIndex) { return true; } + int getOperandSize(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return dest.opSize(); + } + int getRegisterIndex(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return dest.regIndex(); + } + int numSrcRegOperands() { return 0; } + int numDstRegOperands() { return dest.isVectorRegister(); } + int getNumOperands() { return 1; } + }; + + template<typename DestDataType> + class SpecialInstNoSrc : + public SpecialInstNoSrcBase<typename DestDataType::OperandType> + { + public: + typedef typename DestDataType::CType DestCType; + + SpecialInstNoSrc(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : SpecialInstNoSrcBase<typename DestDataType::OperandType>(ib, obj, + _opcode) + { + } + }; + + template<typename DestOperandType> + class SpecialInst1SrcBase : public HsailGPUStaticInst + { + protected: + typedef int SrcCType; // used in execute() template + + typename DestOperandType::DestOperand dest; + ImmOperand<SrcCType> src0; + + void + generateDisassembly() + { + disassembly = csprintf("%s %s,%s", opcode, dest.disassemble(), + src0.disassemble()); + } + + public: + SpecialInst1SrcBase(const Brig::BrigInstBase *ib, + const BrigObject *obj, const char *_opcode) + : HsailGPUStaticInst(obj, _opcode) + { + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + dest.init(op_offs, obj); + + op_offs = obj->getOperandPtr(ib->operands, 1); + src0.init(op_offs, obj); + } + bool isVectorRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return dest.isVectorRegister(); + } + bool isCondRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return dest.isCondRegister(); + } + bool isScalarRegister(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return dest.isScalarRegister(); + } + bool isSrcOperand(int operandIndex) { return false; } + bool isDstOperand(int operandIndex) { return true; } + int getOperandSize(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return dest.opSize(); + } + int getRegisterIndex(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return dest.regIndex(); + } + int numSrcRegOperands() { return 0; } + int numDstRegOperands() { return dest.isVectorRegister(); } + int getNumOperands() { return 1; } + }; + + template<typename DestDataType> + class SpecialInst1Src : + public SpecialInst1SrcBase<typename DestDataType::OperandType> + { + public: + typedef typename DestDataType::CType DestCType; + + SpecialInst1Src(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : SpecialInst1SrcBase<typename DestDataType::OperandType>(ib, obj, + _opcode) + { + } + }; + + class Ret : public SpecialInstNoSrcNoDest + { + public: + typedef SpecialInstNoSrcNoDest Base; + + Ret(const Brig::BrigInstBase *ib, const BrigObject *obj) + : Base(ib, obj, "ret") + { + o_type = Enums::OT_RET; + } + + void execute(GPUDynInstPtr gpuDynInst); + }; + + class Barrier : public SpecialInstNoSrcNoDest + { + public: + typedef SpecialInstNoSrcNoDest Base; + uint8_t width; + + Barrier(const Brig::BrigInstBase *ib, const BrigObject *obj) + : Base(ib, obj, "barrier") + { + o_type = Enums::OT_BARRIER; + assert(ib->base.kind == Brig::BRIG_KIND_INST_BR); + width = 
(uint8_t)((Brig::BrigInstBr*)ib)->width; + } + + void execute(GPUDynInstPtr gpuDynInst); + }; + + class MemFence : public SpecialInstNoSrcNoDest + { + public: + typedef SpecialInstNoSrcNoDest Base; + + Brig::BrigMemoryOrder memFenceMemOrder; + Brig::BrigMemoryScope memFenceScopeSegGroup; + Brig::BrigMemoryScope memFenceScopeSegGlobal; + Brig::BrigMemoryScope memFenceScopeSegImage; + + MemFence(const Brig::BrigInstBase *ib, const BrigObject *obj) + : Base(ib, obj, "memfence") + { + assert(ib->base.kind == Brig::BRIG_KIND_INST_MEM_FENCE); + + memFenceScopeSegGlobal = (Brig::BrigMemoryScope) + ((Brig::BrigInstMemFence*)ib)->globalSegmentMemoryScope; + + memFenceScopeSegGroup = (Brig::BrigMemoryScope) + ((Brig::BrigInstMemFence*)ib)->groupSegmentMemoryScope; + + memFenceScopeSegImage = (Brig::BrigMemoryScope) + ((Brig::BrigInstMemFence*)ib)->imageSegmentMemoryScope; + + memFenceMemOrder = (Brig::BrigMemoryOrder) + ((Brig::BrigInstMemFence*)ib)->memoryOrder; + + // set o_type based on scopes + if (memFenceScopeSegGlobal != Brig::BRIG_MEMORY_SCOPE_NONE && + memFenceScopeSegGroup != Brig::BRIG_MEMORY_SCOPE_NONE) { + o_type = Enums::OT_BOTH_MEMFENCE; + } else if (memFenceScopeSegGlobal != Brig::BRIG_MEMORY_SCOPE_NONE) { + o_type = Enums::OT_GLOBAL_MEMFENCE; + } else if (memFenceScopeSegGroup != Brig::BRIG_MEMORY_SCOPE_NONE) { + o_type = Enums::OT_SHARED_MEMFENCE; + } else { + fatal("MemFence constructor: bad scope specifiers\n"); + } + } + + void + initiateAcc(GPUDynInstPtr gpuDynInst) + { + Wavefront *wave = gpuDynInst->wavefront(); + wave->computeUnit->injectGlobalMemFence(gpuDynInst); + } + + void + execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *w = gpuDynInst->wavefront(); + // 2 cases: + // * memfence to a sequentially consistent memory (e.g., LDS). + // These can be handled as no-ops. + // * memfence to a relaxed consistency cache (e.g., Hermes, Viper, + // etc.). We send a packet, tagged with the memory order and + // scope, and let the GPU coalescer handle it. 
+ + if (o_type == Enums::OT_GLOBAL_MEMFENCE || + o_type == Enums::OT_BOTH_MEMFENCE) { + gpuDynInst->simdId = w->simdId; + gpuDynInst->wfSlotId = w->wfSlotId; + gpuDynInst->wfDynId = w->wfDynId; + gpuDynInst->kern_id = w->kern_id; + gpuDynInst->cu_id = w->computeUnit->cu_id; + + gpuDynInst->memoryOrder = + getGenericMemoryOrder(memFenceMemOrder); + gpuDynInst->scope = + getGenericMemoryScope(memFenceScopeSegGlobal); + gpuDynInst->useContinuation = false; + GlobalMemPipeline* gmp = &(w->computeUnit->globalMemoryPipe); + gmp->getGMReqFIFO().push(gpuDynInst); + + w->wr_gm_reqs_in_pipe--; + w->rd_gm_reqs_in_pipe--; + w->mem_reqs_in_pipe--; + w->outstanding_reqs++; + } else if (o_type == Enums::OT_SHARED_MEMFENCE) { + // no-op + } else { + fatal("MemFence execute: bad o_type\n"); + } + } + }; + + class Call : public HsailGPUStaticInst + { + public: + // private helper functions + void calcAddr(Wavefront* w, GPUDynInstPtr m); + + void + generateDisassembly() + { + if (dest.disassemble() == "") { + disassembly = csprintf("%s %s (%s)", opcode, src0.disassemble(), + src1.disassemble()); + } else { + disassembly = csprintf("%s %s (%s) (%s)", opcode, + src0.disassemble(), dest.disassemble(), + src1.disassemble()); + } + } + + bool + isPseudoOp() + { + std::string func_name = src0.disassemble(); + if (func_name.find("__gem5_hsail_op") != std::string::npos) { + return true; + } + return false; + } + + // member variables + ListOperand dest; + FunctionRefOperand src0; + ListOperand src1; + HsailCode *func_ptr; + + // exec function for pseudo instructions mapped on top of call opcode + void execPseudoInst(Wavefront *w, GPUDynInstPtr gpuDynInst); + + // user-defined pseudo instructions + void MagicPrintLane(Wavefront *w); + void MagicPrintLane64(Wavefront *w); + void MagicPrintWF32(Wavefront *w); + void MagicPrintWF64(Wavefront *w); + void MagicPrintWFFloat(Wavefront *w); + void MagicSimBreak(Wavefront *w); + void MagicPrefixSum(Wavefront *w); + void MagicReduction(Wavefront *w); + void MagicMaskLower(Wavefront *w); + void MagicMaskUpper(Wavefront *w); + void MagicJoinWFBar(Wavefront *w); + void MagicWaitWFBar(Wavefront *w); + void MagicPanic(Wavefront *w); + + void MagicAtomicNRAddGlobalU32Reg(Wavefront *w, + GPUDynInstPtr gpuDynInst); + + void MagicAtomicNRAddGroupU32Reg(Wavefront *w, + GPUDynInstPtr gpuDynInst); + + void MagicLoadGlobalU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst); + + void MagicXactCasLd(Wavefront *w); + void MagicMostSigThread(Wavefront *w); + void MagicMostSigBroadcast(Wavefront *w); + + void MagicPrintWF32ID(Wavefront *w); + void MagicPrintWFID64(Wavefront *w); + + Call(const Brig::BrigInstBase *ib, const BrigObject *obj) + : HsailGPUStaticInst(obj, "call") + { + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + dest.init(op_offs, obj); + op_offs = obj->getOperandPtr(ib->operands, 1); + src0.init(op_offs, obj); + + func_ptr = nullptr; + std::string func_name = src0.disassemble(); + if (!isPseudoOp()) { + func_ptr = dynamic_cast<HsailCode*>(obj-> + getFunction(func_name)); + + if (!func_ptr) + fatal("call::exec cannot find function: %s\n", func_name); + } + + op_offs = obj->getOperandPtr(ib->operands, 2); + src1.init(op_offs, obj); + } + + bool isVectorRegister(int operandIndex) { return false; } + bool isCondRegister(int operandIndex) { return false; } + bool isScalarRegister(int operandIndex) { return false; } + bool isSrcOperand(int operandIndex) { return false; } + bool isDstOperand(int operandIndex) { return false; } + int getOperandSize(int operandIndex) { return 0; } 
+ int getRegisterIndex(int operandIndex) { return -1; } + + void + execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *w = gpuDynInst->wavefront(); + + std::string func_name = src0.disassemble(); + if (isPseudoOp()) { + execPseudoInst(w, gpuDynInst); + } else { + fatal("Native HSAIL functions are not yet implemented: %s\n", + func_name); + } + } + int numSrcRegOperands() { return 0; } + int numDstRegOperands() { return 0; } + int getNumOperands() { return 2; } + }; + + template<typename T> T heynot(T arg) { return ~arg; } + template<> inline bool heynot<bool>(bool arg) { return !arg; } +} // namespace HsailISA + +#endif // __ARCH_HSAIL_INSTS_DECL_HH__ diff --git a/src/arch/hsail/insts/gpu_static_inst.cc b/src/arch/hsail/insts/gpu_static_inst.cc new file mode 100644 index 000000000..bbaeb13e6 --- /dev/null +++ b/src/arch/hsail/insts/gpu_static_inst.cc @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#include "arch/hsail/insts/gpu_static_inst.hh" + +#include "gpu-compute/brig_object.hh" + +namespace HsailISA +{ + HsailGPUStaticInst::HsailGPUStaticInst(const BrigObject *obj, + const std::string &opcode) + : GPUStaticInst(opcode), hsailCode(obj->currentCode) + { + } + + void + HsailGPUStaticInst::generateDisassembly() + { + disassembly = opcode; + } + + const std::string& + HsailGPUStaticInst::disassemble() + { + if (disassembly.empty()) { + generateDisassembly(); + assert(!disassembly.empty()); + } + + return disassembly; + } +} // namespace HsailISA diff --git a/src/arch/hsail/insts/gpu_static_inst.hh b/src/arch/hsail/insts/gpu_static_inst.hh new file mode 100644 index 000000000..29aab1f70 --- /dev/null +++ b/src/arch/hsail/insts/gpu_static_inst.hh @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#ifndef __ARCH_HSAIL_INSTS_GPU_STATIC_INST_HH__ +#define __ARCH_HSAIL_INSTS_GPU_STATIC_INST_HH__ + +/* + * @file gpu_static_inst.hh + * + * Defines the base class representing HSAIL GPU static instructions. + */ + +#include "gpu-compute/gpu_static_inst.hh" + +class BrigObject; +class HsailCode; + +namespace HsailISA +{ + class HsailGPUStaticInst : public GPUStaticInst + { + public: + HsailGPUStaticInst(const BrigObject *obj, const std::string &opcode); + void generateDisassembly(); + const std::string &disassemble(); + uint32_t instSize() { return 4; } + + protected: + HsailCode *hsailCode; + }; +} // namespace HsailISA + +#endif // __ARCH_HSAIL_INSTS_GPU_STATIC_INST_HH__ diff --git a/src/arch/hsail/insts/main.cc b/src/arch/hsail/insts/main.cc new file mode 100644 index 000000000..4e70bf46a --- /dev/null +++ b/src/arch/hsail/insts/main.cc @@ -0,0 +1,208 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#include "arch/hsail/insts/decl.hh" +#include "debug/GPUExec.hh" +#include "gpu-compute/dispatcher.hh" +#include "gpu-compute/simple_pool_manager.hh" + +namespace HsailISA +{ + template<> const char *B1::label = "b1"; + template<> const char *B8::label = "b8"; + template<> const char *B16::label = "b16"; + template<> const char *B32::label = "b32"; + template<> const char *B64::label = "b64"; + + template<> const char *S8::label = "s8"; + template<> const char *S16::label = "s16"; + template<> const char *S32::label = "s32"; + template<> const char *S64::label = "s64"; + + template<> const char *U8::label = "u8"; + template<> const char *U16::label = "u16"; + template<> const char *U32::label = "u32"; + template<> const char *U64::label = "u64"; + + template<> const char *F32::label = "f32"; + template<> const char *F64::label = "f64"; + + const char* + cmpOpToString(Brig::BrigCompareOperation cmpOp) + { + using namespace Brig; + + switch (cmpOp) { + case BRIG_COMPARE_EQ: + return "eq"; + case BRIG_COMPARE_NE: + return "ne"; + case BRIG_COMPARE_LT: + return "lt"; + case BRIG_COMPARE_LE: + return "le"; + case BRIG_COMPARE_GT: + return "gt"; + case BRIG_COMPARE_GE: + return "ge"; + case BRIG_COMPARE_EQU: + return "equ"; + case BRIG_COMPARE_NEU: + return "neu"; + case BRIG_COMPARE_LTU: + return "ltu"; + case BRIG_COMPARE_LEU: + return "leu"; + case BRIG_COMPARE_GTU: + return "gtu"; + case BRIG_COMPARE_GEU: + return "geu"; + case BRIG_COMPARE_NUM: + return "num"; + case BRIG_COMPARE_NAN: + return "nan"; + case BRIG_COMPARE_SEQ: + return "seq"; + case BRIG_COMPARE_SNE: + return "sne"; + case BRIG_COMPARE_SLT: + return "slt"; + case BRIG_COMPARE_SLE: + return "sle"; + case BRIG_COMPARE_SGT: + return "sgt"; + case BRIG_COMPARE_SGE: + return "sge"; + case BRIG_COMPARE_SGEU: + return "sgeu"; + case BRIG_COMPARE_SEQU: + return "sequ"; + case BRIG_COMPARE_SNEU: + return "sneu"; + case BRIG_COMPARE_SLTU: + return "sltu"; + case BRIG_COMPARE_SLEU: + return "sleu"; + case BRIG_COMPARE_SNUM: + return "snum"; + case BRIG_COMPARE_SNAN: + return "snan"; + case BRIG_COMPARE_SGTU: + return "sgtu"; + default: + return "unknown"; + } + } + + void + Ret::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *w = gpuDynInst->wavefront(); + + const VectorMask &mask = w->get_pred(); + + // mask off completed work-items + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + w->init_mask[lane] = 0; + } + + } + + // delete extra instructions fetched for completed work-items + w->instructionBuffer.erase(w->instructionBuffer.begin() + 1, + w->instructionBuffer.end()); + if (w->pendingFetch) { + w->dropFetch = true; + } + + // if all work-items have completed, then wave-front is done + 
if (w->init_mask.none()) { + w->status = Wavefront::S_STOPPED; + + int32_t refCount = w->computeUnit->getLds(). + decreaseRefCounter(w->dispatchid, w->wg_id); + + DPRINTF(GPUExec, "CU%d: decrease ref ctr WG[%d] to [%d]\n", + w->computeUnit->cu_id, w->wg_id, refCount); + + // free the vector registers of the completed wavefront + w->computeUnit->vectorRegsReserved[w->simdId] -= + w->reservedVectorRegs; + + assert(w->computeUnit->vectorRegsReserved[w->simdId] >= 0); + + uint32_t endIndex = (w->startVgprIndex + + w->reservedVectorRegs - 1) % + w->computeUnit->vrf[w->simdId]->numRegs(); + + w->computeUnit->vrf[w->simdId]->manager-> + freeRegion(w->startVgprIndex, endIndex); + + w->reservedVectorRegs = 0; + w->startVgprIndex = 0; + w->computeUnit->completedWfs++; + + DPRINTF(GPUExec, "Doing return for CU%d: WF[%d][%d][%d]\n", + w->computeUnit->cu_id, w->simdId, w->wfSlotId, w->wfDynId); + + if (!refCount) { + // Notify Memory System of Kernel Completion + // Kernel End = isKernel + isRelease + w->status = Wavefront::S_RETURNING; + GPUDynInstPtr local_mempacket = gpuDynInst; + local_mempacket->memoryOrder = Enums::MEMORY_ORDER_SC_RELEASE; + local_mempacket->scope = Enums::MEMORY_SCOPE_SYSTEM; + local_mempacket->useContinuation = false; + local_mempacket->simdId = w->simdId; + local_mempacket->wfSlotId = w->wfSlotId; + local_mempacket->wfDynId = w->wfDynId; + w->computeUnit->injectGlobalMemFence(local_mempacket, true); + } else { + w->computeUnit->shader->dispatcher->scheduleDispatch(); + } + } + } + + void + Barrier::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *w = gpuDynInst->wavefront(); + + assert(w->barrier_cnt == w->old_barrier_cnt); + w->barrier_cnt = w->old_barrier_cnt + 1; + w->stalledAtBarrier = true; + } +} // namespace HsailISA diff --git a/src/arch/hsail/insts/mem.cc b/src/arch/hsail/insts/mem.cc new file mode 100644 index 000000000..97d4c902b --- /dev/null +++ b/src/arch/hsail/insts/mem.cc @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#include "arch/hsail/insts/mem.hh" + +#include "arch/hsail/Brig.h" +#include "enums/OpType.hh" + +using namespace Brig; + +namespace HsailISA +{ + const char* atomicOpToString(BrigAtomicOperation brigOp); + + Enums::MemOpType + brigAtomicToMemOpType(BrigOpcode brigOpCode, BrigAtomicOperation brigOp) + { + if (brigOpCode == Brig::BRIG_OPCODE_ATOMIC) { + switch (brigOp) { + case BRIG_ATOMIC_AND: + return Enums::MO_AAND; + case BRIG_ATOMIC_OR: + return Enums::MO_AOR; + case BRIG_ATOMIC_XOR: + return Enums::MO_AXOR; + case BRIG_ATOMIC_CAS: + return Enums::MO_ACAS; + case BRIG_ATOMIC_EXCH: + return Enums::MO_AEXCH; + case BRIG_ATOMIC_ADD: + return Enums::MO_AADD; + case BRIG_ATOMIC_WRAPINC: + return Enums::MO_AINC; + case BRIG_ATOMIC_WRAPDEC: + return Enums::MO_ADEC; + case BRIG_ATOMIC_MIN: + return Enums::MO_AMIN; + case BRIG_ATOMIC_MAX: + return Enums::MO_AMAX; + case BRIG_ATOMIC_SUB: + return Enums::MO_ASUB; + default: + fatal("Bad BrigAtomicOperation code %d\n", brigOp); + } + } else if (brigOpCode == Brig::BRIG_OPCODE_ATOMICNORET) { + switch (brigOp) { + case BRIG_ATOMIC_AND: + return Enums::MO_ANRAND; + case BRIG_ATOMIC_OR: + return Enums::MO_ANROR; + case BRIG_ATOMIC_XOR: + return Enums::MO_ANRXOR; + case BRIG_ATOMIC_CAS: + return Enums::MO_ANRCAS; + case BRIG_ATOMIC_EXCH: + return Enums::MO_ANREXCH; + case BRIG_ATOMIC_ADD: + return Enums::MO_ANRADD; + case BRIG_ATOMIC_WRAPINC: + return Enums::MO_ANRINC; + case BRIG_ATOMIC_WRAPDEC: + return Enums::MO_ANRDEC; + case BRIG_ATOMIC_MIN: + return Enums::MO_ANRMIN; + case BRIG_ATOMIC_MAX: + return Enums::MO_ANRMAX; + case BRIG_ATOMIC_SUB: + return Enums::MO_ANRSUB; + default: + fatal("Bad BrigAtomicOperation code %d\n", brigOp); + } + } else { + fatal("Bad BrigAtomicOpcode %d\n", brigOpCode); + } + } + + const char* + atomicOpToString(BrigAtomicOperation brigOp) + { + switch (brigOp) { + case BRIG_ATOMIC_AND: + return "and"; + case BRIG_ATOMIC_OR: + return "or"; + case BRIG_ATOMIC_XOR: + return "xor"; + case BRIG_ATOMIC_CAS: + return "cas"; + case BRIG_ATOMIC_EXCH: + return "exch"; + case BRIG_ATOMIC_ADD: + return "add"; + case BRIG_ATOMIC_WRAPINC: + return "inc"; + case BRIG_ATOMIC_WRAPDEC: + return "dec"; + case BRIG_ATOMIC_MIN: + return "min"; + case BRIG_ATOMIC_MAX: + return "max"; + case BRIG_ATOMIC_SUB: + return "sub"; + default: + return "unknown"; + } + } +} // namespace HsailISA diff --git a/src/arch/hsail/insts/mem.hh b/src/arch/hsail/insts/mem.hh new file mode 100644 index 000000000..d3ce76dee --- /dev/null +++ b/src/arch/hsail/insts/mem.hh @@ -0,0 +1,1629 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#ifndef __ARCH_HSAIL_INSTS_MEM_HH__ +#define __ARCH_HSAIL_INSTS_MEM_HH__ + +#include "arch/hsail/insts/decl.hh" +#include "arch/hsail/insts/gpu_static_inst.hh" +#include "arch/hsail/operand.hh" + +namespace HsailISA +{ + class MemInst + { + public: + MemInst() : size(0), addr_operand(nullptr) { } + + MemInst(Enums::MemType m_type) + { + if (m_type == Enums::M_U64 || + m_type == Enums::M_S64 || + m_type == Enums::M_F64) { + size = 8; + } else if (m_type == Enums::M_U32 || + m_type == Enums::M_S32 || + m_type == Enums::M_F32) { + size = 4; + } else if (m_type == Enums::M_U16 || + m_type == Enums::M_S16 || + m_type == Enums::M_F16) { + size = 2; + } else { + size = 1; + } + + addr_operand = nullptr; + } + + void + init_addr(AddrOperandBase *_addr_operand) + { + addr_operand = _addr_operand; + } + + private: + int size; + AddrOperandBase *addr_operand; + + public: + int getMemOperandSize() { return size; } + AddrOperandBase *getAddressOperand() { return addr_operand; } + }; + + template<typename DestOperandType, typename AddrOperandType> + class LdaInstBase : public HsailGPUStaticInst + { + public: + typename DestOperandType::DestOperand dest; + AddrOperandType addr; + + LdaInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : HsailGPUStaticInst(obj, _opcode) + { + using namespace Brig; + + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + dest.init(op_offs, obj); + op_offs = obj->getOperandPtr(ib->operands, 1); + addr.init(op_offs, obj); + } + + int numSrcRegOperands() { return(this->addr.isVectorRegister()); } + int numDstRegOperands() { return dest.isVectorRegister(); } + bool isVectorRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return((operandIndex == 0) ? dest.isVectorRegister() : + this->addr.isVectorRegister()); + } + bool isCondRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return((operandIndex == 0) ? 
dest.isCondRegister() : + this->addr.isCondRegister()); + } + bool isScalarRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return((operandIndex == 0) ? dest.isScalarRegister() : + this->addr.isScalarRegister()); + } + bool isSrcOperand(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex > 0) + return(this->addr.isVectorRegister()); + return false; + } + bool isDstOperand(int operandIndex) { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return(operandIndex == 0); + } + int getOperandSize(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return((operandIndex == 0) ? dest.opSize() : + this->addr.opSize()); + } + int getRegisterIndex(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return((operandIndex == 0) ? dest.regIndex() : + this->addr.regIndex()); + } + int getNumOperands() + { + if (this->addr.isVectorRegister()) + return 2; + return 1; + } + }; + + template<typename DestDataType, typename AddrOperandType> + class LdaInst : + public LdaInstBase<typename DestDataType::OperandType, AddrOperandType>, + public MemInst + { + public: + void generateDisassembly(); + + LdaInst(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : LdaInstBase<typename DestDataType::OperandType, + AddrOperandType>(ib, obj, _opcode) + { + init_addr(&this->addr); + } + + void execute(GPUDynInstPtr gpuDynInst); + }; + + template<typename DataType> + GPUStaticInst* + decodeLda(const Brig::BrigInstBase *ib, const BrigObject *obj) + { + unsigned op_offs = obj->getOperandPtr(ib->operands, 1); + BrigRegOperandInfo regDataType = findRegDataType(op_offs, obj); + + if (regDataType.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) { + return new LdaInst<DataType, NoRegAddrOperand>(ib, obj, "ldas"); + } else if (regDataType.kind == Brig::BRIG_KIND_OPERAND_REGISTER) { + // V2/V4 not allowed + switch (regDataType.regKind) { + case Brig::BRIG_REGISTER_KIND_SINGLE: + return new LdaInst<DataType, SRegAddrOperand>(ib, obj, "ldas"); + case Brig::BRIG_REGISTER_KIND_DOUBLE: + return new LdaInst<DataType, DRegAddrOperand>(ib, obj, "ldas"); + default: + fatal("Bad ldas register operand type %d\n", regDataType.type); + } + } else { + fatal("Bad ldas register operand kind %d\n", regDataType.kind); + } + } + + template<typename MemOperandType, typename DestOperandType, + typename AddrOperandType> + class LdInstBase : public HsailGPUStaticInst + { + public: + Brig::BrigWidth8_t width; + typename DestOperandType::DestOperand dest; + AddrOperandType addr; + + Brig::BrigSegment segment; + Brig::BrigMemoryOrder memoryOrder; + Brig::BrigMemoryScope memoryScope; + unsigned int equivClass; + bool isArgLoad() + { + return segment == Brig::BRIG_SEGMENT_KERNARG || + segment == Brig::BRIG_SEGMENT_ARG; + } + void + initLd(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + { + using namespace Brig; + + const BrigInstMem *ldst = (const BrigInstMem*)ib; + + segment = (BrigSegment)ldst->segment; + memoryOrder = BRIG_MEMORY_ORDER_NONE; + memoryScope = BRIG_MEMORY_SCOPE_NONE; + equivClass = ldst->equivClass; + + switch (segment) { + case BRIG_SEGMENT_GLOBAL: + o_type = Enums::OT_GLOBAL_READ; + break; + + case BRIG_SEGMENT_GROUP: + o_type = Enums::OT_SHARED_READ; + break; + + case BRIG_SEGMENT_PRIVATE: + o_type = Enums::OT_PRIVATE_READ; + break; + + case BRIG_SEGMENT_READONLY: + o_type = 
Enums::OT_READONLY_READ; + break; + + case BRIG_SEGMENT_SPILL: + o_type = Enums::OT_SPILL_READ; + break; + + case BRIG_SEGMENT_FLAT: + o_type = Enums::OT_FLAT_READ; + break; + + case BRIG_SEGMENT_KERNARG: + o_type = Enums::OT_KERN_READ; + break; + + case BRIG_SEGMENT_ARG: + o_type = Enums::OT_ARG; + break; + + default: + panic("Ld: segment %d not supported\n", segment); + } + + width = ldst->width; + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + const Brig::BrigOperand *brigOp = obj->getOperand(op_offs); + if (brigOp->kind == BRIG_KIND_OPERAND_REGISTER) + dest.init(op_offs, obj); + + op_offs = obj->getOperandPtr(ib->operands, 1); + addr.init(op_offs, obj); + } + + void + initAtomicLd(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + { + using namespace Brig; + + const BrigInstAtomic *at = (const BrigInstAtomic*)ib; + + segment = (BrigSegment)at->segment; + memoryOrder = (BrigMemoryOrder)at->memoryOrder; + memoryScope = (BrigMemoryScope)at->memoryScope; + equivClass = 0; + + switch (segment) { + case BRIG_SEGMENT_GLOBAL: + o_type = Enums::OT_GLOBAL_READ; + break; + + case BRIG_SEGMENT_GROUP: + o_type = Enums::OT_SHARED_READ; + break; + + case BRIG_SEGMENT_PRIVATE: + o_type = Enums::OT_PRIVATE_READ; + break; + + case BRIG_SEGMENT_READONLY: + o_type = Enums::OT_READONLY_READ; + break; + + case BRIG_SEGMENT_SPILL: + o_type = Enums::OT_SPILL_READ; + break; + + case BRIG_SEGMENT_FLAT: + o_type = Enums::OT_FLAT_READ; + break; + + case BRIG_SEGMENT_KERNARG: + o_type = Enums::OT_KERN_READ; + break; + + case BRIG_SEGMENT_ARG: + o_type = Enums::OT_ARG; + break; + + default: + panic("Ld: segment %d not supported\n", segment); + } + + width = BRIG_WIDTH_1; + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + const Brig::BrigOperand *brigOp = obj->getOperand(op_offs); + + if (brigOp->kind == BRIG_KIND_OPERAND_REGISTER) + dest.init(op_offs, obj); + + op_offs = obj->getOperandPtr(ib->operands,1); + addr.init(op_offs, obj); + } + + LdInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : HsailGPUStaticInst(obj, _opcode) + { + using namespace Brig; + + if (ib->opcode == BRIG_OPCODE_LD) { + initLd(ib, obj, _opcode); + } else { + initAtomicLd(ib, obj, _opcode); + } + } + + int numSrcRegOperands() { return(this->addr.isVectorRegister()); } + int numDstRegOperands() { return dest.isVectorRegister(); } + int getNumOperands() + { + if (this->addr.isVectorRegister()) + return 2; + else + return 1; + } + bool isVectorRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return((operandIndex == 0) ? dest.isVectorRegister() : + this->addr.isVectorRegister()); + } + bool isCondRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return((operandIndex == 0) ? dest.isCondRegister() : + this->addr.isCondRegister()); + } + bool isScalarRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return((operandIndex == 0) ? 
dest.isScalarRegister() : + this->addr.isScalarRegister()); + } + bool isSrcOperand(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex > 0) + return(this->addr.isVectorRegister()); + return false; + } + bool isDstOperand(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return(operandIndex == 0); + } + int getOperandSize(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return((operandIndex == 0) ? dest.opSize() : + this->addr.opSize()); + } + int getRegisterIndex(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return((operandIndex == 0) ? dest.regIndex() : + this->addr.regIndex()); + } + }; + + template<typename MemDataType, typename DestDataType, + typename AddrOperandType> + class LdInst : + public LdInstBase<typename MemDataType::CType, + typename DestDataType::OperandType, AddrOperandType>, + public MemInst + { + typename DestDataType::OperandType::DestOperand dest_vect[4]; + uint16_t num_dest_operands; + void generateDisassembly(); + + public: + LdInst(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : LdInstBase<typename MemDataType::CType, + typename DestDataType::OperandType, + AddrOperandType>(ib, obj, _opcode), + MemInst(MemDataType::memType) + { + init_addr(&this->addr); + + unsigned op_offs = obj->getOperandPtr(ib->operands,0); + const Brig::BrigOperand *brigOp = obj->getOperand(op_offs); + + if (brigOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) { + const Brig::BrigOperandOperandList *brigRegVecOp = + (const Brig::BrigOperandOperandList*)brigOp; + + num_dest_operands = + *((unsigned*)obj->getData(brigRegVecOp->elements)) / 4; + + assert(num_dest_operands <= 4); + } else { + num_dest_operands = 1; + } + + if (num_dest_operands > 1) { + assert(brigOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST); + + for (int i = 0; i < num_dest_operands; ++i) { + dest_vect[i].init_from_vect(op_offs, obj, i); + } + } + } + + void + initiateAcc(GPUDynInstPtr gpuDynInst) override + { + typedef typename MemDataType::CType c0; + + gpuDynInst->statusBitVector = gpuDynInst->exec_mask; + + if (num_dest_operands > 1) { + for (int i = 0; i < VSZ; ++i) + if (gpuDynInst->exec_mask[i]) + gpuDynInst->statusVector.push_back(num_dest_operands); + else + gpuDynInst->statusVector.push_back(0); + } + + for (int k = 0; k < num_dest_operands; ++k) { + + c0 *d = &((c0*)gpuDynInst->d_data)[k * VSZ]; + + for (int i = 0; i < VSZ; ++i) { + if (gpuDynInst->exec_mask[i]) { + Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0); + + if (isLocalMem()) { + // load from shared memory + *d = gpuDynInst->wavefront()->ldsChunk-> + read<c0>(vaddr); + } else { + Request *req = new Request(0, vaddr, sizeof(c0), 0, + gpuDynInst->computeUnit()->masterId(), + 0, gpuDynInst->wfDynId, i); + + gpuDynInst->setRequestFlags(req); + PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + pkt->dataStatic(d); + + if (gpuDynInst->computeUnit()->shader-> + separate_acquire_release && + gpuDynInst->memoryOrder == + Enums::MEMORY_ORDER_SC_ACQUIRE) { + // if this load has acquire semantics, + // set the response continuation function + // to perform an Acquire request + gpuDynInst->execContinuation = + &GPUStaticInst::execLdAcq; + + gpuDynInst->useContinuation = true; + } else { + // the request will be finished when + // the load completes + gpuDynInst->useContinuation = false; + } + // translation is performed in sendRequest() + 
gpuDynInst->computeUnit()->sendRequest(gpuDynInst, + i, pkt); + } + } + ++d; + } + } + + gpuDynInst->updateStats(); + } + + private: + void + execLdAcq(GPUDynInstPtr gpuDynInst) override + { + // after the load has completed, and if the load has acquire + // semantics, issue an acquire request. + if (!isLocalMem()) { + if (gpuDynInst->computeUnit()->shader->separate_acquire_release + && gpuDynInst->memoryOrder == + Enums::MEMORY_ORDER_SC_ACQUIRE) { + gpuDynInst->statusBitVector = VectorMask(1); + gpuDynInst->useContinuation = false; + // create request + Request *req = new Request(0, 0, 0, 0, + gpuDynInst->computeUnit()->masterId(), + 0, gpuDynInst->wfDynId, -1); + req->setFlags(Request::ACQUIRE); + gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req); + } + } + } + + public: + bool + isLocalMem() const override + { + return this->segment == Brig::BRIG_SEGMENT_GROUP; + } + + bool isVectorRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if ((num_dest_operands != getNumOperands()) && + (operandIndex == (getNumOperands()-1))) + return(this->addr.isVectorRegister()); + if (num_dest_operands > 1) { + return dest_vect[operandIndex].isVectorRegister(); + } + else if (num_dest_operands == 1) { + return LdInstBase<typename MemDataType::CType, + typename DestDataType::OperandType, + AddrOperandType>::dest.isVectorRegister(); + } + return false; + } + bool isCondRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if ((num_dest_operands != getNumOperands()) && + (operandIndex == (getNumOperands()-1))) + return(this->addr.isCondRegister()); + if (num_dest_operands > 1) + return dest_vect[operandIndex].isCondRegister(); + else if (num_dest_operands == 1) + return LdInstBase<typename MemDataType::CType, + typename DestDataType::OperandType, + AddrOperandType>::dest.isCondRegister(); + return false; + } + bool isScalarRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if ((num_dest_operands != getNumOperands()) && + (operandIndex == (getNumOperands()-1))) + return(this->addr.isScalarRegister()); + if (num_dest_operands > 1) + return dest_vect[operandIndex].isScalarRegister(); + else if (num_dest_operands == 1) + return LdInstBase<typename MemDataType::CType, + typename DestDataType::OperandType, + AddrOperandType>::dest.isScalarRegister(); + return false; + } + bool isSrcOperand(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if ((num_dest_operands != getNumOperands()) && + (operandIndex == (getNumOperands()-1))) + return(this->addr.isVectorRegister()); + return false; + } + bool isDstOperand(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if ((num_dest_operands != getNumOperands()) && + (operandIndex == (getNumOperands()-1))) + return false; + return true; + } + int getOperandSize(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if ((num_dest_operands != getNumOperands()) && + (operandIndex == (getNumOperands()-1))) + return(this->addr.opSize()); + if (num_dest_operands > 1) + return(dest_vect[operandIndex].opSize()); + else if (num_dest_operands == 1) + return(LdInstBase<typename MemDataType::CType, + typename DestDataType::OperandType, + AddrOperandType>::dest.opSize()); + return 0; + } + int getRegisterIndex(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if
((num_dest_operands != getNumOperands()) && + (operandIndex == (getNumOperands()-1))) + return(this->addr.regIndex()); + if (num_dest_operands > 1) + return(dest_vect[operandIndex].regIndex()); + else if (num_dest_operands == 1) + return(LdInstBase<typename MemDataType::CType, + typename DestDataType::OperandType, + AddrOperandType>::dest.regIndex()); + return -1; + } + int getNumOperands() + { + if (this->addr.isVectorRegister() || this->addr.isScalarRegister()) + return(num_dest_operands+1); + else + return(num_dest_operands); + } + void execute(GPUDynInstPtr gpuDynInst); + }; + + template<typename MemDT, typename DestDT> + GPUStaticInst* + decodeLd2(const Brig::BrigInstBase *ib, const BrigObject *obj) + { + unsigned op_offs = obj->getOperandPtr(ib->operands,1); + BrigRegOperandInfo tmp = findRegDataType(op_offs, obj); + + if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) { + return new LdInst<MemDT, DestDT, NoRegAddrOperand>(ib, obj, "ld"); + } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER || + tmp.kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) { + switch (tmp.regKind) { + case Brig::BRIG_REGISTER_KIND_SINGLE: + return new LdInst<MemDT, DestDT, + SRegAddrOperand>(ib, obj, "ld"); + case Brig::BRIG_REGISTER_KIND_DOUBLE: + return new LdInst<MemDT, DestDT, + DRegAddrOperand>(ib, obj, "ld"); + default: + fatal("Bad ld register operand type %d\n", tmp.regKind); + } + } else { + fatal("Bad ld register operand kind %d\n", tmp.kind); + } + } + + template<typename MemDT> + GPUStaticInst* + decodeLd(const Brig::BrigInstBase *ib, const BrigObject *obj) + { + unsigned op_offs = obj->getOperandPtr(ib->operands,0); + BrigRegOperandInfo dest = findRegDataType(op_offs, obj); + + assert(dest.kind == Brig::BRIG_KIND_OPERAND_REGISTER || + dest.kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST); + switch(dest.regKind) { + case Brig::BRIG_REGISTER_KIND_SINGLE: + switch (ib->type) { + case Brig::BRIG_TYPE_B8: + case Brig::BRIG_TYPE_B16: + case Brig::BRIG_TYPE_B32: + return decodeLd2<MemDT, B32>(ib, obj); + case Brig::BRIG_TYPE_U8: + case Brig::BRIG_TYPE_U16: + case Brig::BRIG_TYPE_U32: + return decodeLd2<MemDT, U32>(ib, obj); + case Brig::BRIG_TYPE_S8: + case Brig::BRIG_TYPE_S16: + case Brig::BRIG_TYPE_S32: + return decodeLd2<MemDT, S32>(ib, obj); + case Brig::BRIG_TYPE_F16: + case Brig::BRIG_TYPE_F32: + return decodeLd2<MemDT, U32>(ib, obj); + default: + fatal("Bad ld register operand type %d, %d\n", + dest.regKind, ib->type); + }; + case Brig::BRIG_REGISTER_KIND_DOUBLE: + switch (ib->type) { + case Brig::BRIG_TYPE_B64: + return decodeLd2<MemDT, B64>(ib, obj); + case Brig::BRIG_TYPE_U64: + return decodeLd2<MemDT, U64>(ib, obj); + case Brig::BRIG_TYPE_S64: + return decodeLd2<MemDT, S64>(ib, obj); + case Brig::BRIG_TYPE_F64: + return decodeLd2<MemDT, U64>(ib, obj); + default: + fatal("Bad ld register operand type %d, %d\n", + dest.regKind, ib->type); + }; + default: + fatal("Bad ld register operand type %d, %d\n", dest.regKind, + ib->type); + } + } + + template<typename MemDataType, typename SrcOperandType, + typename AddrOperandType> + class StInstBase : public HsailGPUStaticInst + { + public: + typename SrcOperandType::SrcOperand src; + AddrOperandType addr; + + Brig::BrigSegment segment; + Brig::BrigMemoryScope memoryScope; + Brig::BrigMemoryOrder memoryOrder; + unsigned int equivClass; + + void + initSt(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + { + using namespace Brig; + + const BrigInstMem *ldst = (const BrigInstMem*)ib; + + segment = (BrigSegment)ldst->segment; + 
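+            // the BrigInstMem encoding carries no memory order/scope fields
+            // for a plain st, so default both to NONE here; initAtomicSt()
+            // below reads them from the BrigInstAtomic fields instead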
memoryOrder = BRIG_MEMORY_ORDER_NONE; + memoryScope = BRIG_MEMORY_SCOPE_NONE; + equivClass = ldst->equivClass; + + switch (segment) { + case BRIG_SEGMENT_GLOBAL: + o_type = Enums::OT_GLOBAL_WRITE; + break; + + case BRIG_SEGMENT_GROUP: + o_type = Enums::OT_SHARED_WRITE; + break; + + case BRIG_SEGMENT_PRIVATE: + o_type = Enums::OT_PRIVATE_WRITE; + break; + + case BRIG_SEGMENT_READONLY: + o_type = Enums::OT_READONLY_WRITE; + break; + + case BRIG_SEGMENT_SPILL: + o_type = Enums::OT_SPILL_WRITE; + break; + + case BRIG_SEGMENT_FLAT: + o_type = Enums::OT_FLAT_WRITE; + break; + + case BRIG_SEGMENT_ARG: + o_type = Enums::OT_ARG; + break; + + default: + panic("St: segment %d not supported\n", segment); + } + + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + const BrigOperand *baseOp = obj->getOperand(op_offs); + + if ((baseOp->kind == BRIG_KIND_OPERAND_CONSTANT_BYTES) || + (baseOp->kind == BRIG_KIND_OPERAND_REGISTER)) { + src.init(op_offs, obj); + } + + op_offs = obj->getOperandPtr(ib->operands, 1); + addr.init(op_offs, obj); + } + + void + initAtomicSt(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + { + using namespace Brig; + + const BrigInstAtomic *at = (const BrigInstAtomic*)ib; + + segment = (BrigSegment)at->segment; + memoryScope = (BrigMemoryScope)at->memoryScope; + memoryOrder = (BrigMemoryOrder)at->memoryOrder; + equivClass = 0; + + switch (segment) { + case BRIG_SEGMENT_GLOBAL: + o_type = Enums::OT_GLOBAL_WRITE; + break; + + case BRIG_SEGMENT_GROUP: + o_type = Enums::OT_SHARED_WRITE; + break; + + case BRIG_SEGMENT_PRIVATE: + o_type = Enums::OT_PRIVATE_WRITE; + break; + + case BRIG_SEGMENT_READONLY: + o_type = Enums::OT_READONLY_WRITE; + break; + + case BRIG_SEGMENT_SPILL: + o_type = Enums::OT_SPILL_WRITE; + break; + + case BRIG_SEGMENT_FLAT: + o_type = Enums::OT_FLAT_WRITE; + break; + + case BRIG_SEGMENT_ARG: + o_type = Enums::OT_ARG; + break; + + default: + panic("St: segment %d not supported\n", segment); + } + + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + addr.init(op_offs, obj); + + op_offs = obj->getOperandPtr(ib->operands, 1); + src.init(op_offs, obj); + } + + StInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : HsailGPUStaticInst(obj, _opcode) + { + using namespace Brig; + + if (ib->opcode == BRIG_OPCODE_ST) { + initSt(ib, obj, _opcode); + } else { + initAtomicSt(ib, obj, _opcode); + } + } + + int numDstRegOperands() { return 0; } + int numSrcRegOperands() + { + return src.isVectorRegister() + this->addr.isVectorRegister(); + } + int getNumOperands() + { + if (this->addr.isVectorRegister() || this->addr.isScalarRegister()) + return 2; + else + return 1; + } + bool isVectorRegister(int operandIndex) + { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return !operandIndex ? src.isVectorRegister() : + this->addr.isVectorRegister(); + } + bool isCondRegister(int operandIndex) + { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return !operandIndex ? src.isCondRegister() : + this->addr.isCondRegister(); + } + bool isScalarRegister(int operandIndex) + { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return !operandIndex ? 
src.isScalarRegister() : + this->addr.isScalarRegister(); + } + bool isSrcOperand(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return true; + } + bool isDstOperand(int operandIndex) { return false; } + int getOperandSize(int operandIndex) + { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return !operandIndex ? src.opSize() : this->addr.opSize(); + } + int getRegisterIndex(int operandIndex) + { + assert(operandIndex >= 0 && operandIndex < getNumOperands()); + return !operandIndex ? src.regIndex() : this->addr.regIndex(); + } + }; + + + template<typename MemDataType, typename SrcDataType, + typename AddrOperandType> + class StInst : + public StInstBase<MemDataType, typename SrcDataType::OperandType, + AddrOperandType>, + public MemInst + { + public: + typename SrcDataType::OperandType::SrcOperand src_vect[4]; + uint16_t num_src_operands; + void generateDisassembly(); + + StInst(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode, int srcIdx) + : StInstBase<MemDataType, typename SrcDataType::OperandType, + AddrOperandType>(ib, obj, _opcode), + MemInst(SrcDataType::memType) + { + init_addr(&this->addr); + + BrigRegOperandInfo rinfo; + unsigned op_offs = obj->getOperandPtr(ib->operands,srcIdx); + const Brig::BrigOperand *baseOp = obj->getOperand(op_offs); + + if (baseOp->kind == Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES) { + const Brig::BrigOperandConstantBytes *op = + (Brig::BrigOperandConstantBytes*)baseOp; + + rinfo = BrigRegOperandInfo((Brig::BrigKind16_t)op->base.kind, + Brig::BRIG_TYPE_NONE); + } else { + rinfo = findRegDataType(op_offs, obj); + } + + if (baseOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) { + const Brig::BrigOperandOperandList *brigRegVecOp = + (const Brig::BrigOperandOperandList*)baseOp; + + num_src_operands = + *((unsigned*)obj->getData(brigRegVecOp->elements)) / 4; + + assert(num_src_operands <= 4); + } else { + num_src_operands = 1; + } + + if (num_src_operands > 1) { + assert(baseOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST); + + for (int i = 0; i < num_src_operands; ++i) { + src_vect[i].init_from_vect(op_offs, obj, i); + } + } + } + + void + initiateAcc(GPUDynInstPtr gpuDynInst) override + { + // before performing a store, check if this store has + // release semantics, and if so issue a release first + if (!isLocalMem()) { + if (gpuDynInst->computeUnit()->shader->separate_acquire_release + && gpuDynInst->memoryOrder == + Enums::MEMORY_ORDER_SC_RELEASE) { + + gpuDynInst->statusBitVector = VectorMask(1); + gpuDynInst->execContinuation = &GPUStaticInst::execSt; + gpuDynInst->useContinuation = true; + // create request + Request *req = new Request(0, 0, 0, 0, + gpuDynInst->computeUnit()->masterId(), + 0, gpuDynInst->wfDynId, -1); + req->setFlags(Request::RELEASE); + gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req); + + return; + } + } + + // if there is no release semantic, perform stores immediately + execSt(gpuDynInst); + } + + bool + isLocalMem() const override + { + return this->segment == Brig::BRIG_SEGMENT_GROUP; + } + + private: + // execSt may be called through a continuation + // if the store had release semantics. 
see comment for + // execSt in gpu_static_inst.hh + void + execSt(GPUDynInstPtr gpuDynInst) override + { + typedef typename MemDataType::CType c0; + + gpuDynInst->statusBitVector = gpuDynInst->exec_mask; + + if (num_src_operands > 1) { + for (int i = 0; i < VSZ; ++i) + if (gpuDynInst->exec_mask[i]) + gpuDynInst->statusVector.push_back(num_src_operands); + else + gpuDynInst->statusVector.push_back(0); + } + + for (int k = 0; k < num_src_operands; ++k) { + c0 *d = &((c0*)gpuDynInst->d_data)[k * VSZ]; + + for (int i = 0; i < VSZ; ++i) { + if (gpuDynInst->exec_mask[i]) { + Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0); + + if (isLocalMem()) { + //store to shared memory + gpuDynInst->wavefront()->ldsChunk->write<c0>(vaddr, + *d); + } else { + Request *req = + new Request(0, vaddr, sizeof(c0), 0, + gpuDynInst->computeUnit()->masterId(), + 0, gpuDynInst->wfDynId, i); + + gpuDynInst->setRequestFlags(req); + PacketPtr pkt = new Packet(req, MemCmd::WriteReq); + pkt->dataStatic<c0>(d); + + // translation is performed in sendRequest() + // the request will be finished when the store completes + gpuDynInst->useContinuation = false; + gpuDynInst->computeUnit()->sendRequest(gpuDynInst, + i, pkt); + + } + } + ++d; + } + } + + gpuDynInst->updateStats(); + } + + public: + bool isVectorRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex == num_src_operands) + return this->addr.isVectorRegister(); + if (num_src_operands > 1) + return src_vect[operandIndex].isVectorRegister(); + else if (num_src_operands == 1) + return StInstBase<MemDataType, + typename SrcDataType::OperandType, + AddrOperandType>::src.isVectorRegister(); + return false; + } + bool isCondRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex == num_src_operands) + return this->addr.isCondRegister(); + if (num_src_operands > 1) + return src_vect[operandIndex].isCondRegister(); + else if (num_src_operands == 1) + return StInstBase<MemDataType, + typename SrcDataType::OperandType, + AddrOperandType>::src.isCondRegister(); + return false; + } + bool isScalarRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex == num_src_operands) + return this->addr.isScalarRegister(); + if (num_src_operands > 1) + return src_vect[operandIndex].isScalarRegister(); + else if (num_src_operands == 1) + return StInstBase<MemDataType, + typename SrcDataType::OperandType, + AddrOperandType>::src.isScalarRegister(); + return false; + } + bool isSrcOperand(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + return true; + } + bool isDstOperand(int operandIndex) { return false; } + int getOperandSize(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex == num_src_operands) + return this->addr.opSize(); + if (num_src_operands > 1) + return src_vect[operandIndex].opSize(); + else if (num_src_operands == 1) + return StInstBase<MemDataType, + typename SrcDataType::OperandType, + AddrOperandType>::src.opSize(); + return 0; + } + int getRegisterIndex(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex == num_src_operands) + return this->addr.regIndex(); + if (num_src_operands > 1) + return src_vect[operandIndex].regIndex(); + else if (num_src_operands == 1) + return StInstBase<MemDataType, + typename SrcDataType::OperandType, + 
AddrOperandType>::src.regIndex(); + return -1; + } + int getNumOperands() + { + if (this->addr.isVectorRegister() || this->addr.isScalarRegister()) + return num_src_operands + 1; + else + return num_src_operands; + } + void execute(GPUDynInstPtr gpuDynInst); + }; + + template<typename DataType, typename SrcDataType> + GPUStaticInst* + decodeSt(const Brig::BrigInstBase *ib, const BrigObject *obj) + { + int srcIdx = 0; + int destIdx = 1; + if (ib->opcode == Brig::BRIG_OPCODE_ATOMIC || + ib->opcode == Brig::BRIG_OPCODE_ATOMICNORET) { + srcIdx = 1; + destIdx = 0; + } + unsigned op_offs = obj->getOperandPtr(ib->operands,destIdx); + + BrigRegOperandInfo tmp = findRegDataType(op_offs, obj); + + if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) { + return new StInst<DataType, SrcDataType, + NoRegAddrOperand>(ib, obj, "st", srcIdx); + } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER) { + // V2/V4 not allowed + switch (tmp.regKind) { + case Brig::BRIG_REGISTER_KIND_SINGLE: + return new StInst<DataType, SrcDataType, + SRegAddrOperand>(ib, obj, "st", srcIdx); + case Brig::BRIG_REGISTER_KIND_DOUBLE: + return new StInst<DataType, SrcDataType, + DRegAddrOperand>(ib, obj, "st", srcIdx); + default: + fatal("Bad st register operand type %d\n", tmp.type); + } + } else { + fatal("Bad st register operand kind %d\n", tmp.kind); + } + } + + Enums::MemOpType brigAtomicToMemOpType(Brig::BrigOpcode brigOpCode, + Brig::BrigAtomicOperation brigOp); + + template<typename OperandType, typename AddrOperandType, int NumSrcOperands, + bool HasDst> + class AtomicInstBase : public HsailGPUStaticInst + { + public: + typename OperandType::DestOperand dest; + typename OperandType::SrcOperand src[NumSrcOperands]; + AddrOperandType addr; + + Brig::BrigSegment segment; + Brig::BrigMemoryOrder memoryOrder; + Brig::BrigAtomicOperation atomicOperation; + Brig::BrigMemoryScope memoryScope; + Brig::BrigOpcode opcode; + Enums::MemOpType opType; + + AtomicInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : HsailGPUStaticInst(obj, _opcode) + { + using namespace Brig; + + const BrigInstAtomic *at = (const BrigInstAtomic*)ib; + + segment = (BrigSegment)at->segment; + memoryScope = (BrigMemoryScope)at->memoryScope; + memoryOrder = (BrigMemoryOrder)at->memoryOrder; + atomicOperation = (BrigAtomicOperation)at->atomicOperation; + opcode = (BrigOpcode)ib->opcode; + opType = brigAtomicToMemOpType(opcode, atomicOperation); + + switch (segment) { + case BRIG_SEGMENT_GLOBAL: + o_type = Enums::OT_GLOBAL_ATOMIC; + break; + + case BRIG_SEGMENT_GROUP: + o_type = Enums::OT_SHARED_ATOMIC; + break; + + case BRIG_SEGMENT_FLAT: + o_type = Enums::OT_FLAT_ATOMIC; + break; + + default: + panic("Atomic: segment %d not supported\n", segment); + } + + if (HasDst) { + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + dest.init(op_offs, obj); + + op_offs = obj->getOperandPtr(ib->operands, 1); + addr.init(op_offs, obj); + + for (int i = 0; i < NumSrcOperands; ++i) { + op_offs = obj->getOperandPtr(ib->operands, i + 2); + src[i].init(op_offs, obj); + } + } else { + + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + addr.init(op_offs, obj); + + for (int i = 0; i < NumSrcOperands; ++i) { + op_offs = obj->getOperandPtr(ib->operands, i + 1); + src[i].init(op_offs, obj); + } + } + } + + int numSrcRegOperands() + { + int operands = 0; + for (int i = 0; i < NumSrcOperands; i++) { + if (src[i].isVectorRegister() == true) { + operands++; + } + } + if (addr.isVectorRegister()) + operands++; + return operands; + } + int 
numDstRegOperands() { return dest.isVectorRegister(); } + int getNumOperands() + { + if (addr.isVectorRegister()) + return(NumSrcOperands + 2); + return(NumSrcOperands + 1); + } + bool isVectorRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < NumSrcOperands) + return src[operandIndex].isVectorRegister(); + else if (operandIndex == NumSrcOperands) + return(addr.isVectorRegister()); + else + return dest.isVectorRegister(); + } + bool isCondRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < NumSrcOperands) + return src[operandIndex].isCondRegister(); + else if (operandIndex == NumSrcOperands) + return(addr.isCondRegister()); + else + return dest.isCondRegister(); + } + bool isScalarRegister(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < NumSrcOperands) + return src[operandIndex].isScalarRegister(); + else if (operandIndex == NumSrcOperands) + return(addr.isScalarRegister()); + else + return dest.isScalarRegister(); + } + bool isSrcOperand(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < NumSrcOperands) + return true; + else if (operandIndex == NumSrcOperands) + return(addr.isVectorRegister()); + else + return false; + } + bool isDstOperand(int operandIndex) + { + if (operandIndex <= NumSrcOperands) + return false; + else + return true; + } + int getOperandSize(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < NumSrcOperands) + return(src[operandIndex].opSize()); + else if (operandIndex == NumSrcOperands) + return(addr.opSize()); + else + return(dest.opSize()); + } + int getRegisterIndex(int operandIndex) + { + assert((operandIndex >= 0) && (operandIndex < getNumOperands())); + if (operandIndex < NumSrcOperands) + return(src[operandIndex].regIndex()); + else if (operandIndex == NumSrcOperands) + return(addr.regIndex()); + else + return(dest.regIndex()); + return -1; + } + }; + + template<typename MemDataType, typename AddrOperandType, int NumSrcOperands, + bool HasDst> + class AtomicInst : + public AtomicInstBase<typename MemDataType::OperandType, + AddrOperandType, NumSrcOperands, HasDst>, + public MemInst + { + public: + void generateDisassembly(); + + AtomicInst(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : AtomicInstBase<typename MemDataType::OperandType, AddrOperandType, + NumSrcOperands, HasDst> + (ib, obj, _opcode), + MemInst(MemDataType::memType) + { + init_addr(&this->addr); + } + + void + initiateAcc(GPUDynInstPtr gpuDynInst) override + { + // before doing the RMW, check if this atomic has + // release semantics, and if so issue a release first + if (!isLocalMem()) { + if (gpuDynInst->computeUnit()->shader->separate_acquire_release + && (gpuDynInst->memoryOrder == + Enums::MEMORY_ORDER_SC_RELEASE || gpuDynInst->memoryOrder == + Enums::MEMORY_ORDER_SC_ACQUIRE_RELEASE)) { + + gpuDynInst->statusBitVector = VectorMask(1); + + gpuDynInst->execContinuation = &GPUStaticInst::execAtomic; + gpuDynInst->useContinuation = true; + + // create request + Request *req = new Request(0, 0, 0, 0, + gpuDynInst->computeUnit()->masterId(), + 0, gpuDynInst->wfDynId, -1); + req->setFlags(Request::RELEASE); + gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req); + + return; + } + } + + // if there is no release semantic, execute the 
RMW immediately + execAtomic(gpuDynInst); + + } + + void execute(GPUDynInstPtr gpuDynInst); + + bool + isLocalMem() const override + { + return this->segment == Brig::BRIG_SEGMENT_GROUP; + } + + private: + // execAtomic may be called through a continuation + // if the RMW had release semantics. see comment for + // execContinuation in gpu_dyn_inst.hh + void + execAtomic(GPUDynInstPtr gpuDynInst) override + { + gpuDynInst->statusBitVector = gpuDynInst->exec_mask; + + typedef typename MemDataType::CType c0; + + c0 *d = &((c0*) gpuDynInst->d_data)[0]; + c0 *e = &((c0*) gpuDynInst->a_data)[0]; + c0 *f = &((c0*) gpuDynInst->x_data)[0]; + + for (int i = 0; i < VSZ; ++i) { + if (gpuDynInst->exec_mask[i]) { + Addr vaddr = gpuDynInst->addr[i]; + + if (isLocalMem()) { + Wavefront *wavefront = gpuDynInst->wavefront(); + *d = wavefront->ldsChunk->read<c0>(vaddr); + + switch (this->opType) { + case Enums::MO_AADD: + case Enums::MO_ANRADD: + wavefront->ldsChunk->write<c0>(vaddr, + wavefront->ldsChunk->read<c0>(vaddr) + (*e)); + break; + case Enums::MO_ASUB: + case Enums::MO_ANRSUB: + wavefront->ldsChunk->write<c0>(vaddr, + wavefront->ldsChunk->read<c0>(vaddr) - (*e)); + break; + case Enums::MO_AMAX: + case Enums::MO_ANRMAX: + wavefront->ldsChunk->write<c0>(vaddr, + std::max(wavefront->ldsChunk->read<c0>(vaddr), + (*e))); + break; + case Enums::MO_AMIN: + case Enums::MO_ANRMIN: + wavefront->ldsChunk->write<c0>(vaddr, + std::min(wavefront->ldsChunk->read<c0>(vaddr), + (*e))); + break; + case Enums::MO_AAND: + case Enums::MO_ANRAND: + wavefront->ldsChunk->write<c0>(vaddr, + wavefront->ldsChunk->read<c0>(vaddr) & (*e)); + break; + case Enums::MO_AOR: + case Enums::MO_ANROR: + wavefront->ldsChunk->write<c0>(vaddr, + wavefront->ldsChunk->read<c0>(vaddr) | (*e)); + break; + case Enums::MO_AXOR: + case Enums::MO_ANRXOR: + wavefront->ldsChunk->write<c0>(vaddr, + wavefront->ldsChunk->read<c0>(vaddr) ^ (*e)); + break; + case Enums::MO_AINC: + case Enums::MO_ANRINC: + wavefront->ldsChunk->write<c0>(vaddr, + wavefront->ldsChunk->read<c0>(vaddr) + 1); + break; + case Enums::MO_ADEC: + case Enums::MO_ANRDEC: + wavefront->ldsChunk->write<c0>(vaddr, + wavefront->ldsChunk->read<c0>(vaddr) - 1); + break; + case Enums::MO_AEXCH: + case Enums::MO_ANREXCH: + wavefront->ldsChunk->write<c0>(vaddr, (*e)); + break; + case Enums::MO_ACAS: + case Enums::MO_ANRCAS: + wavefront->ldsChunk->write<c0>(vaddr, + (wavefront->ldsChunk->read<c0>(vaddr) == (*e)) ? 
+ (*f) : wavefront->ldsChunk->read<c0>(vaddr)); + break; + default: + fatal("Unrecognized or invalid HSAIL atomic op " + "type.\n"); + break; + } + } else { + Request *req = + new Request(0, vaddr, sizeof(c0), 0, + gpuDynInst->computeUnit()->masterId(), + 0, gpuDynInst->wfDynId, i, + gpuDynInst->makeAtomicOpFunctor<c0>(e, + f, this->opType)); + + gpuDynInst->setRequestFlags(req); + PacketPtr pkt = new Packet(req, MemCmd::SwapReq); + pkt->dataStatic(d); + + if (gpuDynInst->computeUnit()->shader-> + separate_acquire_release && + (gpuDynInst->memoryOrder == + Enums::MEMORY_ORDER_SC_ACQUIRE)) { + // if this atomic has acquire semantics, + // schedule the continuation to perform an + // acquire after the RMW completes + gpuDynInst->execContinuation = + &GPUStaticInst::execAtomicAcq; + + gpuDynInst->useContinuation = true; + } else { + // the request will be finished when the RMW completes + gpuDynInst->useContinuation = false; + } + // translation is performed in sendRequest() + gpuDynInst->computeUnit()->sendRequest(gpuDynInst, i, + pkt); + } + } + + ++d; + ++e; + ++f; + } + + gpuDynInst->updateStats(); + } + + // execAtomicACq will always be called through a continuation. + // see comment for execContinuation in gpu_dyn_inst.hh + void + execAtomicAcq(GPUDynInstPtr gpuDynInst) override + { + // after performing the RMW, check to see if this instruction + // has acquire semantics, and if so, issue an acquire + if (!isLocalMem()) { + if (gpuDynInst->computeUnit()->shader->separate_acquire_release + && gpuDynInst->memoryOrder == + Enums::MEMORY_ORDER_SC_ACQUIRE) { + gpuDynInst->statusBitVector = VectorMask(1); + + // the request will be finished when + // the acquire completes + gpuDynInst->useContinuation = false; + // create request + Request *req = new Request(0, 0, 0, 0, + gpuDynInst->computeUnit()->masterId(), + 0, gpuDynInst->wfDynId, -1); + req->setFlags(Request::ACQUIRE); + gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req); + } + } + } + }; + + template<typename DataType, typename AddrOperandType, int NumSrcOperands> + GPUStaticInst* + constructAtomic(const Brig::BrigInstBase *ib, const BrigObject *obj) + { + const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib; + + if (at->atomicOperation == Brig::BRIG_ATOMIC_LD) { + return decodeLd<DataType>(ib, obj); + } else if (at->atomicOperation == Brig::BRIG_ATOMIC_ST) { + switch (ib->type) { + case Brig::BRIG_TYPE_B8: + return decodeSt<S8,S8>(ib, obj); + case Brig::BRIG_TYPE_B16: + return decodeSt<S8,S16>(ib, obj); + case Brig::BRIG_TYPE_B32: + return decodeSt<S8,S32>(ib, obj); + case Brig::BRIG_TYPE_B64: + return decodeSt<S8,S64>(ib, obj); + default: fatal("AtomicSt: Operand type mismatch %d\n", ib->type); + } + } else { + if ((Brig::BrigOpcode)ib->opcode == Brig::BRIG_OPCODE_ATOMICNORET) + return new AtomicInst<DataType, AddrOperandType, + NumSrcOperands, false>(ib, obj, "atomicnoret"); + else + return new AtomicInst<DataType, AddrOperandType, + NumSrcOperands, true>(ib, obj, "atomic"); + } + } + + template<typename DataType, int NumSrcOperands> + GPUStaticInst* + decodeAtomicHelper(const Brig::BrigInstBase *ib, const BrigObject *obj) + { + unsigned addrIndex = (Brig::BrigOpcode)ib->opcode == + Brig::BRIG_OPCODE_ATOMICNORET ? 
0 : 1; + + unsigned op_offs = obj->getOperandPtr(ib->operands,addrIndex); + + BrigRegOperandInfo tmp = findRegDataType(op_offs, obj); + + if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) { + return constructAtomic<DataType, NoRegAddrOperand, + NumSrcOperands>(ib, obj); + } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER) { + // V2/V4 not allowed + switch (tmp.regKind) { + case Brig::BRIG_REGISTER_KIND_SINGLE: + return constructAtomic<DataType, SRegAddrOperand, + NumSrcOperands>(ib, obj); + case Brig::BRIG_REGISTER_KIND_DOUBLE: + return constructAtomic<DataType, DRegAddrOperand, + NumSrcOperands>(ib, obj); + default: + fatal("Bad atomic register operand type %d\n", tmp.type); + } + } else { + fatal("Bad atomic register operand kind %d\n", tmp.kind); + } + } + + + template<typename DataType> + GPUStaticInst* + decodeAtomic(const Brig::BrigInstBase *ib, const BrigObject *obj) + { + const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib; + + if (at->atomicOperation == Brig::BRIG_ATOMIC_CAS) { + return decodeAtomicHelper<DataType, 2>(ib, obj); + } else { + return decodeAtomicHelper<DataType, 1>(ib, obj); + } + } + + template<typename DataType> + GPUStaticInst* + decodeAtomicNoRet(const Brig::BrigInstBase *ib, const BrigObject *obj) + { + const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib; + if (at->atomicOperation == Brig::BRIG_ATOMIC_CAS) { + return decodeAtomicHelper<DataType, 2>(ib, obj); + } else { + return decodeAtomicHelper<DataType, 1>(ib, obj); + } + } +} // namespace HsailISA + +#endif // __ARCH_HSAIL_INSTS_MEM_HH__ diff --git a/src/arch/hsail/insts/mem_impl.hh b/src/arch/hsail/insts/mem_impl.hh new file mode 100644 index 000000000..94f0cd6aa --- /dev/null +++ b/src/arch/hsail/insts/mem_impl.hh @@ -0,0 +1,660 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Steve Reinhardt + */ + +#include "arch/hsail/generic_types.hh" +#include "gpu-compute/hsail_code.hh" + +// defined in code.cc, but not worth sucking in all of code.h for this +// at this point +extern const char *segmentNames[]; + +namespace HsailISA +{ + template<typename DestDataType, typename AddrRegOperandType> + void + LdaInst<DestDataType, AddrRegOperandType>::generateDisassembly() + { + this->disassembly = csprintf("%s_%s %s,%s", this->opcode, + DestDataType::label, + this->dest.disassemble(), + this->addr.disassemble()); + } + + template<typename DestDataType, typename AddrRegOperandType> + void + LdaInst<DestDataType, AddrRegOperandType>::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *w = gpuDynInst->wavefront(); + + typedef typename DestDataType::CType CType M5_VAR_USED; + const VectorMask &mask = w->get_pred(); + uint64_t addr_vec[VSZ]; + this->addr.calcVector(w, addr_vec); + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + this->dest.set(w, lane, addr_vec[lane]); + } + } + } + + template<typename MemDataType, typename DestDataType, + typename AddrRegOperandType> + void + LdInst<MemDataType, DestDataType, AddrRegOperandType>::generateDisassembly() + { + switch (num_dest_operands) { + case 1: + this->disassembly = csprintf("%s_%s_%s %s,%s", this->opcode, + segmentNames[this->segment], + MemDataType::label, + this->dest.disassemble(), + this->addr.disassemble()); + break; + case 2: + this->disassembly = csprintf("%s_%s_%s (%s,%s), %s", this->opcode, + segmentNames[this->segment], + MemDataType::label, + this->dest_vect[0].disassemble(), + this->dest_vect[1].disassemble(), + this->addr.disassemble()); + break; + case 4: + this->disassembly = csprintf("%s_%s_%s (%s,%s,%s,%s), %s", + this->opcode, + segmentNames[this->segment], + MemDataType::label, + this->dest_vect[0].disassemble(), + this->dest_vect[1].disassemble(), + this->dest_vect[2].disassemble(), + this->dest_vect[3].disassemble(), + this->addr.disassemble()); + break; + default: + fatal("Bad ld register dest operand, num vector operands: %d \n", + num_dest_operands); + break; + } + } + + static Addr + calcPrivAddr(Addr addr, Wavefront *w, int lane, GPUStaticInst *i) + { + // what is the size of the object we are accessing?? + // NOTE: the compiler doesn't generate enough information + // to do this yet..have to just line up all the private + // work-item spaces back to back for now + /* + StorageElement* se = + i->parent->findSymbol(Brig::BrigPrivateSpace, addr); + assert(se); + + return w->wfSlotId * w->privSizePerItem * VSZ + + se->offset * VSZ + + lane * se->size; + */ + + // addressing strategy: interleave the private spaces of + // work-items in a wave-front on 8 byte granularity. + // this won't be perfect coalescing like the spill space + // strategy, but it's better than nothing. The spill space + // strategy won't work with private because the same address + // may be accessed by different sized loads/stores. + + // Note: I'm assuming that the largest load/store to private + // is 8 bytes. 
If it is larger, the stride will have to increase + + Addr addr_div8 = addr / 8; + Addr addr_mod8 = addr % 8; + + Addr ret = addr_div8 * 8 * VSZ + lane * 8 + addr_mod8 + w->privBase; + + assert(ret < w->privBase + (w->privSizePerItem * VSZ)); + + return ret; + } + + template<typename MemDataType, typename DestDataType, + typename AddrRegOperandType> + void + LdInst<MemDataType, DestDataType, + AddrRegOperandType>::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *w = gpuDynInst->wavefront(); + + typedef typename MemDataType::CType MemCType; + const VectorMask &mask = w->get_pred(); + + // Kernarg references are handled uniquely for now (no Memory Request + // is used), so special-case them up front. Someday we should + // make this more realistic, at which we should get rid of this + // block and fold this case into the switch below. + if (this->segment == Brig::BRIG_SEGMENT_KERNARG) { + MemCType val; + + // I assume no vector ld for kernargs + assert(num_dest_operands == 1); + + // assuming for the moment that we'll never do register + // offsets into kernarg space... just to make life simpler + uint64_t address = this->addr.calcUniform(); + + val = *(MemCType*)&w->kernelArgs[address]; + + DPRINTF(HSAIL, "ld_kernarg [%d] -> %d\n", address, val); + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + this->dest.set(w, lane, val); + } + } + + return; + } else if (this->segment == Brig::BRIG_SEGMENT_ARG) { + uint64_t address = this->addr.calcUniform(); + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + MemCType val = w->readCallArgMem<MemCType>(lane, address); + + DPRINTF(HSAIL, "ld_arg [%d] -> %llu\n", address, + (unsigned long long)val); + + this->dest.set(w, lane, val); + } + } + + return; + } + + GPUDynInstPtr m = gpuDynInst; + + this->addr.calcVector(w, m->addr); + + m->m_op = Enums::MO_LD; + m->m_type = MemDataType::memType; + m->v_type = DestDataType::vgprType; + + m->exec_mask = w->execMask(); + m->statusBitVector = 0; + m->equiv = this->equivClass; + m->memoryOrder = getGenericMemoryOrder(this->memoryOrder); + + m->scope = getGenericMemoryScope(this->memoryScope); + + if (num_dest_operands == 1) { + m->dst_reg = this->dest.regIndex(); + m->n_reg = 1; + } else { + m->n_reg = num_dest_operands; + for (int i = 0; i < num_dest_operands; ++i) { + m->dst_reg_vec[i] = this->dest_vect[i].regIndex(); + } + } + + m->simdId = w->simdId; + m->wfSlotId = w->wfSlotId; + m->wfDynId = w->wfDynId; + m->kern_id = w->kern_id; + m->cu_id = w->computeUnit->cu_id; + m->latency.init(&w->computeUnit->shader->tick_cnt); + + switch (this->segment) { + case Brig::BRIG_SEGMENT_GLOBAL: + m->s_type = SEG_GLOBAL; + m->pipeId = GLBMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(1)); + + // this is a complete hack to get around a compiler bug + // (the compiler currently generates global access for private + // addresses (starting from 0). We need to add the private offset) + for (int lane = 0; lane < VSZ; ++lane) { + if (m->addr[lane] < w->privSizePerItem) { + if (mask[lane]) { + // what is the size of the object we are accessing? 
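+                        // (calcPrivAddr() interleaves the per-lane private
+                        // spaces at 8-byte granularity, so e.g. a hypothetical
+                        // byte offset 0x14 in lane 3 maps to
+                        // (0x14/8)*8*VSZ + 3*8 + (0x14%8) + privBase)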
+ // find base for for this wavefront + + // calcPrivAddr will fail if accesses are unaligned + assert(!((sizeof(MemCType) - 1) & m->addr[lane])); + + Addr privAddr = calcPrivAddr(m->addr[lane], w, lane, + this); + + m->addr[lane] = privAddr; + } + } + } + + w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->outstanding_reqs_rd_gm++; + w->rd_gm_reqs_in_pipe--; + break; + + case Brig::BRIG_SEGMENT_SPILL: + assert(num_dest_operands == 1); + m->s_type = SEG_SPILL; + m->pipeId = GLBMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(1)); + { + for (int lane = 0; lane < VSZ; ++lane) { + // note: this calculation will NOT WORK if the compiler + // ever generates loads/stores to the same address with + // different widths (e.g., a ld_u32 addr and a ld_u16 addr) + if (mask[lane]) { + assert(m->addr[lane] < w->spillSizePerItem); + + m->addr[lane] = m->addr[lane] * w->spillWidth + + lane * sizeof(MemCType) + w->spillBase; + + w->last_addr[lane] = m->addr[lane]; + } + } + } + + w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->outstanding_reqs_rd_gm++; + w->rd_gm_reqs_in_pipe--; + break; + + case Brig::BRIG_SEGMENT_GROUP: + m->s_type = SEG_SHARED; + m->pipeId = LDSMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(24)); + w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m); + w->outstanding_reqs_rd_lm++; + w->rd_lm_reqs_in_pipe--; + break; + + case Brig::BRIG_SEGMENT_READONLY: + m->s_type = SEG_READONLY; + m->pipeId = GLBMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(1)); + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + assert(m->addr[lane] + sizeof(MemCType) <= w->roSize); + m->addr[lane] += w->roBase; + } + } + + w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->outstanding_reqs_rd_gm++; + w->rd_gm_reqs_in_pipe--; + break; + + case Brig::BRIG_SEGMENT_PRIVATE: + m->s_type = SEG_PRIVATE; + m->pipeId = GLBMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(1)); + { + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + assert(m->addr[lane] < w->privSizePerItem); + + m->addr[lane] = m->addr[lane] + + lane * sizeof(MemCType) + w->privBase; + } + } + } + w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->outstanding_reqs_rd_gm++; + w->rd_gm_reqs_in_pipe--; + break; + + default: + fatal("Load to unsupported segment %d %llxe\n", this->segment, + m->addr[0]); + } + + w->outstanding_reqs++; + w->mem_reqs_in_pipe--; + } + + template<typename OperationType, typename SrcDataType, + typename AddrRegOperandType> + void + StInst<OperationType, SrcDataType, + AddrRegOperandType>::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *w = gpuDynInst->wavefront(); + + typedef typename OperationType::CType CType; + + const VectorMask &mask = w->get_pred(); + + // arg references are handled uniquely for now (no Memory Request + // is used), so special-case them up front. Someday we should + // make this more realistic, at which we should get rid of this + // block and fold this case into the switch below. 
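+        // the ARG fast path below writes each active lane's value directly
+        // into the call-argument buffer via writeCallArgMem() and returns
+        // without entering the memory pipelines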
+ if (this->segment == Brig::BRIG_SEGMENT_ARG) { + uint64_t address = this->addr.calcUniform(); + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + CType data = this->src.template get<CType>(w, lane); + DPRINTF(HSAIL, "st_arg [%d] <- %d\n", address, data); + w->writeCallArgMem<CType>(lane, address, data); + } + } + + return; + } + + GPUDynInstPtr m = gpuDynInst; + + m->exec_mask = w->execMask(); + + this->addr.calcVector(w, m->addr); + + if (num_src_operands == 1) { + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + ((CType*)m->d_data)[lane] = + this->src.template get<CType>(w, lane); + } + } + } else { + for (int k= 0; k < num_src_operands; ++k) { + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + ((CType*)m->d_data)[k * VSZ + lane] = + this->src_vect[k].template get<CType>(w, lane); + } + } + } + } + + m->m_op = Enums::MO_ST; + m->m_type = OperationType::memType; + m->v_type = OperationType::vgprType; + + m->statusBitVector = 0; + m->equiv = this->equivClass; + + if (num_src_operands == 1) { + m->n_reg = 1; + } else { + m->n_reg = num_src_operands; + } + + m->memoryOrder = getGenericMemoryOrder(this->memoryOrder); + + m->scope = getGenericMemoryScope(this->memoryScope); + + m->simdId = w->simdId; + m->wfSlotId = w->wfSlotId; + m->wfDynId = w->wfDynId; + m->kern_id = w->kern_id; + m->cu_id = w->computeUnit->cu_id; + m->latency.init(&w->computeUnit->shader->tick_cnt); + + switch (this->segment) { + case Brig::BRIG_SEGMENT_GLOBAL: + m->s_type = SEG_GLOBAL; + m->pipeId = GLBMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(1)); + + // this is a complete hack to get around a compiler bug + // (the compiler currently generates global access for private + // addresses (starting from 0). We need to add the private offset) + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + if (m->addr[lane] < w->privSizePerItem) { + + // calcPrivAddr will fail if accesses are unaligned + assert(!((sizeof(CType)-1) & m->addr[lane])); + + Addr privAddr = calcPrivAddr(m->addr[lane], w, lane, + this); + + m->addr[lane] = privAddr; + } + } + } + + w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->outstanding_reqs_wr_gm++; + w->wr_gm_reqs_in_pipe--; + break; + + case Brig::BRIG_SEGMENT_SPILL: + assert(num_src_operands == 1); + m->s_type = SEG_SPILL; + m->pipeId = GLBMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(1)); + { + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + assert(m->addr[lane] < w->spillSizePerItem); + + m->addr[lane] = m->addr[lane] * w->spillWidth + + lane * sizeof(CType) + w->spillBase; + } + } + } + + w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->outstanding_reqs_wr_gm++; + w->wr_gm_reqs_in_pipe--; + break; + + case Brig::BRIG_SEGMENT_GROUP: + m->s_type = SEG_SHARED; + m->pipeId = LDSMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(24)); + w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m); + w->outstanding_reqs_wr_lm++; + w->wr_lm_reqs_in_pipe--; + break; + + case Brig::BRIG_SEGMENT_PRIVATE: + m->s_type = SEG_PRIVATE; + m->pipeId = GLBMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(1)); + { + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + assert(m->addr[lane] < w->privSizePerItem); + m->addr[lane] = m->addr[lane] + lane * + sizeof(CType)+w->privBase; + } + } + } + + w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->outstanding_reqs_wr_gm++; + w->wr_gm_reqs_in_pipe--; + break; + + default: + fatal("Store to unsupported segment %d\n", 
this->segment); + } + + w->outstanding_reqs++; + w->mem_reqs_in_pipe--; + } + + template<typename OperationType, typename SrcDataType, + typename AddrRegOperandType> + void + StInst<OperationType, SrcDataType, + AddrRegOperandType>::generateDisassembly() + { + switch (num_src_operands) { + case 1: + this->disassembly = csprintf("%s_%s_%s %s,%s", this->opcode, + segmentNames[this->segment], + OperationType::label, + this->src.disassemble(), + this->addr.disassemble()); + break; + case 2: + this->disassembly = csprintf("%s_%s_%s (%s,%s), %s", this->opcode, + segmentNames[this->segment], + OperationType::label, + this->src_vect[0].disassemble(), + this->src_vect[1].disassemble(), + this->addr.disassemble()); + break; + case 4: + this->disassembly = csprintf("%s_%s_%s (%s,%s,%s,%s), %s", + this->opcode, + segmentNames[this->segment], + OperationType::label, + this->src_vect[0].disassemble(), + this->src_vect[1].disassemble(), + this->src_vect[2].disassemble(), + this->src_vect[3].disassemble(), + this->addr.disassemble()); + break; + default: fatal("Bad ld register src operand, num vector operands: " + "%d \n", num_src_operands); + break; + } + } + + template<typename DataType, typename AddrRegOperandType, int NumSrcOperands, + bool HasDst> + void + AtomicInst<DataType, AddrRegOperandType, NumSrcOperands, + HasDst>::execute(GPUDynInstPtr gpuDynInst) + { + typedef typename DataType::CType CType; + + Wavefront *w = gpuDynInst->wavefront(); + + GPUDynInstPtr m = gpuDynInst; + + this->addr.calcVector(w, m->addr); + + for (int lane = 0; lane < VSZ; ++lane) { + ((CType *)m->a_data)[lane] = + this->src[0].template get<CType>(w, lane); + } + + // load second source operand for CAS + if (NumSrcOperands > 1) { + for (int lane = 0; lane < VSZ; ++lane) { + ((CType*)m->x_data)[lane] = + this->src[1].template get<CType>(w, lane); + } + } + + assert(NumSrcOperands <= 2); + + m->m_op = this->opType; + m->m_type = DataType::memType; + m->v_type = DataType::vgprType; + + m->exec_mask = w->execMask(); + m->statusBitVector = 0; + m->equiv = 0; // atomics don't have an equivalence class operand + m->n_reg = 1; + m->memoryOrder = getGenericMemoryOrder(this->memoryOrder); + + m->scope = getGenericMemoryScope(this->memoryScope); + + if (HasDst) { + m->dst_reg = this->dest.regIndex(); + } + + m->simdId = w->simdId; + m->wfSlotId = w->wfSlotId; + m->wfDynId = w->wfDynId; + m->kern_id = w->kern_id; + m->cu_id = w->computeUnit->cu_id; + m->latency.init(&w->computeUnit->shader->tick_cnt); + + switch (this->segment) { + case Brig::BRIG_SEGMENT_GLOBAL: + m->s_type = SEG_GLOBAL; + m->latency.set(w->computeUnit->shader->ticks(64)); + m->pipeId = GLBMEM_PIPE; + + w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->outstanding_reqs_wr_gm++; + w->wr_gm_reqs_in_pipe--; + w->outstanding_reqs_rd_gm++; + w->rd_gm_reqs_in_pipe--; + break; + + case Brig::BRIG_SEGMENT_GROUP: + m->s_type = SEG_SHARED; + m->pipeId = LDSMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(24)); + w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m); + w->outstanding_reqs_wr_lm++; + w->wr_lm_reqs_in_pipe--; + w->outstanding_reqs_rd_lm++; + w->rd_lm_reqs_in_pipe--; + break; + + default: + fatal("Atomic op to unsupported segment %d\n", + this->segment); + } + + w->outstanding_reqs++; + w->mem_reqs_in_pipe--; + } + + const char* atomicOpToString(Brig::BrigAtomicOperation atomicOp); + + template<typename DataType, typename AddrRegOperandType, int NumSrcOperands, + bool HasDst> + void + AtomicInst<DataType, AddrRegOperandType, NumSrcOperands, + 
HasDst>::generateDisassembly() + { + if (HasDst) { + this->disassembly = + csprintf("%s_%s_%s_%s %s,%s", this->opcode, + atomicOpToString(this->atomicOperation), + segmentNames[this->segment], + DataType::label, this->dest.disassemble(), + this->addr.disassemble()); + } else { + this->disassembly = + csprintf("%s_%s_%s_%s %s", this->opcode, + atomicOpToString(this->atomicOperation), + segmentNames[this->segment], + DataType::label, this->addr.disassemble()); + } + + for (int i = 0; i < NumSrcOperands; ++i) { + this->disassembly += ","; + this->disassembly += this->src[i].disassemble(); + } + } +} // namespace HsailISA diff --git a/src/arch/hsail/insts/pseudo_inst.cc b/src/arch/hsail/insts/pseudo_inst.cc new file mode 100644 index 000000000..9506a80ab --- /dev/null +++ b/src/arch/hsail/insts/pseudo_inst.cc @@ -0,0 +1,787 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Marc Orr + */ + +#include <csignal> + +#include "arch/hsail/insts/decl.hh" +#include "arch/hsail/insts/mem.hh" + +namespace HsailISA +{ + // Pseudo (or magic) instructions are overloaded on the hsail call + // instruction, because of its flexible parameter signature. + + // To add a new magic instruction: + // 1. Add an entry to the enum. + // 2. Implement it in the switch statement below (Call::exec). + // 3. Add a utility function to hsa/hsail-gpu-compute/util/magicinst.h, + // so its easy to call from an OpenCL kernel. 
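+    // Dispatch (Call::execPseudoInst below) reads the opcode from the first
+    // element of src1 on every active lane, checks that all active lanes
+    // agree on it, and then routes to the matching Magic* handler.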
+ + // This enum should be identical to the enum in + // hsa/hsail-gpu-compute/util/magicinst.h + enum + { + MAGIC_PRINT_WF_32 = 0, + MAGIC_PRINT_WF_64, + MAGIC_PRINT_LANE, + MAGIC_PRINT_LANE_64, + MAGIC_PRINT_WF_FLOAT, + MAGIC_SIM_BREAK, + MAGIC_PREF_SUM, + MAGIC_REDUCTION, + MAGIC_MASKLANE_LOWER, + MAGIC_MASKLANE_UPPER, + MAGIC_JOIN_WF_BAR, + MAGIC_WAIT_WF_BAR, + MAGIC_PANIC, + MAGIC_ATOMIC_NR_ADD_GLOBAL_U32_REG, + MAGIC_ATOMIC_NR_ADD_GROUP_U32_REG, + MAGIC_LOAD_GLOBAL_U32_REG, + MAGIC_XACT_CAS_LD, + MAGIC_MOST_SIG_THD, + MAGIC_MOST_SIG_BROADCAST, + MAGIC_PRINT_WFID_32, + MAGIC_PRINT_WFID_64 + }; + + void + Call::execPseudoInst(Wavefront *w, GPUDynInstPtr gpuDynInst) + { + const VectorMask &mask = w->get_pred(); + + int op = 0; + bool got_op = false; + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + int src_val0 = src1.get<int>(w, lane, 0); + if (got_op) { + if (src_val0 != op) { + fatal("Multiple magic instructions per PC not " + "supported\n"); + } + } else { + op = src_val0; + got_op = true; + } + } + } + + switch(op) { + case MAGIC_PRINT_WF_32: + MagicPrintWF32(w); + break; + case MAGIC_PRINT_WF_64: + MagicPrintWF64(w); + break; + case MAGIC_PRINT_LANE: + MagicPrintLane(w); + break; + case MAGIC_PRINT_LANE_64: + MagicPrintLane64(w); + break; + case MAGIC_PRINT_WF_FLOAT: + MagicPrintWFFloat(w); + break; + case MAGIC_SIM_BREAK: + MagicSimBreak(w); + break; + case MAGIC_PREF_SUM: + MagicPrefixSum(w); + break; + case MAGIC_REDUCTION: + MagicReduction(w); + break; + case MAGIC_MASKLANE_LOWER: + MagicMaskLower(w); + break; + case MAGIC_MASKLANE_UPPER: + MagicMaskUpper(w); + break; + case MAGIC_JOIN_WF_BAR: + MagicJoinWFBar(w); + break; + case MAGIC_WAIT_WF_BAR: + MagicWaitWFBar(w); + break; + case MAGIC_PANIC: + MagicPanic(w); + break; + + // atomic instructions + case MAGIC_ATOMIC_NR_ADD_GLOBAL_U32_REG: + MagicAtomicNRAddGlobalU32Reg(w, gpuDynInst); + break; + + case MAGIC_ATOMIC_NR_ADD_GROUP_U32_REG: + MagicAtomicNRAddGroupU32Reg(w, gpuDynInst); + break; + + case MAGIC_LOAD_GLOBAL_U32_REG: + MagicLoadGlobalU32Reg(w, gpuDynInst); + break; + + case MAGIC_XACT_CAS_LD: + MagicXactCasLd(w); + break; + + case MAGIC_MOST_SIG_THD: + MagicMostSigThread(w); + break; + + case MAGIC_MOST_SIG_BROADCAST: + MagicMostSigBroadcast(w); + break; + + case MAGIC_PRINT_WFID_32: + MagicPrintWF32ID(w); + break; + + case MAGIC_PRINT_WFID_64: + MagicPrintWFID64(w); + break; + + default: fatal("unrecognized magic instruction: %d\n", op); + } + } + + void + Call::MagicPrintLane(Wavefront *w) + { + #if TRACING_ON + const VectorMask &mask = w->get_pred(); + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + int src_val1 = src1.get<int>(w, lane, 1); + int src_val2 = src1.get<int>(w, lane, 2); + if (src_val2) { + DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: 0x%x\n", + disassemble(), w->computeUnit->cu_id, w->simdId, + w->wfSlotId, lane, src_val1); + } else { + DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: %d\n", + disassemble(), w->computeUnit->cu_id, w->simdId, + w->wfSlotId, lane, src_val1); + } + } + } + #endif + } + + void + Call::MagicPrintLane64(Wavefront *w) + { + #if TRACING_ON + const VectorMask &mask = w->get_pred(); + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + int64_t src_val1 = src1.get<int64_t>(w, lane, 1); + int src_val2 = src1.get<int>(w, lane, 2); + if (src_val2) { + DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: 0x%x\n", + disassemble(), w->computeUnit->cu_id, w->simdId, + w->wfSlotId, lane, src_val1); + } else { + DPRINTFN("krl_prt (%s): 
CU%d, WF[%d][%d], lane %d: %d\n", + disassemble(), w->computeUnit->cu_id, w->simdId, + w->wfSlotId, lane, src_val1); + } + } + } + #endif + } + + void + Call::MagicPrintWF32(Wavefront *w) + { + #if TRACING_ON + const VectorMask &mask = w->get_pred(); + std::string res_str; + res_str = csprintf("krl_prt (%s)\n", disassemble()); + + for (int lane = 0; lane < VSZ; ++lane) { + if (!(lane & 7)) { + res_str += csprintf("DB%03d: ", (int)w->wfDynId); + } + + if (mask[lane]) { + int src_val1 = src1.get<int>(w, lane, 1); + int src_val2 = src1.get<int>(w, lane, 2); + + if (src_val2) { + res_str += csprintf("%08x", src_val1); + } else { + res_str += csprintf("%08d", src_val1); + } + } else { + res_str += csprintf("xxxxxxxx"); + } + + if ((lane & 7) == 7) { + res_str += csprintf("\n"); + } else { + res_str += csprintf(" "); + } + } + + res_str += "\n\n"; + DPRINTFN(res_str.c_str()); + #endif + } + + void + Call::MagicPrintWF32ID(Wavefront *w) + { + #if TRACING_ON + const VectorMask &mask = w->get_pred(); + std::string res_str; + int src_val3 = -1; + res_str = csprintf("krl_prt (%s)\n", disassemble()); + + for (int lane = 0; lane < VSZ; ++lane) { + if (!(lane & 7)) { + res_str += csprintf("DB%03d: ", (int)w->wfDynId); + } + + if (mask[lane]) { + int src_val1 = src1.get<int>(w, lane, 1); + int src_val2 = src1.get<int>(w, lane, 2); + src_val3 = src1.get<int>(w, lane, 3); + + if (src_val2) { + res_str += csprintf("%08x", src_val1); + } else { + res_str += csprintf("%08d", src_val1); + } + } else { + res_str += csprintf("xxxxxxxx"); + } + + if ((lane & 7) == 7) { + res_str += csprintf("\n"); + } else { + res_str += csprintf(" "); + } + } + + res_str += "\n\n"; + if (w->wfDynId == src_val3) { + DPRINTFN(res_str.c_str()); + } + #endif + } + + void + Call::MagicPrintWF64(Wavefront *w) + { + #if TRACING_ON + const VectorMask &mask = w->get_pred(); + std::string res_str; + res_str = csprintf("krl_prt (%s)\n", disassemble()); + + for (int lane = 0; lane < VSZ; ++lane) { + if (!(lane & 3)) { + res_str += csprintf("DB%03d: ", (int)w->wfDynId); + } + + if (mask[lane]) { + int64_t src_val1 = src1.get<int64_t>(w, lane, 1); + int src_val2 = src1.get<int>(w, lane, 2); + + if (src_val2) { + res_str += csprintf("%016x", src_val1); + } else { + res_str += csprintf("%016d", src_val1); + } + } else { + res_str += csprintf("xxxxxxxxxxxxxxxx"); + } + + if ((lane & 3) == 3) { + res_str += csprintf("\n"); + } else { + res_str += csprintf(" "); + } + } + + res_str += "\n\n"; + DPRINTFN(res_str.c_str()); + #endif + } + + void + Call::MagicPrintWFID64(Wavefront *w) + { + #if TRACING_ON + const VectorMask &mask = w->get_pred(); + std::string res_str; + int src_val3 = -1; + res_str = csprintf("krl_prt (%s)\n", disassemble()); + + for (int lane = 0; lane < VSZ; ++lane) { + if (!(lane & 3)) { + res_str += csprintf("DB%03d: ", (int)w->wfDynId); + } + + if (mask[lane]) { + int64_t src_val1 = src1.get<int64_t>(w, lane, 1); + int src_val2 = src1.get<int>(w, lane, 2); + src_val3 = src1.get<int>(w, lane, 3); + + if (src_val2) { + res_str += csprintf("%016x", src_val1); + } else { + res_str += csprintf("%016d", src_val1); + } + } else { + res_str += csprintf("xxxxxxxxxxxxxxxx"); + } + + if ((lane & 3) == 3) { + res_str += csprintf("\n"); + } else { + res_str += csprintf(" "); + } + } + + res_str += "\n\n"; + if (w->wfDynId == src_val3) { + DPRINTFN(res_str.c_str()); + } + #endif + } + + void + Call::MagicPrintWFFloat(Wavefront *w) + { + #if TRACING_ON + const VectorMask &mask = w->get_pred(); + std::string res_str; + res_str = 
csprintf("krl_prt (%s)\n", disassemble()); + + for (int lane = 0; lane < VSZ; ++lane) { + if (!(lane & 7)) { + res_str += csprintf("DB%03d: ", (int)w->wfDynId); + } + + if (mask[lane]) { + float src_val1 = src1.get<float>(w, lane, 1); + res_str += csprintf("%08f", src_val1); + } else { + res_str += csprintf("xxxxxxxx"); + } + + if ((lane & 7) == 7) { + res_str += csprintf("\n"); + } else { + res_str += csprintf(" "); + } + } + + res_str += "\n\n"; + DPRINTFN(res_str.c_str()); + #endif + } + + // raises a signal that GDB will catch + // when done with the break, type "signal 0" in gdb to continue + void + Call::MagicSimBreak(Wavefront *w) + { + std::string res_str; + // print out state for this wavefront and then break + res_str = csprintf("Breakpoint encountered for wavefront %i\n", + w->wfSlotId); + + res_str += csprintf(" Kern ID: %i\n", w->kern_id); + res_str += csprintf(" Phase ID: %i\n", w->simdId); + res_str += csprintf(" Executing on CU #%i\n", w->computeUnit->cu_id); + res_str += csprintf(" Exec mask: "); + + for (int i = VSZ - 1; i >= 0; --i) { + if (w->execMask(i)) + res_str += "1"; + else + res_str += "0"; + + if ((i & 7) == 7) + res_str += " "; + } + + res_str += csprintf("(0x%016llx)\n", w->execMask().to_ullong()); + + res_str += "\nHelpful debugging hints:\n"; + res_str += " Check out w->s_reg / w->d_reg for register state\n"; + + res_str += "\n\n"; + DPRINTFN(res_str.c_str()); + fflush(stdout); + + raise(SIGTRAP); + } + + void + Call::MagicPrefixSum(Wavefront *w) + { + const VectorMask &mask = w->get_pred(); + int res = 0; + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + int src_val1 = src1.get<int>(w, lane, 1); + dest.set<int>(w, lane, res); + res += src_val1; + } + } + } + + void + Call::MagicReduction(Wavefront *w) + { + // reduction magic instruction + // The reduction instruction takes up to 64 inputs (one from + // each thread in a WF) and sums them. It returns the sum to + // each thread in the WF. 
+ const VectorMask &mask = w->get_pred(); + int res = 0; + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + int src_val1 = src1.get<int>(w, lane, 1); + res += src_val1; + } + } + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + dest.set<int>(w, lane, res); + } + } + } + + void + Call::MagicMaskLower(Wavefront *w) + { + const VectorMask &mask = w->get_pred(); + int res = 0; + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + int src_val1 = src1.get<int>(w, lane, 1); + + if (src_val1) { + if (lane < (VSZ/2)) { + res = res | ((uint32_t)(1) << lane); + } + } + } + } + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + dest.set<int>(w, lane, res); + } + } + } + + void + Call::MagicMaskUpper(Wavefront *w) + { + const VectorMask &mask = w->get_pred(); + int res = 0; + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + int src_val1 = src1.get<int>(w, lane, 1); + + if (src_val1) { + if (lane >= (VSZ/2)) { + res = res | ((uint32_t)(1) << (lane - (VSZ/2))); + } + } + } + } + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + dest.set<int>(w, lane, res); + } + } + } + + void + Call::MagicJoinWFBar(Wavefront *w) + { + const VectorMask &mask = w->get_pred(); + int max_cnt = 0; + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + w->bar_cnt[lane]++; + + if (w->bar_cnt[lane] > max_cnt) { + max_cnt = w->bar_cnt[lane]; + } + } + } + + if (max_cnt > w->max_bar_cnt) { + w->max_bar_cnt = max_cnt; + } + } + + void + Call::MagicWaitWFBar(Wavefront *w) + { + const VectorMask &mask = w->get_pred(); + int max_cnt = 0; + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + w->bar_cnt[lane]--; + } + + if (w->bar_cnt[lane] > max_cnt) { + max_cnt = w->bar_cnt[lane]; + } + } + + if (max_cnt < w->max_bar_cnt) { + w->max_bar_cnt = max_cnt; + } + + w->instructionBuffer.erase(w->instructionBuffer.begin() + 1, + w->instructionBuffer.end()); + if (w->pendingFetch) + w->dropFetch = true; + } + + void + Call::MagicPanic(Wavefront *w) + { + const VectorMask &mask = w->get_pred(); + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + int src_val1 = src1.get<int>(w, lane, 1); + panic("OpenCL Code failed assertion #%d. 
Triggered by lane %s", + src_val1, lane); + } + } + } + + void + Call::calcAddr(Wavefront *w, GPUDynInstPtr m) + { + // the address is in src1 | src2 + for (int lane = 0; lane < VSZ; ++lane) { + int src_val1 = src1.get<int>(w, lane, 1); + int src_val2 = src1.get<int>(w, lane, 2); + Addr addr = (((Addr) src_val1) << 32) | ((Addr) src_val2); + + m->addr[lane] = addr; + } + + } + + void + Call::MagicAtomicNRAddGlobalU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst) + { + GPUDynInstPtr m = gpuDynInst; + + calcAddr(w, m); + + for (int lane = 0; lane < VSZ; ++lane) { + ((int*)m->a_data)[lane] = src1.get<int>(w, lane, 3); + } + + m->m_op = brigAtomicToMemOpType(Brig::BRIG_OPCODE_ATOMICNORET, + Brig::BRIG_ATOMIC_ADD); + m->m_type = U32::memType; + m->v_type = U32::vgprType; + + m->exec_mask = w->execMask(); + m->statusBitVector = 0; + m->equiv = 0; // atomics don't have an equivalence class operand + m->n_reg = 1; + m->memoryOrder = Enums::MEMORY_ORDER_NONE; + m->scope = Enums::MEMORY_SCOPE_NONE; + + m->simdId = w->simdId; + m->wfSlotId = w->wfSlotId; + m->wfDynId = w->wfDynId; + m->latency.init(&w->computeUnit->shader->tick_cnt); + + m->s_type = SEG_GLOBAL; + m->pipeId = GLBMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(64)); + w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->outstanding_reqs_wr_gm++; + w->wr_gm_reqs_in_pipe--; + w->outstanding_reqs_rd_gm++; + w->rd_gm_reqs_in_pipe--; + w->outstanding_reqs++; + w->mem_reqs_in_pipe--; + } + + void + Call::MagicAtomicNRAddGroupU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst) + { + GPUDynInstPtr m = gpuDynInst; + calcAddr(w, m); + + for (int lane = 0; lane < VSZ; ++lane) { + ((int*)m->a_data)[lane] = src1.get<int>(w, lane, 1); + } + + m->m_op = brigAtomicToMemOpType(Brig::BRIG_OPCODE_ATOMICNORET, + Brig::BRIG_ATOMIC_ADD); + m->m_type = U32::memType; + m->v_type = U32::vgprType; + + m->exec_mask = w->execMask(); + m->statusBitVector = 0; + m->equiv = 0; // atomics don't have an equivalence class operand + m->n_reg = 1; + m->memoryOrder = Enums::MEMORY_ORDER_NONE; + m->scope = Enums::MEMORY_SCOPE_NONE; + + m->simdId = w->simdId; + m->wfSlotId = w->wfSlotId; + m->wfDynId = w->wfDynId; + m->latency.init(&w->computeUnit->shader->tick_cnt); + + m->s_type = SEG_GLOBAL; + m->pipeId = GLBMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(64)); + w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->outstanding_reqs_wr_gm++; + w->wr_gm_reqs_in_pipe--; + w->outstanding_reqs_rd_gm++; + w->rd_gm_reqs_in_pipe--; + w->outstanding_reqs++; + w->mem_reqs_in_pipe--; + } + + void + Call::MagicLoadGlobalU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst) + { + GPUDynInstPtr m = gpuDynInst; + // calculate the address + calcAddr(w, m); + + m->m_op = Enums::MO_LD; + m->m_type = U32::memType; //MemDataType::memType; + m->v_type = U32::vgprType; //DestDataType::vgprType; + + m->exec_mask = w->execMask(); + m->statusBitVector = 0; + m->equiv = 0; + m->n_reg = 1; + m->memoryOrder = Enums::MEMORY_ORDER_NONE; + m->scope = Enums::MEMORY_SCOPE_NONE; + + // FIXME + //m->dst_reg = this->dest.regIndex(); + + m->simdId = w->simdId; + m->wfSlotId = w->wfSlotId; + m->wfDynId = w->wfDynId; + m->latency.init(&w->computeUnit->shader->tick_cnt); + + m->s_type = SEG_GLOBAL; + m->pipeId = GLBMEM_PIPE; + m->latency.set(w->computeUnit->shader->ticks(1)); + w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); + w->outstanding_reqs_rd_gm++; + w->rd_gm_reqs_in_pipe--; + w->outstanding_reqs++; + w->mem_reqs_in_pipe--; + } + + void + Call::MagicXactCasLd(Wavefront *w) 
+ { + const VectorMask &mask = w->get_pred(); + int src_val1 = 0; + + for (int lane = 0; lane < VSZ; ++lane) { + if (mask[lane]) { + src_val1 = src1.get<int>(w, lane, 1); + break; + } + } + + if (!w->computeUnit->xactCasLoadMap.count(src_val1)) { + w->computeUnit->xactCasLoadMap[src_val1] = ComputeUnit::waveQueue(); + w->computeUnit->xactCasLoadMap[src_val1].waveIDQueue.clear(); + } + + w->computeUnit->xactCasLoadMap[src_val1].waveIDQueue + .push_back(ComputeUnit::waveIdentifier(w->simdId, w->wfSlotId)); + } + + void + Call::MagicMostSigThread(Wavefront *w) + { + const VectorMask &mask = w->get_pred(); + unsigned mst = true; + + for (int lane = VSZ - 1; lane >= 0; --lane) { + if (mask[lane]) { + dest.set<int>(w, lane, mst); + mst = false; + } + } + } + + void + Call::MagicMostSigBroadcast(Wavefront *w) + { + const VectorMask &mask = w->get_pred(); + int res = 0; + bool got_res = false; + + for (int lane = VSZ - 1; lane >= 0; --lane) { + if (mask[lane]) { + if (!got_res) { + res = src1.get<int>(w, lane, 1); + got_res = true; + } + dest.set<int>(w, lane, res); + } + } + } + +} // namespace HsailISA diff --git a/src/arch/hsail/operand.cc b/src/arch/hsail/operand.cc new file mode 100644 index 000000000..d0e6c5541 --- /dev/null +++ b/src/arch/hsail/operand.cc @@ -0,0 +1,449 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
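Call::calcAddr earlier in this hunk assembles each lane's 64-bit flat address from two 32-bit register reads, with source operand 1 supplying the high word and operand 2 the low word, before the no-return atomic and load helpers hand the request to the global memory pipe. A standalone sketch of just that address assembly, with an illustrative name and plain integer inputs in place of the operand classes:

#include <cstdint>
#include <iostream>

// Build a 64-bit address from a high and a low 32-bit register value,
// the same (hi << 32) | lo combination Call::calcAddr performs per lane.
uint64_t assembleLaneAddr(uint32_t hi, uint32_t lo)
{
    return (static_cast<uint64_t>(hi) << 32) | static_cast<uint64_t>(lo);
}

int main()
{
    // hi = 0x1, lo = 0x2000 -> 0x100002000
    std::cout << std::hex << assembleLaneAddr(0x1, 0x2000) << "\n";
    return 0;
}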
+ * + * Author: Steve Reinhardt + */ + +#include "arch/hsail/operand.hh" + +using namespace Brig; + +bool +BaseRegOperand::init(unsigned opOffset, const BrigObject *obj, + unsigned &maxRegIdx, char _regFileChar) +{ + regFileChar = _regFileChar; + const BrigOperand *brigOp = obj->getOperand(opOffset); + + if (brigOp->kind != BRIG_KIND_OPERAND_REGISTER) + return false; + + const BrigOperandRegister *brigRegOp = (const BrigOperandRegister*)brigOp; + + regIdx = brigRegOp->regNum; + + DPRINTF(GPUReg, "Operand: regNum: %d, kind: %d\n", regIdx, + brigRegOp->regKind); + + maxRegIdx = std::max(maxRegIdx, regIdx); + + return true; +} + +void +ListOperand::init(unsigned opOffset, const BrigObject *obj) +{ + const BrigOperand *brigOp = (const BrigOperand*)obj->getOperand(opOffset); + + switch (brigOp->kind) { + case BRIG_KIND_OPERAND_CODE_LIST: + { + const BrigOperandCodeList *opList = + (const BrigOperandCodeList*)brigOp; + + const Brig::BrigData *oprnd_data = + obj->getBrigBaseData(opList->elements); + + // Note: for calls Dest list of operands could be size of 0. + elementCount = oprnd_data->byteCount / 4; + + DPRINTF(GPUReg, "Operand Code List: # elements: %d\n", + elementCount); + + for (int i = 0; i < elementCount; ++i) { + unsigned *data_offset = + (unsigned*)obj->getData(opList->elements + 4 * (i + 1)); + + const BrigDirectiveVariable *p = + (const BrigDirectiveVariable*)obj-> + getCodeSectionEntry(*data_offset); + + StorageElement *se = obj->currentCode->storageMap-> + findSymbol(BRIG_SEGMENT_ARG, p); + + assert(se); + callArgs.push_back(se); + } + } + break; + default: + fatal("ListOperand: bad operand kind %d\n", brigOp->kind); + } +} + +std::string +ListOperand::disassemble() +{ + std::string res_str(""); + + for (auto it : callArgs) { + res_str += csprintf("%s ", it->name.c_str()); + } + + return res_str; +} + +void +FunctionRefOperand::init(unsigned opOffset, const BrigObject *obj) +{ + const BrigOperand *baseOp = obj->getOperand(opOffset); + + if (baseOp->kind != BRIG_KIND_OPERAND_CODE_REF) { + fatal("FunctionRefOperand: bad operand kind %d\n", baseOp->kind); + } + + const BrigOperandCodeRef *brigOp = (const BrigOperandCodeRef*)baseOp; + + const BrigDirectiveExecutable *p = + (const BrigDirectiveExecutable*)obj->getCodeSectionEntry(brigOp->ref); + + func_name = obj->getString(p->name); +} + +std::string +FunctionRefOperand::disassemble() +{ + DPRINTF(GPUReg, "Operand Func-ref name: %s\n", func_name); + + return csprintf("%s", func_name); +} + +bool +BaseRegOperand::init_from_vect(unsigned opOffset, const BrigObject *obj, + int at, unsigned &maxRegIdx, char _regFileChar) +{ + regFileChar = _regFileChar; + const BrigOperand *brigOp = obj->getOperand(opOffset); + + if (brigOp->kind != BRIG_KIND_OPERAND_OPERAND_LIST) + return false; + + + const Brig::BrigOperandOperandList *brigRegVecOp = + (const Brig::BrigOperandOperandList*)brigOp; + + unsigned *data_offset = + (unsigned*)obj->getData(brigRegVecOp->elements + 4 * (at + 1)); + + const BrigOperand *p = + (const BrigOperand*)obj->getOperand(*data_offset); + if (p->kind != BRIG_KIND_OPERAND_REGISTER) { + return false; + } + + const BrigOperandRegister *brigRegOp =(const BrigOperandRegister*)p; + + regIdx = brigRegOp->regNum; + + DPRINTF(GPUReg, "Operand: regNum: %d, kind: %d \n", regIdx, + brigRegOp->regKind); + + maxRegIdx = std::max(maxRegIdx, regIdx); + + return true; +} + +void +BaseRegOperand::initWithStrOffset(unsigned strOffset, const BrigObject *obj, + unsigned &maxRegIdx, char _regFileChar) +{ + const char *name = 
obj->getString(strOffset); + char *endptr; + regIdx = strtoul(name + 2, &endptr, 10); + + if (name[0] != '$' || name[1] != _regFileChar) { + fatal("register operand parse error on \"%s\"\n", name); + } + + maxRegIdx = std::max(maxRegIdx, regIdx); +} + +unsigned SRegOperand::maxRegIdx; +unsigned DRegOperand::maxRegIdx; +unsigned CRegOperand::maxRegIdx; + +std::string +SRegOperand::disassemble() +{ + return csprintf("$s%d", regIdx); +} + +std::string +DRegOperand::disassemble() +{ + return csprintf("$d%d", regIdx); +} + +std::string +CRegOperand::disassemble() +{ + return csprintf("$c%d", regIdx); +} + +BrigRegOperandInfo +findRegDataType(unsigned opOffset, const BrigObject *obj) +{ + const BrigOperand *baseOp = obj->getOperand(opOffset); + + switch (baseOp->kind) { + case BRIG_KIND_OPERAND_REGISTER: + { + const BrigOperandRegister *op = (BrigOperandRegister*)baseOp; + + return BrigRegOperandInfo((BrigKind16_t)baseOp->kind, + (BrigRegisterKind)op->regKind); + } + break; + + case BRIG_KIND_OPERAND_OPERAND_LIST: + { + const BrigOperandOperandList *op = + (BrigOperandOperandList*)baseOp; + const BrigData *data_p = (BrigData*)obj->getData(op->elements); + + + int num_operands = 0; + BrigRegisterKind reg_kind = (BrigRegisterKind)0; + for (int offset = 0; offset < data_p->byteCount; offset += 4) { + const BrigOperand *op_p = (const BrigOperand *) + obj->getOperand(((int *)data_p->bytes)[offset/4]); + + if (op_p->kind == BRIG_KIND_OPERAND_REGISTER) { + const BrigOperandRegister *brigRegOp = + (const BrigOperandRegister*)op_p; + reg_kind = (BrigRegisterKind)brigRegOp->regKind; + } else if (op_p->kind == BRIG_KIND_OPERAND_CONSTANT_BYTES) { + uint16_t num_bytes = + ((Brig::BrigOperandConstantBytes*)op_p)->base.byteCount + - sizeof(BrigBase); + if (num_bytes == sizeof(uint32_t)) { + reg_kind = BRIG_REGISTER_KIND_SINGLE; + } else if (num_bytes == sizeof(uint64_t)) { + reg_kind = BRIG_REGISTER_KIND_DOUBLE; + } else { + fatal("OperandList: bad operand size %d\n", num_bytes); + } + } else { + fatal("OperandList: bad operand kind %d\n", op_p->kind); + } + + num_operands++; + } + assert(baseOp->kind == BRIG_KIND_OPERAND_OPERAND_LIST); + + return BrigRegOperandInfo((BrigKind16_t)baseOp->kind, reg_kind); + } + break; + + case BRIG_KIND_OPERAND_ADDRESS: + { + const BrigOperandAddress *op = (BrigOperandAddress*)baseOp; + + if (!op->reg) { + BrigType type = BRIG_TYPE_NONE; + + if (op->symbol) { + const BrigDirective *dir = (BrigDirective*) + obj->getCodeSectionEntry(op->symbol); + + assert(dir->kind == BRIG_KIND_DIRECTIVE_VARIABLE); + + const BrigDirectiveVariable *sym = + (const BrigDirectiveVariable*)dir; + + type = (BrigType)sym->type; + } + return BrigRegOperandInfo(BRIG_KIND_OPERAND_ADDRESS, + (BrigType)type); + } else { + const BrigOperandAddress *b = (const BrigOperandAddress*)baseOp; + const BrigOperand *reg = obj->getOperand(b->reg); + const BrigOperandRegister *rop = (BrigOperandRegister*)reg; + + return BrigRegOperandInfo(BRIG_KIND_OPERAND_REGISTER, + (BrigRegisterKind)rop->regKind); + } + } + break; + + default: + fatal("AddrOperand: bad operand kind %d\n", baseOp->kind); + break; + } +} + +void +AddrOperandBase::parseAddr(const BrigOperandAddress *op, const BrigObject *obj) +{ + assert(op->base.kind == BRIG_KIND_OPERAND_ADDRESS); + + const BrigDirective *d = + (BrigDirective*)obj->getCodeSectionEntry(op->symbol); + + assert(d->kind == BRIG_KIND_DIRECTIVE_VARIABLE); + const BrigDirectiveVariable *sym = (BrigDirectiveVariable*)d; + name = obj->getString(sym->name); + + if (sym->segment != 
BRIG_SEGMENT_ARG) { + storageElement = + obj->currentCode->storageMap->findSymbol(sym->segment, name); + assert(storageElement); + offset = 0; + } else { + // sym->name does not work for BRIG_SEGMENT_ARG for the following case: + // + // void foo(int a); + // void bar(double a); + // + // foo(...) --> arg_u32 %param_p0; + // st_arg_u32 $s0, [%param_p0]; + // call &foo (%param_p0); + // bar(...) --> arg_f64 %param_p0; + // st_arg_u64 $d0, [%param_p0]; + // call &foo (%param_p0); + // + // Both functions use the same variable name (param_p0)!!! + // + // Maybe this is a bug in the compiler (I don't know). + // + // Solution: + // Use directive pointer (BrigDirectiveVariable) to differentiate 2 + // versions of param_p0. + // + // Note this solution is kind of stupid, because we are pulling stuff + // out of the brig binary via the directive pointer and putting it into + // the symbol table, but now we are indexing the symbol table by the + // brig directive pointer! It makes the symbol table sort of pointless. + // But I don't want to mess with the rest of the infrastructure, so + // let's go with this for now. + // + // When we update the compiler again, we should see if this problem goes + // away. If so, we can fold some of this functionality into the code for + // kernel arguments. If not, maybe we can index the symbol name on a + // hash of the variable AND function name + storageElement = obj->currentCode-> + storageMap->findSymbol((Brig::BrigSegment)sym->segment, sym); + + assert(storageElement); + } +} + +uint64_t +AddrOperandBase::calcUniformBase() +{ + // start with offset, will be 0 if not specified + uint64_t address = offset; + + // add in symbol value if specified + if (storageElement) { + address += storageElement->offset; + } + + return address; +} + +std::string +AddrOperandBase::disassemble(std::string reg_disassembly) +{ + std::string disasm; + + if (offset || reg_disassembly != "") { + disasm += "["; + + if (reg_disassembly != "") { + disasm += reg_disassembly; + + if (offset > 0) { + disasm += "+"; + } + } + + if (offset) { + disasm += csprintf("%d", offset); + } + + disasm += "]"; + } else if (name) { + disasm += csprintf("[%s]", name); + } + + return disasm; +} + +void +NoRegAddrOperand::init(unsigned opOffset, const BrigObject *obj) +{ + const BrigOperand *baseOp = obj->getOperand(opOffset); + + if (baseOp->kind == BRIG_KIND_OPERAND_ADDRESS) { + BrigOperandAddress *addrOp = (BrigOperandAddress*)baseOp; + parseAddr(addrOp, obj); + offset = (uint64_t(addrOp->offset.hi) << 32) | + uint64_t(addrOp->offset.lo); + } else { + fatal("NoRegAddrOperand: bad operand kind %d\n", baseOp->kind); + } + +} + +std::string +NoRegAddrOperand::disassemble() +{ + return AddrOperandBase::disassemble(std::string("")); +} + +void +LabelOperand::init(unsigned opOffset, const BrigObject *obj) +{ + const BrigOperandCodeRef *op = + (const BrigOperandCodeRef*)obj->getOperand(opOffset); + + assert(op->base.kind == BRIG_KIND_OPERAND_CODE_REF); + + const BrigDirective *dir = + (const BrigDirective*)obj->getCodeSectionEntry(op->ref); + + assert(dir->kind == BRIG_KIND_DIRECTIVE_LABEL); + label = obj->currentCode->refLabel((BrigDirectiveLabel*)dir, obj); +} + +uint32_t +LabelOperand::getTarget(Wavefront *w, int lane) +{ + return label->get(); +} + +std::string +LabelOperand::disassemble() +{ + return label->name; +} diff --git a/src/arch/hsail/operand.hh b/src/arch/hsail/operand.hh new file mode 100644 index 000000000..e3d275b10 --- /dev/null +++ b/src/arch/hsail/operand.hh @@ -0,0 +1,768 @@ +/* + * 
Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#ifndef __ARCH_HSAIL_OPERAND_HH__ +#define __ARCH_HSAIL_OPERAND_HH__ + +/** + * @file operand.hh + * + * Defines classes encapsulating HSAIL instruction operands. 
+ */ + +#include <string> + +#include "arch/hsail/Brig.h" +#include "base/trace.hh" +#include "base/types.hh" +#include "debug/GPUReg.hh" +#include "enums/RegisterType.hh" +#include "gpu-compute/brig_object.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/hsail_code.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/vector_register_file.hh" +#include "gpu-compute/wavefront.hh" + +class Label; +class StorageElement; + +class BaseOperand +{ + public: + Enums::RegisterType registerType; + uint32_t regOperandSize; + BaseOperand() { registerType = Enums::RT_NONE; regOperandSize = 0; } + bool isVectorRegister() { return registerType == Enums::RT_VECTOR; } + bool isScalarRegister() { return registerType == Enums::RT_SCALAR; } + bool isCondRegister() { return registerType == Enums::RT_CONDITION; } + unsigned int regIndex() { return 0; } + uint32_t opSize() { return regOperandSize; } + virtual ~BaseOperand() { } +}; + +class BrigRegOperandInfo +{ + public: + Brig::BrigKind16_t kind; + Brig::BrigType type; + Brig::BrigRegisterKind regKind; + + BrigRegOperandInfo(Brig::BrigKind16_t _kind, + Brig::BrigRegisterKind _regKind) + : kind(_kind), regKind(_regKind) + { + } + + BrigRegOperandInfo(Brig::BrigKind16_t _kind, Brig::BrigType _type) + : kind(_kind), type(_type) + { + } + + BrigRegOperandInfo() : kind(Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES), + type(Brig::BRIG_TYPE_NONE) + { + } +}; + +BrigRegOperandInfo findRegDataType(unsigned opOffset, const BrigObject *obj); + +class BaseRegOperand : public BaseOperand +{ + public: + unsigned regIdx; + char regFileChar; + + bool init(unsigned opOffset, const BrigObject *obj, + unsigned &maxRegIdx, char _regFileChar); + + bool init_from_vect(unsigned opOffset, const BrigObject *obj, int at, + unsigned &maxRegIdx, char _regFileChar); + + void initWithStrOffset(unsigned strOffset, const BrigObject *obj, + unsigned &maxRegIdx, char _regFileChar); + unsigned int regIndex() { return regIdx; } +}; + +class SRegOperand : public BaseRegOperand +{ + public: + static unsigned maxRegIdx; + + bool + init(unsigned opOffset, const BrigObject *obj) + { + regOperandSize = sizeof(uint32_t); + registerType = Enums::RT_VECTOR; + + return BaseRegOperand::init(opOffset, obj, maxRegIdx, 's'); + } + + bool + init_from_vect(unsigned opOffset, const BrigObject *obj, int at) + { + regOperandSize = sizeof(uint32_t); + registerType = Enums::RT_VECTOR; + + return BaseRegOperand::init_from_vect(opOffset, obj, at, maxRegIdx, + 's'); + } + + void + initWithStrOffset(unsigned strOffset, const BrigObject *obj) + { + regOperandSize = sizeof(uint32_t); + registerType = Enums::RT_VECTOR; + + return BaseRegOperand::initWithStrOffset(strOffset, obj, maxRegIdx, + 's'); + } + + template<typename OperandType> + OperandType + get(Wavefront *w, int lane) + { + assert(sizeof(OperandType) <= sizeof(uint32_t)); + assert(regIdx < w->maxSpVgprs); + // if OperandType is smaller than 32-bit, we truncate the value + OperandType ret; + uint32_t vgprIdx; + + switch (sizeof(OperandType)) { + case 1: // 1 byte operand + vgprIdx = w->remap(regIdx, 1, 1); + ret = (w->computeUnit->vrf[w->simdId]-> + read<uint32_t>(vgprIdx, lane)) & 0xff; + break; + case 2: // 2 byte operand + vgprIdx = w->remap(regIdx, 2, 1); + ret = (w->computeUnit->vrf[w->simdId]-> + read<uint32_t>(vgprIdx, lane)) & 0xffff; + break; + case 4: // 4 byte operand + vgprIdx = w->remap(regIdx,sizeof(OperandType), 1); + ret = w->computeUnit->vrf[w->simdId]-> + read<OperandType>(vgprIdx, lane); + break; + default: + panic("Bad 
OperandType\n"); + break; + } + + return (OperandType)ret; + } + + // special get method for compatibility with LabelOperand + uint32_t + getTarget(Wavefront *w, int lane) + { + return get<uint32_t>(w, lane); + } + + template<typename OperandType> + void set(Wavefront *w, int lane, OperandType &val); + std::string disassemble(); +}; + +template<typename OperandType> +void +SRegOperand::set(Wavefront *w, int lane, OperandType &val) +{ + DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: $s%d <- %d\n", + w->computeUnit->cu_id, w->simdId, w->wfSlotId, lane, regIdx, val); + + assert(sizeof(OperandType) == sizeof(uint32_t)); + assert(regIdx < w->maxSpVgprs); + uint32_t vgprIdx = w->remap(regIdx, sizeof(OperandType), 1); + w->computeUnit->vrf[w->simdId]->write<OperandType>(vgprIdx,val,lane); +} + +template<> +inline void +SRegOperand::set(Wavefront *w, int lane, uint64_t &val) +{ + DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: $s%d <- %d\n", + w->computeUnit->cu_id, w->simdId, w->wfSlotId, lane, regIdx, val); + + assert(regIdx < w->maxSpVgprs); + uint32_t vgprIdx = w->remap(regIdx, sizeof(uint32_t), 1); + w->computeUnit->vrf[w->simdId]->write<uint32_t>(vgprIdx, val, lane); +} + +class DRegOperand : public BaseRegOperand +{ + public: + static unsigned maxRegIdx; + + bool + init(unsigned opOffset, const BrigObject *obj) + { + regOperandSize = sizeof(uint64_t); + registerType = Enums::RT_VECTOR; + + return BaseRegOperand::init(opOffset, obj, maxRegIdx, 'd'); + } + + bool + init_from_vect(unsigned opOffset, const BrigObject *obj, int at) + { + regOperandSize = sizeof(uint64_t); + registerType = Enums::RT_VECTOR; + + return BaseRegOperand::init_from_vect(opOffset, obj, at, maxRegIdx, + 'd'); + } + + void + initWithStrOffset(unsigned strOffset, const BrigObject *obj) + { + regOperandSize = sizeof(uint64_t); + registerType = Enums::RT_VECTOR; + + return BaseRegOperand::initWithStrOffset(strOffset, obj, maxRegIdx, + 'd'); + } + + template<typename OperandType> + OperandType + get(Wavefront *w, int lane) + { + assert(sizeof(OperandType) <= sizeof(uint64_t)); + // TODO: this check is valid only for HSAIL + assert(regIdx < w->maxDpVgprs); + uint32_t vgprIdx = w->remap(regIdx, sizeof(OperandType), 1); + + return w->computeUnit->vrf[w->simdId]->read<OperandType>(vgprIdx,lane); + } + + template<typename OperandType> + void + set(Wavefront *w, int lane, OperandType &val) + { + DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: $d%d <- %d\n", + w->computeUnit->cu_id, w->simdId, w->wfSlotId, lane, regIdx, + val); + + assert(sizeof(OperandType) <= sizeof(uint64_t)); + // TODO: this check is valid only for HSAIL + assert(regIdx < w->maxDpVgprs); + uint32_t vgprIdx = w->remap(regIdx, sizeof(OperandType), 1); + w->computeUnit->vrf[w->simdId]->write<OperandType>(vgprIdx,val,lane); + } + + std::string disassemble(); +}; + +class CRegOperand : public BaseRegOperand +{ + public: + static unsigned maxRegIdx; + + bool + init(unsigned opOffset, const BrigObject *obj) + { + regOperandSize = sizeof(uint8_t); + registerType = Enums::RT_CONDITION; + + return BaseRegOperand::init(opOffset, obj, maxRegIdx, 'c'); + } + + bool + init_from_vect(unsigned opOffset, const BrigObject *obj, int at) + { + regOperandSize = sizeof(uint8_t); + registerType = Enums::RT_CONDITION; + + return BaseRegOperand::init_from_vect(opOffset, obj, at, maxRegIdx, + 'c'); + } + + void + initWithStrOffset(unsigned strOffset, const BrigObject *obj) + { + regOperandSize = sizeof(uint8_t); + registerType = Enums::RT_CONDITION; + + return 
BaseRegOperand::initWithStrOffset(strOffset, obj, maxRegIdx, + 'c'); + } + + template<typename OperandType> + OperandType + get(Wavefront *w, int lane) + { + assert(regIdx < w->condRegState->numRegs()); + + return w->condRegState->read<OperandType>((int)regIdx, lane); + } + + template<typename OperandType> + void + set(Wavefront *w, int lane, OperandType &val) + { + DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: $c%d <- %d\n", + w->computeUnit->cu_id, w->simdId, w->wfSlotId, lane, regIdx, + val); + + assert(regIdx < w->condRegState->numRegs()); + w->condRegState->write<OperandType>(regIdx,lane,val); + } + + std::string disassemble(); +}; + +template<typename T> +class ImmOperand : public BaseOperand +{ + public: + T bits; + + bool init(unsigned opOffset, const BrigObject *obj); + bool init_from_vect(unsigned opOffset, const BrigObject *obj, int at); + std::string disassemble(); + + template<typename OperandType> + OperandType + get() + { + assert(sizeof(OperandType) <= sizeof(T)); + + return *(OperandType*)&bits; + } + + // This version of get() takes a WF* and a lane id for + // compatibility with the register-based get() methods. + template<typename OperandType> + OperandType + get(Wavefront *w, int lane) + { + return get<OperandType>(); + } +}; + +template<typename T> +bool +ImmOperand<T>::init(unsigned opOffset, const BrigObject *obj) +{ + const Brig::BrigOperand *brigOp = obj->getOperand(opOffset); + + switch (brigOp->kind) { + // this is immediate operand + case Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES: + { + DPRINTF(GPUReg, "sizeof(T): %lu, byteCount: %d\n", sizeof(T), + brigOp->byteCount); + + auto cbptr = (Brig::BrigOperandConstantBytes*)brigOp; + + bits = *((T*)(obj->getData(cbptr->bytes + 4))); + + return true; + } + break; + + case Brig::BRIG_KIND_OPERAND_WAVESIZE: + bits = VSZ; + return true; + + default: + return false; + } +} + +template <typename T> +bool +ImmOperand<T>::init_from_vect(unsigned opOffset, const BrigObject *obj, int at) +{ + const Brig::BrigOperand *brigOp = obj->getOperand(opOffset); + + if (brigOp->kind != Brig::BRIG_KIND_OPERAND_OPERAND_LIST) { + return false; + } + + + const Brig::BrigOperandOperandList *brigVecOp = + (const Brig::BrigOperandOperandList *)brigOp; + + unsigned *data_offset = + (unsigned *)obj->getData(brigVecOp->elements + 4 * (at + 1)); + + const Brig::BrigOperand *p = + (const Brig::BrigOperand *)obj->getOperand(*data_offset); + + if (p->kind != Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES) { + return false; + } + + return init(*data_offset, obj); +} +template<typename T> +std::string +ImmOperand<T>::disassemble() +{ + return csprintf("0x%08x", bits); +} + +template<typename RegOperand, typename T> +class RegOrImmOperand : public BaseOperand +{ + private: + bool is_imm; + + public: + void setImm(const bool value) { is_imm = value; } + + ImmOperand<T> imm_op; + RegOperand reg_op; + + RegOrImmOperand() { is_imm = false; } + void init(unsigned opOffset, const BrigObject *obj); + void init_from_vect(unsigned opOffset, const BrigObject *obj, int at); + std::string disassemble(); + + template<typename OperandType> + OperandType + get(Wavefront *w, int lane) + { + return is_imm ? 
imm_op.template get<OperandType>() : + reg_op.template get<OperandType>(w, lane); + } + + uint32_t + opSize() + { + if (!is_imm) { + return reg_op.opSize(); + } + + return 0; + } + + bool + isVectorRegister() + { + if (!is_imm) { + return reg_op.registerType == Enums::RT_VECTOR; + } + return false; + } + + bool + isCondRegister() + { + if (!is_imm) { + return reg_op.registerType == Enums::RT_CONDITION; + } + + return false; + } + + bool + isScalarRegister() + { + if (!is_imm) { + return reg_op.registerType == Enums::RT_SCALAR; + } + + return false; + } + + unsigned int + regIndex() + { + if (!is_imm) { + return reg_op.regIndex(); + } + return 0; + } +}; + +template<typename RegOperand, typename T> +void +RegOrImmOperand<RegOperand, T>::init(unsigned opOffset, const BrigObject *obj) +{ + is_imm = false; + + if (reg_op.init(opOffset, obj)) { + return; + } + + if (imm_op.init(opOffset, obj)) { + is_imm = true; + return; + } + + fatal("RegOrImmOperand::init(): bad operand kind %d\n", + obj->getOperand(opOffset)->kind); +} + +template<typename RegOperand, typename T> +void +RegOrImmOperand<RegOperand, T>::init_from_vect(unsigned opOffset, + const BrigObject *obj, int at) +{ + if (reg_op.init_from_vect(opOffset, obj, at)) { + is_imm = false; + + return; + } + + if (imm_op.init_from_vect(opOffset, obj, at)) { + is_imm = true; + + return; + } + + fatal("RegOrImmOperand::init(): bad operand kind %d\n", + obj->getOperand(opOffset)->kind); +} + +template<typename RegOperand, typename T> +std::string +RegOrImmOperand<RegOperand, T>::disassemble() +{ + return is_imm ? imm_op.disassemble() : reg_op.disassemble(); +} + +typedef RegOrImmOperand<SRegOperand, uint32_t> SRegOrImmOperand; +typedef RegOrImmOperand<DRegOperand, uint64_t> DRegOrImmOperand; +typedef RegOrImmOperand<CRegOperand, bool> CRegOrImmOperand; + +class AddrOperandBase : public BaseOperand +{ + protected: + // helper function for init() + void parseAddr(const Brig::BrigOperandAddress *op, const BrigObject *obj); + + // helper function for disassemble() + std::string disassemble(std::string reg_disassembly); + uint64_t calcUniformBase(); + + public: + virtual void calcVector(Wavefront *w, uint64_t *addrVec) = 0; + virtual uint64_t calcLane(Wavefront *w, int lane=0) = 0; + + uint64_t offset; + const char *name = nullptr; + StorageElement *storageElement; +}; + +template<typename RegOperandType> +class RegAddrOperand : public AddrOperandBase +{ + public: + RegOperandType reg; + void init(unsigned opOffset, const BrigObject *obj); + uint64_t calcUniform(); + void calcVector(Wavefront *w, uint64_t *addrVec); + uint64_t calcLane(Wavefront *w, int lane=0); + uint32_t opSize() { return reg.opSize(); } + bool isVectorRegister() { return reg.registerType == Enums::RT_VECTOR; } + bool isCondRegister() { return reg.registerType == Enums::RT_CONDITION; } + bool isScalarRegister() { return reg.registerType == Enums::RT_SCALAR; } + unsigned int regIndex() { return reg.regIndex(); } + std::string disassemble(); +}; + +template<typename RegOperandType> +void +RegAddrOperand<RegOperandType>::init(unsigned opOffset, const BrigObject *obj) +{ + using namespace Brig; + + const BrigOperand *baseOp = obj->getOperand(opOffset); + + switch (baseOp->kind) { + case BRIG_KIND_OPERAND_ADDRESS: + { + const BrigOperandAddress *op = (BrigOperandAddress*)baseOp; + storageElement = nullptr; + + offset = (uint64_t(op->offset.hi) << 32) | uint64_t(op->offset.lo); + reg.init(op->reg, obj); + + if (reg.regFileChar == 's') { + reg.regOperandSize = sizeof(uint32_t); + 
registerType = Enums::RT_VECTOR; + } + else if (reg.regFileChar == 'd') { + reg.regOperandSize = sizeof(uint64_t); + registerType = Enums::RT_VECTOR; + } + } + break; + + default: + fatal("RegAddrOperand: bad operand kind %d\n", baseOp->kind); + break; + } +} + +template<typename RegOperandType> +uint64_t +RegAddrOperand<RegOperandType>::calcUniform() +{ + fatal("can't do calcUniform() on register-based address\n"); + + return 0; +} + +template<typename RegOperandType> +void +RegAddrOperand<RegOperandType>::calcVector(Wavefront *w, uint64_t *addrVec) +{ + Addr address = calcUniformBase(); + + for (int lane = 0; lane < VSZ; ++lane) { + if (w->execMask(lane)) { + if (reg.regFileChar == 's') { + addrVec[lane] = address + reg.template get<uint32_t>(w, lane); + } else { + addrVec[lane] = address + reg.template get<Addr>(w, lane); + } + } + } +} + +template<typename RegOperandType> +uint64_t +RegAddrOperand<RegOperandType>::calcLane(Wavefront *w, int lane) +{ + Addr address = calcUniformBase(); + + return address + reg.template get<Addr>(w, lane); +} + +template<typename RegOperandType> +std::string +RegAddrOperand<RegOperandType>::disassemble() +{ + return AddrOperandBase::disassemble(reg.disassemble()); +} + +typedef RegAddrOperand<SRegOperand> SRegAddrOperand; +typedef RegAddrOperand<DRegOperand> DRegAddrOperand; + +class NoRegAddrOperand : public AddrOperandBase +{ + public: + void init(unsigned opOffset, const BrigObject *obj); + uint64_t calcUniform(); + void calcVector(Wavefront *w, uint64_t *addrVec); + uint64_t calcLane(Wavefront *w, int lane=0); + std::string disassemble(); +}; + +inline uint64_t +NoRegAddrOperand::calcUniform() +{ + return AddrOperandBase::calcUniformBase(); +} + +inline uint64_t +NoRegAddrOperand::calcLane(Wavefront *w, int lane) +{ + return calcUniform(); +} + +inline void +NoRegAddrOperand::calcVector(Wavefront *w, uint64_t *addrVec) +{ + uint64_t address = calcUniformBase(); + + for (int lane = 0; lane < VSZ; ++lane) + addrVec[lane] = address; +} + +class LabelOperand : public BaseOperand +{ + public: + Label *label; + + void init(unsigned opOffset, const BrigObject *obj); + std::string disassemble(); + + // special get method for compatibility with SRegOperand + uint32_t getTarget(Wavefront *w, int lane); + +}; + +class ListOperand : public BaseOperand +{ + public: + int elementCount; + std::vector<StorageElement*> callArgs; + + int + getSrcOperand(int idx) + { + DPRINTF(GPUReg, "getSrcOperand, idx: %d, sz_args: %d\n", idx, + callArgs.size()); + + return callArgs.at(idx)->offset; + } + + void init(unsigned opOffset, const BrigObject *obj); + + std::string disassemble(); + + template<typename OperandType> + OperandType + get(Wavefront *w, int lane, int arg_idx) + { + return w->readCallArgMem<OperandType>(lane, getSrcOperand(arg_idx)); + } + + template<typename OperandType> + void + set(Wavefront *w, int lane, OperandType val) + { + w->writeCallArgMem<OperandType>(lane, getSrcOperand(0), val); + DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: arg[%d] <- %d\n", + w->computeUnit->cu_id, w->simdId, w->wfSlotId, lane, + getSrcOperand(0), val); + } +}; + +class FunctionRefOperand : public BaseOperand +{ + public: + const char *func_name; + + void init(unsigned opOffset, const BrigObject *obj); + std::string disassemble(); +}; + +#endif // __ARCH_HSAIL_OPERAND_HH__ diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py new file mode 100644 index 000000000..bd95f6335 --- /dev/null +++ b/src/gpu-compute/GPU.py @@ -0,0 +1,310 @@ +# +# Copyright (c) 2015 Advanced Micro 
Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +# Author: Steve Reinhardt +# + +from ClockedObject import ClockedObject +from Device import DmaDevice +from m5.defines import buildEnv +from m5.params import * +from m5.proxy import * +from m5.SimObject import SimObject +from MemObject import MemObject +from Process import EmulatedDriver +from Bridge import Bridge +from LdsState import LdsState + +class PrefetchType(Enum): vals = [ + 'PF_CU', + 'PF_PHASE', + 'PF_WF', + 'PF_STRIDE', + 'PF_END', + ] + +class VectorRegisterFile(SimObject): + type = 'VectorRegisterFile' + cxx_class = 'VectorRegisterFile' + cxx_header = 'gpu-compute/vector_register_file.hh' + + simd_id = Param.Int(0, 'SIMD ID associated with this VRF') + num_regs_per_simd = Param.Int(2048, 'number of vector registers per SIMD') + min_alloc = Param.Int(4, 'min number of VGPRs allocated per WF') + +class Wavefront(SimObject): + type = 'Wavefront' + cxx_class = 'Wavefront' + cxx_header = 'gpu-compute/wavefront.hh' + + simdId = Param.Int('SIMD id (0-ComputeUnit.num_SIMDs)') + wf_slot_id = Param.Int('wavefront id (0-ComputeUnit.max_wfs)') + +class ComputeUnit(MemObject): + type = 'ComputeUnit' + cxx_class = 'ComputeUnit' + cxx_header = 'gpu-compute/compute_unit.hh' + + wavefronts = VectorParam.Wavefront('Number of wavefronts') + wfSize = Param.Int(64, 'Wavefront size (in work items)') + num_SIMDs = Param.Int(4, 'number of SIMD units per CU') + + spbypass_pipe_length = Param.Int(4, 'vector ALU Single Precision bypass '\ + 'latency') + + dpbypass_pipe_length = Param.Int(8, 'vector ALU Double Precision bypass '\ + 'latency') + + issue_period = Param.Int(4, 'number of cycles per issue period') + num_global_mem_pipes = Param.Int(1,'number of global memory pipes per CU') + num_shared_mem_pipes = Param.Int(1,'number of shared memory pipes per CU') + n_wf = Param.Int(1, 'Number of wavefront slots per SIMD') + mem_req_latency = Param.Int(9, "Latency for request from the cu to ruby. 
"\ + "Represents the pipeline to reach the TCP and "\ + "specified in GPU clock cycles") + mem_resp_latency = Param.Int(9, "Latency for responses from ruby to the "\ + "cu. Represents the pipeline between the TCP "\ + "and cu as well as TCP data array access. "\ + "Specified in GPU clock cycles") + system = Param.System(Parent.any, "system object") + cu_id = Param.Int('CU id') + vrf_to_coalescer_bus_width = Param.Int(32, "VRF->Coalescer data bus width "\ + "in bytes") + coalescer_to_vrf_bus_width = Param.Int(32, "Coalescer->VRF data bus width "\ + "in bytes") + + memory_port = VectorMasterPort("Port to the memory system") + translation_port = VectorMasterPort('Port to the TLB hierarchy') + sqc_port = MasterPort("Port to the SQC (I-cache") + sqc_tlb_port = MasterPort("Port to the TLB for the SQC (I-cache)") + perLaneTLB = Param.Bool(False, "enable per-lane TLB") + prefetch_depth = Param.Int(0, "Number of prefetches triggered at a time"\ + "(0 turns off prefetching)") + prefetch_stride = Param.Int(1, "Fixed Prefetch Stride (1 means next-page)") + prefetch_prev_type = Param.PrefetchType('PF_PHASE', "Prefetch the stride "\ + "from last mem req in lane of "\ + "CU|Phase|Wavefront") + execPolicy = Param.String("OLDEST-FIRST", "WF execution selection policy"); + xactCasMode = Param.Bool(False, "Behavior of xact_cas_load magic instr."); + debugSegFault = Param.Bool(False, "enable debugging GPU seg faults") + functionalTLB = Param.Bool(False, "Assume TLB causes no delay") + + localMemBarrier = Param.Bool(False, "Assume Barriers do not wait on "\ + "kernel end") + + countPages = Param.Bool(False, "Generate per-CU file of all pages touched "\ + "and how many times") + global_mem_queue_size = Param.Int(256, "Number of entries in the global " + "memory pipeline's queues") + local_mem_queue_size = Param.Int(256, "Number of entries in the local " + "memory pipeline's queues") + ldsBus = Bridge() # the bridge between the CU and its LDS + ldsPort = MasterPort("The port that goes to the LDS") + localDataStore = Param.LdsState("the LDS for this CU") + + vector_register_file = VectorParam.VectorRegisterFile("Vector register "\ + "file") + +class Shader(ClockedObject): + type = 'Shader' + cxx_class = 'Shader' + cxx_header = 'gpu-compute/shader.hh' + + CUs = VectorParam.ComputeUnit('Number of compute units') + n_wf = Param.Int(1, 'Number of wavefront slots per SIMD') + impl_kern_boundary_sync = Param.Bool(True, """Insert acq/rel packets into + ruby at kernel boundaries""") + separate_acquire_release = Param.Bool(False, + """Do ld_acquire/st_release generate separate requests for the + acquire and release?""") + globalmem = Param.MemorySize('64kB', 'Memory size') + timing = Param.Bool(False, 'timing memory accesses') + + cpu_pointer = Param.BaseCPU(NULL, "pointer to base CPU") + translation = Param.Bool(False, "address translation"); + +class ClDriver(EmulatedDriver): + type = 'ClDriver' + cxx_header = 'gpu-compute/cl_driver.hh' + codefile = VectorParam.String('code file name(s)') + +class GpuDispatcher(DmaDevice): + type = 'GpuDispatcher' + cxx_header = 'gpu-compute/dispatcher.hh' + # put at 8GB line for now + pio_addr = Param.Addr(0x200000000, "Device Address") + pio_latency = Param.Latency('1ns', "Programmed IO latency") + shader_pointer = Param.Shader('pointer to shader') + translation_port = MasterPort('Port to the dispatcher TLB') + cpu = Param.BaseCPU("CPU to wake up on kernel completion") + + cl_driver = Param.ClDriver('pointer to driver') + +class OpType(Enum): vals = [ + 'OT_NULL', + 'OT_ALU', + 
'OT_SPECIAL', + 'OT_GLOBAL_READ', + 'OT_GLOBAL_WRITE', + 'OT_GLOBAL_ATOMIC', + 'OT_GLOBAL_HIST', + 'OT_GLOBAL_LDAS', + 'OT_SHARED_READ', + 'OT_SHARED_WRITE', + 'OT_SHARED_ATOMIC', + 'OT_SHARED_HIST', + 'OT_SHARED_LDAS', + 'OT_PRIVATE_READ', + 'OT_PRIVATE_WRITE', + 'OT_PRIVATE_ATOMIC', + 'OT_PRIVATE_HIST', + 'OT_PRIVATE_LDAS', + 'OT_SPILL_READ', + 'OT_SPILL_WRITE', + 'OT_SPILL_ATOMIC', + 'OT_SPILL_HIST', + 'OT_SPILL_LDAS', + 'OT_READONLY_READ', + 'OT_READONLY_WRITE', + 'OT_READONLY_ATOMIC', + 'OT_READONLY_HIST', + 'OT_READONLY_LDAS', + 'OT_FLAT_READ', + 'OT_FLAT_WRITE', + 'OT_FLAT_ATOMIC', + 'OT_FLAT_HIST', + 'OT_FLAT_LDAS', + 'OT_KERN_READ', + 'OT_BRANCH', + + # note: Only the OT_BOTH_MEMFENCE seems to be supported in the 1.0F version + # of the compiler. + 'OT_SHARED_MEMFENCE', + 'OT_GLOBAL_MEMFENCE', + 'OT_BOTH_MEMFENCE', + + 'OT_BARRIER', + 'OT_PRINT', + 'OT_RET', + 'OT_NOP', + 'OT_ARG' + ] + +class MemType(Enum): vals = [ + 'M_U8', + 'M_U16', + 'M_U32', + 'M_U64', + 'M_S8', + 'M_S16', + 'M_S32', + 'M_S64', + 'M_F16', + 'M_F32', + 'M_F64', + ] + +class MemOpType(Enum): vals = [ + 'MO_LD', + 'MO_ST', + 'MO_LDAS', + 'MO_LDA', + 'MO_AAND', + 'MO_AOR', + 'MO_AXOR', + 'MO_ACAS', + 'MO_AEXCH', + 'MO_AADD', + 'MO_ASUB', + 'MO_AINC', + 'MO_ADEC', + 'MO_AMAX', + 'MO_AMIN', + 'MO_ANRAND', + 'MO_ANROR', + 'MO_ANRXOR', + 'MO_ANRCAS', + 'MO_ANREXCH', + 'MO_ANRADD', + 'MO_ANRSUB', + 'MO_ANRINC', + 'MO_ANRDEC', + 'MO_ANRMAX', + 'MO_ANRMIN', + 'MO_HAND', + 'MO_HOR', + 'MO_HXOR', + 'MO_HCAS', + 'MO_HEXCH', + 'MO_HADD', + 'MO_HSUB', + 'MO_HINC', + 'MO_HDEC', + 'MO_HMAX', + 'MO_HMIN', + 'MO_UNDEF' + ] + +class StorageClassType(Enum): vals = [ + 'SC_SPILL', + 'SC_GLOBAL', + 'SC_SHARED', + 'SC_PRIVATE', + 'SC_READONLY', + 'SC_KERNARG', + 'SC_NONE', + ] + +class RegisterType(Enum): vals = [ + 'RT_VECTOR', + 'RT_SCALAR', + 'RT_CONDITION', + 'RT_HARDWARE', + 'RT_NONE', + ] + +class GenericMemoryOrder(Enum): vals = [ + 'MEMORY_ORDER_NONE', + 'MEMORY_ORDER_RELAXED', + 'MEMORY_ORDER_SC_ACQUIRE', + 'MEMORY_ORDER_SC_RELEASE', + 'MEMORY_ORDER_SC_ACQUIRE_RELEASE', + ] + +class GenericMemoryScope(Enum): vals = [ + 'MEMORY_SCOPE_NONE', + 'MEMORY_SCOPE_WORKITEM', + 'MEMORY_SCOPE_WAVEFRONT', + 'MEMORY_SCOPE_WORKGROUP', + 'MEMORY_SCOPE_DEVICE', + 'MEMORY_SCOPE_SYSTEM', + ] diff --git a/src/gpu-compute/LdsState.py b/src/gpu-compute/LdsState.py new file mode 100644 index 000000000..6ea9f6427 --- /dev/null +++ b/src/gpu-compute/LdsState.py @@ -0,0 +1,51 @@ +# +# Copyright (c) 2015 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
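The Enum classes declared above (OpType, MemType, MemOpType, StorageClassType, RegisterType, GenericMemoryOrder, GenericMemoryScope) are turned by the build into plain C++ enumerations under the Enums namespace, which is why the instruction and pipeline code earlier in this patch can write m->m_op = Enums::MO_LD or m->scope = Enums::MEMORY_SCOPE_NONE. A hand-written stand-in for one of those generated headers, to show the shape of what that code consumes; the real header is produced automatically, so its exact layout here is an assumption:

#include <iostream>

// Stand-in for the generated enums/GenericMemoryScope.hh (illustrative;
// the real file comes out of the SimObject/Enum build step).
namespace Enums {
    enum GenericMemoryScope {
        MEMORY_SCOPE_NONE,
        MEMORY_SCOPE_WORKITEM,
        MEMORY_SCOPE_WAVEFRONT,
        MEMORY_SCOPE_WORKGROUP,
        MEMORY_SCOPE_DEVICE,
        MEMORY_SCOPE_SYSTEM,
    };
}

// Example consumer: map a scope value to a printable name.
const char *scopeName(Enums::GenericMemoryScope s)
{
    switch (s) {
      case Enums::MEMORY_SCOPE_NONE:      return "none";
      case Enums::MEMORY_SCOPE_WORKITEM:  return "workitem";
      case Enums::MEMORY_SCOPE_WAVEFRONT: return "wavefront";
      case Enums::MEMORY_SCOPE_WORKGROUP: return "workgroup";
      case Enums::MEMORY_SCOPE_DEVICE:    return "device";
      case Enums::MEMORY_SCOPE_SYSTEM:    return "system";
      default:                            return "unknown";
    }
}

int main()
{
    std::cout << scopeName(Enums::MEMORY_SCOPE_DEVICE) << "\n";
    return 0;
}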
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +# Author: Joe Gross +# + +from m5.defines import buildEnv +from m5.params import * +from m5.proxy import * + +from MemObject import MemObject + +class LdsState(MemObject): + type = 'LdsState' + cxx_class = 'LdsState' + cxx_header = 'gpu-compute/lds_state.hh' + size = Param.Int(65536, 'the size of the LDS') + range = Param.AddrRange('64kB', "address space of the LDS") + bankConflictPenalty = Param.Int(1, 'penalty per LDS bank conflict when '\ + 'accessing data') + banks = Param.Int(32, 'Number of LDS banks') + cuPort = SlavePort("port that goes to the compute unit") diff --git a/src/gpu-compute/SConscript b/src/gpu-compute/SConscript new file mode 100644 index 000000000..2de96df24 --- /dev/null +++ b/src/gpu-compute/SConscript @@ -0,0 +1,99 @@ +# -*- mode:python -*- + +# +# Copyright (c) 2015 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
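LdsState above exposes the LDS geometry as parameters: 64 KB of storage split across 32 banks, with a configurable penalty per bank conflict. A minimal sketch of how such a penalty could be charged for one cycle's worth of lane addresses; countBankConflictCycles and the 4-byte bank-word assumption are illustrative, not the simulator's actual accounting:

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <map>
#include <vector>

// Serialize accesses that land in the same LDS bank: one access per bank
// is free, and every extra access to the most contended bank is charged
// bankConflictPenalty cycles (parameter names follow LdsState.py).
int countBankConflictCycles(const std::vector<uint64_t> &laneAddrs,
                            int banks, int bankConflictPenalty,
                            int bytesPerBank = 4)
{
    std::map<int, int> accessesPerBank;
    for (uint64_t addr : laneAddrs)
        ++accessesPerBank[static_cast<int>((addr / bytesPerBank) % banks)];

    int worst = 0;
    for (const auto &entry : accessesPerBank)
        worst = std::max(worst, entry.second);

    return worst > 1 ? (worst - 1) * bankConflictPenalty : 0;
}

int main()
{
    // Addresses 0 and 128 both map to bank 0 with 32 banks of 4 bytes,
    // so one conflict penalty is charged.
    std::vector<uint64_t> addrs = {0, 128, 4, 8};
    std::cout << countBankConflictCycles(addrs, 32, 1) << "\n";   // prints 1
    return 0;
}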
+# +# Author: Anthony Gutierrez +# + +Import('*') + +if not env['BUILD_GPU']: + Return() + +SimObject('GPU.py') +SimObject('LdsState.py') +SimObject('X86GPUTLB.py') + +if env['TARGET_GPU_ISA'] == 'hsail': + Source('brig_object.cc') + Source('hsail_code.cc') + +Source('cl_driver.cc') +Source('compute_unit.cc') +Source('condition_register_state.cc') +Source('dispatcher.cc') +Source('exec_stage.cc') +Source('fetch_stage.cc') +Source('fetch_unit.cc') +Source('global_memory_pipeline.cc') +Source('gpu_dyn_inst.cc') +Source('gpu_exec_context.cc') +Source('gpu_static_inst.cc') +Source('gpu_tlb.cc') +Source('hsa_object.cc') +Source('kernel_cfg.cc') +Source('lds_state.cc') +Source('local_memory_pipeline.cc') +Source('of_scheduling_policy.cc') +Source('pool_manager.cc') +Source('rr_scheduling_policy.cc') +Source('schedule_stage.cc') +Source('scheduler.cc') +Source('scoreboard_check_stage.cc') +Source('shader.cc') +Source('simple_pool_manager.cc') +Source('tlb_coalescer.cc') +Source('vector_register_file.cc') +Source('vector_register_state.cc') +Source('wavefront.cc') + +DebugFlag('BRIG') +DebugFlag('GPUCoalescer') +DebugFlag('GPUDisp') +DebugFlag('GPUExec') +DebugFlag('GPUFetch') +DebugFlag('GPUHsailCFInfo') +DebugFlag('GPUMem') +DebugFlag('GPUPort') +DebugFlag('GPUPrefetch') +DebugFlag('GPUReg') +DebugFlag('GPUSync') +DebugFlag('GPUTLB') +DebugFlag('HSALoader') +DebugFlag('HSAIL') +DebugFlag('HSAILObject') +DebugFlag('Predictor') +DebugFlag('WavefrontStack') + +CompoundFlag('GPUALL', ['GPUCoalescer', 'GPUDisp', 'GPUExec', 'GPUFetch', + 'GPUMem', 'GPUPort', 'GPUSync', 'GPUTLB', 'HSAIL']) diff --git a/src/gpu-compute/X86GPUTLB.py b/src/gpu-compute/X86GPUTLB.py new file mode 100644 index 000000000..51f8e514e --- /dev/null +++ b/src/gpu-compute/X86GPUTLB.py @@ -0,0 +1,77 @@ +# +# Copyright (c) 2011-2015 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
+# +# Author: Lisa Hsu +# + +from m5.defines import buildEnv +from m5.params import * +from m5.proxy import * + +from m5.objects.MemObject import MemObject + +if buildEnv['FULL_SYSTEM']: + class X86PagetableWalker(MemObject): + type = 'X86PagetableWalker' + cxx_class = 'X86ISA::Walker' + port = SlavePort("Port for the hardware table walker") + system = Param.System(Parent.any, "system object") + +class X86GPUTLB(MemObject): + type = 'X86GPUTLB' + cxx_class = 'X86ISA::GpuTLB' + cxx_header = 'gpu-compute/gpu_tlb.hh' + size = Param.Int(64, "TLB size (number of entries)") + assoc = Param.Int(64, "TLB associativity") + + if buildEnv['FULL_SYSTEM']: + walker = Param.X86PagetableWalker(X86PagetableWalker(), + "page table walker") + + hitLatency = Param.Int(2, "Latency of a TLB hit") + missLatency1 = Param.Int(5, "Latency #1 of a TLB miss") + missLatency2 = Param.Int(100, "Latency #2 of a TLB miss") + maxOutstandingReqs = Param.Int(64, "# of maximum outstanding requests") + slave = VectorSlavePort("Port on side closer to CPU/CU") + master = VectorMasterPort("Port on side closer to memory") + allocationPolicy = Param.Bool(True, "Allocate on an access") + accessDistance = Param.Bool(False, "print accessDistance stats") + +class TLBCoalescer(MemObject): + type = 'TLBCoalescer' + cxx_class = 'TLBCoalescer' + cxx_header = 'gpu-compute/tlb_coalescer.hh' + probesPerCycle = Param.Int(2, "Number of TLB probes per cycle") + coalescingWindow = Param.Int(1, "Permit coalescing across that many ticks") + slave = VectorSlavePort("Port on side closer to CPU/CU") + master = VectorMasterPort("Port on side closer to memory") + disableCoalescing = Param.Bool(False,"Dispable Coalescing") diff --git a/src/gpu-compute/brig_object.cc b/src/gpu-compute/brig_object.cc new file mode 100644 index 000000000..7cc9b7cc4 --- /dev/null +++ b/src/gpu-compute/brig_object.cc @@ -0,0 +1,474 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
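X86GPUTLB above defaults to size = 64 and assoc = 64, i.e. a single fully-associative set; a configuration with a smaller associativity would typically split the entries into size/assoc sets selected by low bits of the virtual page number. A small sketch of that standard set-indexing arithmetic, assuming 4 KB pages; the names are illustrative and this is not a claim about the exact scheme in gpu_tlb.cc:

#include <cassert>
#include <cstdint>
#include <iostream>

// Derive set-associative TLB geometry from size/assoc style parameters
// and pick the set for a virtual address (4 KB pages assumed).
struct TlbGeometry
{
    int numSets;

    TlbGeometry(int size, int assoc) : numSets(size / assoc)
    {
        assert(assoc > 0 && size % assoc == 0);
    }

    int setIndex(uint64_t vaddr) const
    {
        uint64_t vpn = vaddr >> 12;                    // strip the page offset
        return static_cast<int>(vpn % static_cast<uint64_t>(numSets));
    }
};

int main()
{
    TlbGeometry dflt(64, 64);    // default: one fully-associative set
    TlbGeometry eightWay(64, 8); // hypothetical 8-way split into 8 sets

    uint64_t vaddr = 0x7f001234000ull;
    std::cout << dflt.setIndex(vaddr) << "\n";      // always 0
    std::cout << eightWay.setIndex(vaddr) << "\n";  // 0x7f001234 % 8 = 4
    return 0;
}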
+ * + * Author: Steve Reinhardt, Anthony Gutierrez + */ + +#include "gpu-compute/brig_object.hh" + +#include <fcntl.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <unistd.h> + +#include <cassert> +#include <cstddef> +#include <cstdlib> + +#include "arch/hsail/Brig.h" +#include "base/misc.hh" +#include "base/trace.hh" +#include "debug/BRIG.hh" +#include "debug/HSAILObject.hh" +#include "debug/HSALoader.hh" + +using namespace Brig; + +std::vector<std::function<HsaObject*(const std::string&, int, uint8_t*)>> + HsaObject::tryFileFuncs = { BrigObject::tryFile }; + +extern int getBrigDataTypeBytes(BrigType16_t t); + +const char *BrigObject::sectionNames[] = +{ + "hsa_data", + "hsa_code", + "hsa_operand", + ".shstrtab" +}; + +const char *segmentNames[] = +{ + "none", + "flat", + "global", + "readonly", + "kernarg", + "group", + "private", + "spill", + "args" +}; + +const uint8_t* +BrigObject::getSectionOffset(enum SectionIndex sec, int offs) const +{ + // allow offs == size for dummy end pointers + assert(offs <= sectionInfo[sec].size); + + return sectionInfo[sec].ptr + offs; +} + +const char* +BrigObject::getString(int offs) const +{ + return (const char*)(getSectionOffset(DataSectionIndex, offs) + 4); +} + +const BrigBase* +BrigObject::getCodeSectionEntry(int offs) const +{ + return (const BrigBase*)getSectionOffset(CodeSectionIndex, offs); +} + +const BrigData* +BrigObject::getBrigBaseData(int offs) const +{ + return (Brig::BrigData*)(getSectionOffset(DataSectionIndex, offs)); +} + +const uint8_t* +BrigObject::getData(int offs) const +{ + return getSectionOffset(DataSectionIndex, offs); +} + +const BrigOperand* +BrigObject::getOperand(int offs) const +{ + return (const BrigOperand*)getSectionOffset(OperandsSectionIndex, offs); +} + +unsigned +BrigObject::getOperandPtr(int offs, int index) const +{ + unsigned *op_offs = (unsigned*)(getData(offs + 4 * (index + 1))); + + return *op_offs; +} + +const BrigInstBase* +BrigObject::getInst(int offs) const +{ + return (const BrigInstBase*)getSectionOffset(CodeSectionIndex, offs); +} + +HsaCode* +BrigObject::getKernel(const std::string &name) const +{ + return nullptr; +} + +HsaCode* +BrigObject::getFunction(const std::string &name) const +{ + for (int i = 0; i < functions.size(); ++i) { + if (functions[i]->name() == name) { + return functions[i]; + } + } + + return nullptr; +} + +void +BrigObject::processDirectives(const BrigBase *dirPtr, const BrigBase *endPtr, + StorageMap *storageMap) +{ + while (dirPtr < endPtr) { + if (!dirPtr->byteCount) { + fatal("Bad directive size 0\n"); + } + + // calculate next pointer now so we can override it if needed + const BrigBase *nextDirPtr = brigNext(dirPtr); + + DPRINTF(HSAILObject, "Code section entry kind: #%x, byte count: %d\n", + dirPtr->kind, dirPtr->byteCount); + + switch (dirPtr->kind) { + case BRIG_KIND_DIRECTIVE_FUNCTION: + { + const BrigDirectiveExecutable *p M5_VAR_USED = + reinterpret_cast<const BrigDirectiveExecutable*>(dirPtr); + + DPRINTF(HSAILObject,"DIRECTIVE_FUNCTION: %s offset: " + "%d next: %d\n", getString(p->name), + p->firstCodeBlockEntry, p->nextModuleEntry); + + if (p->firstCodeBlockEntry != p->nextModuleEntry) { + panic("Function calls are not fully supported yet!!: %s\n", + getString(p->name)); + + const char *name = getString(p->name); + + HsailCode *code_obj = nullptr; + + for (int i = 0; i < functions.size(); ++i) { + if (functions[i]->name() == name) { + code_obj = functions[i]; + break; + } + } + + if (!code_obj) { + // create new local storage map for kernel-local 
symbols + code_obj = new HsailCode(name, p, this, + new StorageMap(storageMap)); + functions.push_back(code_obj); + } else { + panic("Multiple definition of Function!!: %s\n", + getString(p->name)); + } + + } + nextDirPtr = getCodeSectionEntry(p->nextModuleEntry); + } + break; + + case BRIG_KIND_DIRECTIVE_KERNEL: + { + const BrigDirectiveExecutable *p = + reinterpret_cast<const BrigDirectiveExecutable*>(dirPtr); + + DPRINTF(HSAILObject,"DIRECTIVE_KERNEL: %s offset: %d count: " + "next: %d\n", getString(p->name), + p->firstCodeBlockEntry, p->nextModuleEntry); + + const char *name = getString(p->name); + + if (name[0] == '&') + name++; + + std::string str = name; + char *temp; + int len = str.length(); + + if (str[len - 1] >= 'a' && str[len - 1] <= 'z') { + temp = new char[str.size() + 1]; + std::copy(str.begin(), str.end() , temp); + temp[str.size()] = '\0'; + } else { + temp = new char[str.size()]; + std::copy(str.begin(), str.end() - 1 , temp); + temp[str.size() - 1 ] = '\0'; + } + + std::string kernel_name = temp; + delete[] temp; + + HsailCode *code_obj = nullptr; + + for (const auto &kernel : kernels) { + if (kernel->name() == kernel_name) { + code_obj = kernel; + break; + } + } + + if (!code_obj) { + // create new local storage map for kernel-local symbols + code_obj = new HsailCode(kernel_name, p, this, + new StorageMap(storageMap)); + + kernels.push_back(code_obj); + } + + nextDirPtr = getCodeSectionEntry(p->nextModuleEntry); + } + break; + + case BRIG_KIND_DIRECTIVE_VARIABLE: + { + const BrigDirectiveVariable *p = + reinterpret_cast<const BrigDirectiveVariable*>(dirPtr); + + uint64_t readonlySize_old = + storageMap->getSize(BRIG_SEGMENT_READONLY); + + StorageElement* se = storageMap->addSymbol(p, this); + + DPRINTF(HSAILObject, "DIRECTIVE_VARIABLE, symbol %s\n", + getString(p->name)); + + if (p->segment == BRIG_SEGMENT_READONLY) { + // readonly memory has initialization data + uint8_t* readonlyData_old = readonlyData; + + readonlyData = + new uint8_t[storageMap->getSize(BRIG_SEGMENT_READONLY)]; + + if (p->init) { + if ((p->type == BRIG_TYPE_ROIMG) || + (p->type == BRIG_TYPE_WOIMG) || + (p->type == BRIG_TYPE_SAMP) || + (p->type == BRIG_TYPE_SIG32) || + (p->type == BRIG_TYPE_SIG64)) { + panic("Read only data type not supported: %s\n", + getString(p->name)); + } + + const BrigOperand *brigOp = getOperand(p->init); + assert(brigOp->kind == + BRIG_KIND_OPERAND_CONSTANT_BYTES); + + const Brig::BrigData *operand_data M5_VAR_USED = + getBrigBaseData(((BrigOperandConstantBytes*) + brigOp)->bytes); + + assert((operand_data->byteCount / 4) > 0); + + uint8_t *symbol_data = + (uint8_t*)getData(((BrigOperandConstantBytes*) + brigOp)->bytes + 4); + + // copy the old data and add the new data + if (readonlySize_old > 0) { + memcpy(readonlyData, readonlyData_old, + readonlySize_old); + } + + memcpy(readonlyData + se->offset, symbol_data, + se->size); + + delete[] readonlyData_old; + } + } + } + break; + + case BRIG_KIND_DIRECTIVE_LABEL: + { + const BrigDirectiveLabel M5_VAR_USED *p = + reinterpret_cast<const BrigDirectiveLabel*>(dirPtr); + + panic("Label directives cannot be at the module level: %s\n", + getString(p->name)); + + } + break; + + case BRIG_KIND_DIRECTIVE_COMMENT: + { + const BrigDirectiveComment M5_VAR_USED *p = + reinterpret_cast<const BrigDirectiveComment*>(dirPtr); + + DPRINTF(HSAILObject, "DIRECTIVE_COMMENT: %s\n", + getString(p->name)); + } + break; + + case BRIG_KIND_DIRECTIVE_LOC: + { + DPRINTF(HSAILObject, "BRIG_DIRECTIVE_LOC\n"); + } + break; + + case 
BRIG_KIND_DIRECTIVE_MODULE: + { + const BrigDirectiveModule M5_VAR_USED *p = + reinterpret_cast<const BrigDirectiveModule*>(dirPtr); + + DPRINTF(HSAILObject, "BRIG_DIRECTIVE_MODULE: %s\n", + getString(p->name)); + } + break; + + case BRIG_KIND_DIRECTIVE_CONTROL: + { + DPRINTF(HSAILObject, "DIRECTIVE_CONTROL\n"); + } + break; + + case BRIG_KIND_DIRECTIVE_PRAGMA: + { + DPRINTF(HSAILObject, "DIRECTIVE_PRAGMA\n"); + } + break; + + case BRIG_KIND_DIRECTIVE_EXTENSION: + { + DPRINTF(HSAILObject, "DIRECTIVE_EXTENSION\n"); + } + break; + + case BRIG_KIND_DIRECTIVE_ARG_BLOCK_START: + { + DPRINTF(HSAILObject, "DIRECTIVE_ARG_BLOCK_START\n"); + } + break; + + case BRIG_KIND_DIRECTIVE_ARG_BLOCK_END: + { + DPRINTF(HSAILObject, "DIRECTIVE_ARG_BLOCK_END\n"); + } + break; + default: + if (dirPtr->kind >= BRIG_KIND_INST_BEGIN && + dirPtr->kind <= BRIG_KIND_INST_END) + break; + + if (dirPtr->kind >= BRIG_KIND_OPERAND_BEGIN && + dirPtr->kind <= BRIG_KIND_OPERAND_END) + break; + + warn("Unknown Brig directive kind: %d\n", dirPtr->kind); + break; + } + + dirPtr = nextDirPtr; + } +} + +HsaObject* +BrigObject::tryFile(const std::string &fname, int len, uint8_t *fileData) +{ + const char *brig_ident = "HSA BRIG"; + + if (memcmp(brig_ident, fileData, MODULE_IDENTIFICATION_LENGTH)) + return nullptr; + + return new BrigObject(fname, len, fileData); +} + +BrigObject::BrigObject(const std::string &fname, int len, uint8_t *fileData) + : HsaObject(fname), storageMap(new StorageMap()) +{ + const char *brig_ident = "HSA BRIG"; + BrigModuleHeader *mod_hdr = (BrigModuleHeader*)fileData; + + fatal_if(memcmp(brig_ident, mod_hdr, MODULE_IDENTIFICATION_LENGTH), + "%s is not a BRIG file\n", fname); + + if (mod_hdr->brigMajor != BRIG_VERSION_BRIG_MAJOR || + mod_hdr->brigMinor != BRIG_VERSION_BRIG_MINOR) { + fatal("%s: BRIG version mismatch, %d.%d != %d.%d\n", + fname, mod_hdr->brigMajor, mod_hdr->brigMinor, + BRIG_VERSION_BRIG_MAJOR, BRIG_VERSION_BRIG_MINOR); + } + + fatal_if(mod_hdr->sectionCount != NumSectionIndices, "%s: BRIG section " + "count (%d) != expected value (%d)\n", fname, + mod_hdr->sectionCount, NumSectionIndices); + + for (int i = 0; i < NumSectionIndices; ++i) { + sectionInfo[i].ptr = nullptr; + } + + uint64_t *sec_idx_table = (uint64_t*)(fileData + mod_hdr->sectionIndex); + for (int sec_idx = 0; sec_idx < mod_hdr->sectionCount; ++sec_idx) { + uint8_t *sec_hdr_byte_ptr = fileData + sec_idx_table[sec_idx]; + BrigSectionHeader *sec_hdr = (BrigSectionHeader*)sec_hdr_byte_ptr; + + // It doesn't look like cprintf supports string precision values, + // but if this breaks, the right answer is to fix that + DPRINTF(HSAILObject, "found section %.*s\n", sec_hdr->nameLength, + sec_hdr->name); + + sectionInfo[sec_idx].ptr = new uint8_t[sec_hdr->byteCount]; + memcpy(sectionInfo[sec_idx].ptr, sec_hdr_byte_ptr, sec_hdr->byteCount); + sectionInfo[sec_idx].size = sec_hdr->byteCount; + } + + BrigSectionHeader *code_hdr = + (BrigSectionHeader*)sectionInfo[CodeSectionIndex].ptr; + + DPRINTF(HSAILObject, "Code section hdr, count: %d, hdr count: %d, " + "name len: %d\n", code_hdr->byteCount, code_hdr->headerByteCount, + code_hdr->nameLength); + + // start at offset 4 to skip initial null entry (see Brig spec) + processDirectives(getCodeSectionEntry(code_hdr->headerByteCount), + getCodeSectionEntry(sectionInfo[CodeSectionIndex].size), + storageMap); + + delete[] fileData; + + DPRINTF(HSALoader, "BRIG object %s loaded.\n", fname); +} + +BrigObject::~BrigObject() +{ + for (int i = 0; i < NumSectionIndices; ++i) + if (sectionInfo[i].ptr) + 
delete[] sectionInfo[i].ptr; +} diff --git a/src/gpu-compute/brig_object.hh b/src/gpu-compute/brig_object.hh new file mode 100644 index 000000000..59a585914 --- /dev/null +++ b/src/gpu-compute/brig_object.hh @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt, Anthony Gutierrez + */ + +#ifndef __BRIG_OBJECT_HH__ +#define __BRIG_OBJECT_HH__ + +#include <cassert> +#include <cstdint> +#include <string> +#include <vector> + +#include "arch/hsail/Brig.h" +#include "gpu-compute/hsa_object.hh" +#include "gpu-compute/hsail_code.hh" + +class LabelMap; +class StorageMap; + +/* @class BrigObject + * this class implements the BRIG loader object, and + * is used when the simulator directly executes HSAIL. + * this class is responsible for extracting all + * information about kernels contained in BRIG format + * and converts them to HsailCode objects that are + * usable by the simulator and emulated runtime. 
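
Every entry in the hsa_code section is self-describing through BrigBase::byteCount, which is what lets processDirectives() above walk directives, instructions and operands with a single loop. A sketch of that traversal idiom, using the public accessors declared in this header and the brigNext() helper defined at its end (dumpCodeSection() is a hypothetical function; the start and end offsets would come from the section header, as in the BrigObject constructor, and passing the full section size as the end offset is permitted by getSectionOffset()):

    #include <cstdio>

    #include "gpu-compute/brig_object.hh"

    void
    dumpCodeSection(const BrigObject *obj, int start, int size)
    {
        const Brig::BrigBase *entry = obj->getCodeSectionEntry(start);
        const Brig::BrigBase *end   = obj->getCodeSectionEntry(size);

        while (entry < end) {
            std::printf("entry kind %#x, %d bytes\n",
                        entry->kind, entry->byteCount);
            entry = brigNext(entry);   // advance by entry->byteCount
        }
    }

The class declaration itself resumes below.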
+ */ + +class BrigObject final : public HsaObject +{ + public: + enum SectionIndex + { + DataSectionIndex, + CodeSectionIndex, + OperandsSectionIndex, + NumSectionIndices + }; + + static const char *sectionNames[]; + + struct SectionInfo + { + uint8_t *ptr; + int size; + }; + + static HsaObject* tryFile(const std::string &fname, int len, + uint8_t *fileData); + + SectionInfo sectionInfo[NumSectionIndices]; + const uint8_t *getSectionOffset(enum SectionIndex sec, int offs) const; + + std::vector<HsailCode*> kernels; + std::vector<HsailCode*> functions; + std::string kern_block_name; + + void processDirectives(const Brig::BrigBase *dirPtr, + const Brig::BrigBase *endPtr, + StorageMap *storageMap); + + BrigObject(const std::string &fname, int len, uint8_t *fileData); + ~BrigObject(); + + // eventually these will need to be per-kernel not per-object-file + StorageMap *storageMap; + LabelMap *labelMap; + + const char* getString(int offs) const; + const Brig::BrigData* getBrigBaseData(int offs) const; + const uint8_t* getData(int offs) const; + const Brig::BrigBase* getCodeSectionEntry(int offs) const; + const Brig::BrigOperand* getOperand(int offs) const; + unsigned getOperandPtr(int offs, int index) const; + const Brig::BrigInstBase* getInst(int offs) const; + + HsaCode* getKernel(const std::string &name) const override; + HsaCode* getFunction(const std::string &name) const override; + + int numKernels() const override { return kernels.size(); } + + HsaCode* getKernel(int i) const override { return kernels[i]; } + + // pointer to the current kernel/function we're processing, so elements + // under construction can reference it. kinda ugly, but easier + // than passing it all over for the few places it's needed. + mutable HsailCode *currentCode; +}; + +// Utility function to bump Brig item pointer to next element given +// item size in bytes. Really just an add but with lots of casting. +template<typename T> +T* +brigNext(T *ptr) +{ + Brig::BrigBase *base_ptr = (Brig::BrigBase*)ptr; + int size = base_ptr->byteCount; + assert(size); + + return (T*)((uint8_t*)ptr + size); +} + +#endif // __BRIG_OBJECT_HH__ diff --git a/src/gpu-compute/cl_driver.cc b/src/gpu-compute/cl_driver.cc new file mode 100644 index 000000000..3b3291c03 --- /dev/null +++ b/src/gpu-compute/cl_driver.cc @@ -0,0 +1,272 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#include "gpu-compute/cl_driver.hh" + +#include "base/intmath.hh" +#include "cpu/thread_context.hh" +#include "gpu-compute/dispatcher.hh" +#include "gpu-compute/hsa_code.hh" +#include "gpu-compute/hsa_kernel_info.hh" +#include "gpu-compute/hsa_object.hh" +#include "params/ClDriver.hh" +#include "sim/process.hh" +#include "sim/syscall_emul_buf.hh" + +ClDriver::ClDriver(ClDriverParams *p) + : EmulatedDriver(p), hsaCode(0) +{ + for (const auto &codeFile : p->codefile) + codeFiles.push_back(&codeFile); + + maxFuncArgsSize = 0; + + for (int i = 0; i < codeFiles.size(); ++i) { + HsaObject *obj = HsaObject::createHsaObject(*codeFiles[i]); + + for (int k = 0; k < obj->numKernels(); ++k) { + assert(obj->getKernel(k)); + kernels.push_back(obj->getKernel(k)); + kernels.back()->setReadonlyData((uint8_t*)obj->readonlyData); + int kern_funcargs_size = kernels.back()->funcarg_size; + maxFuncArgsSize = maxFuncArgsSize < kern_funcargs_size ? + kern_funcargs_size : maxFuncArgsSize; + } + } + + int name_offs = 0; + int code_offs = 0; + + for (int i = 0; i < kernels.size(); ++i) { + kernelInfo.push_back(HsaKernelInfo()); + HsaCode *k = kernels[i]; + + k->generateHsaKernelInfo(&kernelInfo[i]); + + kernelInfo[i].name_offs = name_offs; + kernelInfo[i].code_offs = code_offs; + + name_offs += k->name().size() + 1; + code_offs += k->numInsts() * sizeof(GPUStaticInst*); + } +} + +void +ClDriver::handshake(GpuDispatcher *_dispatcher) +{ + dispatcher = _dispatcher; + dispatcher->setFuncargsSize(maxFuncArgsSize); +} + +int +ClDriver::open(LiveProcess *p, ThreadContext *tc, int mode, int flags) +{ + int fd = p->allocFD(-1, filename, 0, 0, false); + FDEntry *fde = p->getFDEntry(fd); + fde->driver = this; + + return fd; +} + +int +ClDriver::ioctl(LiveProcess *process, ThreadContext *tc, unsigned req) +{ + int index = 2; + Addr buf_addr = process->getSyscallArg(tc, index); + + switch (req) { + case HSA_GET_SIZES: + { + TypedBufferArg<HsaDriverSizes> sizes(buf_addr); + sizes->num_kernels = kernels.size(); + sizes->string_table_size = 0; + sizes->code_size = 0; + sizes->readonly_size = 0; + + if (kernels.size() > 0) { + // all kernels will share the same read-only memory + sizes->readonly_size = + kernels[0]->getSize(HsaCode::MemorySegment::READONLY); + // check our assumption + for (int i = 1; i<kernels.size(); ++i) { + assert(sizes->readonly_size == + kernels[i]->getSize(HsaCode::MemorySegment::READONLY)); + } + } + + for (int i = 0; i < kernels.size(); ++i) { + HsaCode *k = kernels[i]; + // add one for terminating '\0' + sizes->string_table_size += k->name().size() + 1; + sizes->code_size += k->numInsts() * sizeof(GPUStaticInst*); + } + + sizes.copyOut(tc->getMemProxy()); + } + break; + + case HSA_GET_KINFO: + { + TypedBufferArg<HsaKernelInfo> + kinfo(buf_addr, sizeof(HsaKernelInfo) * kernels.size()); + + for (int i = 0; i < kernels.size(); ++i) { + HsaKernelInfo *ki = &kinfo[i]; + ki->name_offs = kernelInfo[i].name_offs; + ki->code_offs = 
kernelInfo[i].code_offs; + ki->sRegCount = kernelInfo[i].sRegCount; + ki->dRegCount = kernelInfo[i].dRegCount; + ki->cRegCount = kernelInfo[i].cRegCount; + ki->static_lds_size = kernelInfo[i].static_lds_size; + ki->private_mem_size = kernelInfo[i].private_mem_size; + ki->spill_mem_size = kernelInfo[i].spill_mem_size; + } + + kinfo.copyOut(tc->getMemProxy()); + } + break; + + case HSA_GET_STRINGS: + { + int string_table_size = 0; + for (int i = 0; i < kernels.size(); ++i) { + HsaCode *k = kernels[i]; + string_table_size += k->name().size() + 1; + } + + BufferArg buf(buf_addr, string_table_size); + char *bufp = (char*)buf.bufferPtr(); + + for (int i = 0; i < kernels.size(); ++i) { + HsaCode *k = kernels[i]; + const char *n = k->name().c_str(); + + // idiomatic string copy + while ((*bufp++ = *n++)); + } + + assert(bufp - (char *)buf.bufferPtr() == string_table_size); + + buf.copyOut(tc->getMemProxy()); + } + break; + + case HSA_GET_READONLY_DATA: + { + // we can pick any kernel --- they share the same + // readonly segment (this assumption is checked in GET_SIZES) + uint64_t size = + kernels.back()->getSize(HsaCode::MemorySegment::READONLY); + BufferArg data(buf_addr, size); + char *datap = (char *)data.bufferPtr(); + memcpy(datap, + kernels.back()->readonly_data, + size); + data.copyOut(tc->getMemProxy()); + } + break; + + case HSA_GET_CODE: + { + // set hsaCode pointer + hsaCode = buf_addr; + int code_size = 0; + + for (int i = 0; i < kernels.size(); ++i) { + HsaCode *k = kernels[i]; + code_size += k->numInsts() * sizeof(TheGpuISA::RawMachInst); + } + + TypedBufferArg<TheGpuISA::RawMachInst> buf(buf_addr, code_size); + TheGpuISA::RawMachInst *bufp = buf; + + int buf_idx = 0; + + for (int i = 0; i < kernels.size(); ++i) { + HsaCode *k = kernels[i]; + + for (int j = 0; j < k->numInsts(); ++j) { + bufp[buf_idx] = k->insts()->at(j); + ++buf_idx; + } + } + + buf.copyOut(tc->getMemProxy()); + } + break; + + case HSA_GET_CU_CNT: + { + BufferArg buf(buf_addr, sizeof(uint32_t)); + *((uint32_t*)buf.bufferPtr()) = dispatcher->getNumCUs(); + buf.copyOut(tc->getMemProxy()); + } + break; + + case HSA_GET_VSZ: + { + BufferArg buf(buf_addr, sizeof(uint32_t)); + *((uint32_t*)buf.bufferPtr()) = VSZ; + buf.copyOut(tc->getMemProxy()); + } + break; + + default: + fatal("ClDriver: bad ioctl %d\n", req); + } + + return 0; +} + +const char* +ClDriver::codeOffToKernelName(uint64_t code_ptr) +{ + assert(hsaCode); + uint32_t code_offs = code_ptr - hsaCode; + + for (int i = 0; i < kernels.size(); ++i) { + if (code_offs == kernelInfo[i].code_offs) { + return kernels[i]->name().c_str(); + } + } + + return nullptr; +} + +ClDriver* +ClDriverParams::create() +{ + return new ClDriver(this); +} diff --git a/src/gpu-compute/cl_driver.hh b/src/gpu-compute/cl_driver.hh new file mode 100644 index 000000000..03567bab5 --- /dev/null +++ b/src/gpu-compute/cl_driver.hh @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#ifndef __CL_DRIVER_HH__ +#define __CL_DRIVER_HH__ + +#include <vector> + +#include "gpu-compute/hsa_kernel_info.hh" +#include "sim/emul_driver.hh" + +class GpuDispatcher; +class HsaCode; +class LiveProcess; +class ThreadContext; + +struct ClDriverParams; + +class ClDriver final : public EmulatedDriver +{ + public: + ClDriver(ClDriverParams *p); + void handshake(GpuDispatcher *_dispatcher); + int open(LiveProcess *p, ThreadContext *tc, int mode, int flags); + int ioctl(LiveProcess *p, ThreadContext *tc, unsigned req); + const char* codeOffToKernelName(uint64_t code_ptr); + + private: + GpuDispatcher *dispatcher; + + std::vector<const std::string*> codeFiles; + + // All the kernels we know about + std::vector<HsaCode*> kernels; + std::vector<HsaCode*> functions; + + std::vector<HsaKernelInfo> kernelInfo; + + // maximum size necessary for function arguments + int maxFuncArgsSize; + // The host virtual address for the kernel code + uint64_t hsaCode; +}; + +#endif // __CL_DRIVER_HH__ diff --git a/src/gpu-compute/cl_event.hh b/src/gpu-compute/cl_event.hh new file mode 100644 index 000000000..75297a2d2 --- /dev/null +++ b/src/gpu-compute/cl_event.hh @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Authors: Marc Orr + */ + +#ifndef __GPU_CL_EVENT_HH__ +#define __GPU_CL_EVENT_HH__ + +struct HsaQueueEntry; + +class _cl_event { + public: + _cl_event() : done(false), hsaTaskPtr(nullptr), start(0), end(0) { } + + volatile bool done; + HsaQueueEntry *hsaTaskPtr; + uint64_t start; + uint64_t end; +}; + +#endif // __GPU_CL_EVENT_HH__ diff --git a/src/gpu-compute/code_enums.hh b/src/gpu-compute/code_enums.hh new file mode 100644 index 000000000..126cf6c50 --- /dev/null +++ b/src/gpu-compute/code_enums.hh @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
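
The code_enums.hh header that follows defines coarse predicates over the instruction operation-type enum (Enums::OT_*): per-segment READ/WRITE/ATOMIC/HIST groupings plus _GM/_LM/_PM variants that bundle the segments serviced by the global, local and private memory paths. A sketch of the kind of steering check they enable (isGlobalMemOp() is hypothetical and templated only so the sketch does not have to name the generated enum type; the gem5-generated definitions of the Enums::OT_* values are assumed to be visible at the point of use):

    #include "gpu-compute/code_enums.hh"

    template <typename OpType>
    bool
    isGlobalMemOp(OpType ot)
    {
        // The _GM variants group the accesses routed through the global
        // memory pipeline: global, spill and readonly, plus the global/both
        // memory fences in the atomic case.
        return IS_OT_READ_GM(ot) || IS_OT_WRITE_GM(ot) || IS_OT_ATOMIC_GM(ot);
    }

The _LM and _PM variants support the same pattern for the local- and private-memory paths.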
+ * + * Author: Anthony Gutierrez + */ + +#ifndef __CODE_ENUMS_HH__ +#define __CODE_ENUMS_HH__ + +#define IS_OT_GLOBAL(a) ((a)>=Enums::OT_GLOBAL_READ \ + && (a)<=Enums::OT_GLOBAL_LDAS) +#define IS_OT_SHARED(a) ((a)>=Enums::OT_SHARED_READ \ + && (a)<=Enums::OT_SHARED_LDAS) +#define IS_OT_PRIVATE(a) ((a)>=Enums::OT_PRIVATE_READ \ + && (a)<=Enums::OT_PRIVATE_LDAS) +#define IS_OT_SPILL(a) ((a)>=Enums::OT_SPILL_READ \ + && (a)<=Enums::OT_SPILL_LDAS) +#define IS_OT_READONLY(a) ((a)>=Enums::OT_READONLY_READ \ + && (a)<=Enums::OT_READONLY_LDAS) +#define IS_OT_FLAT(a) ((a)>=Enums::OT_FLAT_READ && (a)<=Enums::OT_FLAT_LDAS) + +#define IS_OT_LDAS(a) ((a)==Enums::OT_GLOBAL_LDAS||(a)==Enums::OT_SHARED_LDAS \ + ||(a)==Enums::OT_PRIVATE_LDAS||(a)==Enums::OT_SPILL_LDAS \ + ||(a)==Enums::OT_READONLY_LDAS||(a)==Enums::OT_FLAT_LDAS) + +#define IS_OT_READ(a) ((a)==Enums::OT_GLOBAL_READ||(a)==Enums::OT_SHARED_READ \ + ||(a)==Enums::OT_PRIVATE_READ||(a)==Enums::OT_SPILL_READ \ + ||(a)==Enums::OT_READONLY_READ||(a)==Enums::OT_FLAT_READ) + +#define IS_OT_READ_GM(a) \ + ((a)==Enums::OT_GLOBAL_READ||(a)==Enums::OT_SPILL_READ \ + ||(a)==Enums::OT_READONLY_READ) + +#define IS_OT_READ_LM(a) ((a)==Enums::OT_SHARED_READ) + +#define IS_OT_READ_RM(a) ((a)==Enums::OT_READONLY_READ) + +#define IS_OT_READ_PM(a) ((a)==Enums::OT_PRIVATE_READ) + +#define IS_OT_WRITE(a) \ + ((a)==Enums::OT_GLOBAL_WRITE||(a)==Enums::OT_SHARED_WRITE \ + ||(a)==Enums::OT_PRIVATE_WRITE||(a)==Enums::OT_SPILL_WRITE \ + ||(a)==Enums::OT_READONLY_WRITE||(a)==Enums::OT_FLAT_WRITE) + +#define IS_OT_WRITE_GM(a) \ + ((a)==Enums::OT_GLOBAL_WRITE||(a)==Enums::OT_SPILL_WRITE \ + ||(a)==Enums::OT_READONLY_WRITE) + +#define IS_OT_WRITE_LM(a) ((a)==Enums::OT_SHARED_WRITE) + +#define IS_OT_WRITE_PM(a) ((a)==Enums::OT_PRIVATE_WRITE) + +#define IS_OT_ATOMIC(a) ((a)==Enums::OT_GLOBAL_ATOMIC \ + ||(a)==Enums::OT_SHARED_ATOMIC \ + ||(a)==Enums::OT_PRIVATE_ATOMIC \ + ||(a)==Enums::OT_SPILL_ATOMIC \ + ||(a)==Enums::OT_READONLY_ATOMIC \ + ||(a)==Enums::OT_FLAT_ATOMIC) + +#define IS_OT_ATOMIC_GM(a) ((a)==Enums::OT_GLOBAL_ATOMIC \ + ||(a)==Enums::OT_SPILL_ATOMIC \ + ||(a)==Enums::OT_READONLY_ATOMIC \ + ||(a)==Enums::OT_GLOBAL_MEMFENCE \ + ||(a)==Enums::OT_BOTH_MEMFENCE) + +#define IS_OT_ATOMIC_LM(a) ((a)==Enums::OT_SHARED_ATOMIC \ + ||(a)==Enums::OT_SHARED_MEMFENCE \ + ||(a)==Enums::OT_BOTH_MEMFENCE) + +#define IS_OT_ATOMIC_PM(a) ((a)==Enums::OT_PRIVATE_ATOMIC) + +#define IS_OT_HIST(a) ((a)==Enums::OT_GLOBAL_HIST \ + ||(a)==Enums::OT_SHARED_HIST \ + ||(a)==Enums::OT_PRIVATE_HIST \ + ||(a)==Enums::OT_SPILL_HIST \ + ||(a)==Enums::OT_READONLY_HIST \ + ||(a)==Enums::OT_FLAT_HIST) + +#define IS_OT_HIST_GM(a) ((a)==Enums::OT_GLOBAL_HIST \ + ||(a)==Enums::OT_SPILL_HIST \ + ||(a)==Enums::OT_READONLY_HIST) + +#define IS_OT_HIST_LM(a) ((a)==Enums::OT_SHARED_HIST) + +#define IS_OT_HIST_PM(a) ((a)==Enums::OT_PRIVATE_HIST) + +#endif // __CODE_ENUMS_HH__ diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc new file mode 100644 index 000000000..d3622007a --- /dev/null +++ b/src/gpu-compute/compute_unit.cc @@ -0,0 +1,1817 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos, Anthony Gutierrez + */ + +#include "gpu-compute/compute_unit.hh" + +#include "base/output.hh" +#include "debug/GPUDisp.hh" +#include "debug/GPUExec.hh" +#include "debug/GPUFetch.hh" +#include "debug/GPUMem.hh" +#include "debug/GPUPort.hh" +#include "debug/GPUPrefetch.hh" +#include "debug/GPUSync.hh" +#include "debug/GPUTLB.hh" +#include "gpu-compute/dispatcher.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/gpu_static_inst.hh" +#include "gpu-compute/ndrange.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/simple_pool_manager.hh" +#include "gpu-compute/vector_register_file.hh" +#include "gpu-compute/wavefront.hh" +#include "mem/page_table.hh" +#include "sim/process.hh" + +ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p), + scoreboardCheckStage(p), scheduleStage(p), execStage(p), + globalMemoryPipe(p), localMemoryPipe(p), rrNextMemID(0), rrNextALUWp(0), + cu_id(p->cu_id), vrf(p->vector_register_file), numSIMDs(p->num_SIMDs), + spBypassPipeLength(p->spbypass_pipe_length), + dpBypassPipeLength(p->dpbypass_pipe_length), + issuePeriod(p->issue_period), + numGlbMemUnits(p->num_global_mem_pipes), + numLocMemUnits(p->num_shared_mem_pipes), + perLaneTLB(p->perLaneTLB), prefetchDepth(p->prefetch_depth), + prefetchStride(p->prefetch_stride), prefetchType(p->prefetch_prev_type), + xact_cas_mode(p->xactCasMode), debugSegFault(p->debugSegFault), + functionalTLB(p->functionalTLB), localMemBarrier(p->localMemBarrier), + countPages(p->countPages), barrier_id(0), + vrfToCoalescerBusWidth(p->vrf_to_coalescer_bus_width), + coalescerToVrfBusWidth(p->coalescer_to_vrf_bus_width), + req_tick_latency(p->mem_req_latency * p->clk_domain->clockPeriod()), + resp_tick_latency(p->mem_resp_latency * p->clk_domain->clockPeriod()), + _masterId(p->system->getMasterId(name() + ".ComputeUnit")), + lds(*p->localDataStore), globalSeqNum(0), wavefrontSize(p->wfSize) +{ + // this check will be eliminated once we have wavefront size support added + fatal_if(p->wfSize != VSZ, "Wavefront size parameter does not match VSZ"); + // calculate how many cycles a vector load or store will need to transfer + // its data over the corresponding buses + numCyclesPerStoreTransfer = (uint32_t)ceil((double)(VSZ * sizeof(uint32_t)) + / 
(double)vrfToCoalescerBusWidth); + + numCyclesPerLoadTransfer = (VSZ * sizeof(uint32_t)) + / coalescerToVrfBusWidth; + + lastVaddrWF.resize(numSIMDs); + wfList.resize(numSIMDs); + + for (int j = 0; j < numSIMDs; ++j) { + lastVaddrWF[j].resize(p->n_wf); + + for (int i = 0; i < p->n_wf; ++i) { + lastVaddrWF[j][i].resize(VSZ); + + wfList[j].push_back(p->wavefronts[j * p->n_wf + i]); + wfList[j][i]->setParent(this); + + for (int k = 0; k < VSZ; ++k) { + lastVaddrWF[j][i][k] = 0; + } + } + } + + lastVaddrPhase.resize(numSIMDs); + + for (int i = 0; i < numSIMDs; ++i) { + lastVaddrPhase[i] = LastVaddrWave(); + } + + lastVaddrCU = LastVaddrWave(); + + lds.setParent(this); + + if (p->execPolicy == "OLDEST-FIRST") { + exec_policy = EXEC_POLICY::OLDEST; + } else if (p->execPolicy == "ROUND-ROBIN") { + exec_policy = EXEC_POLICY::RR; + } else { + fatal("Invalid WF execution policy (CU)\n"); + } + + memPort.resize(VSZ); + + // resize the tlbPort vectorArray + int tlbPort_width = perLaneTLB ? VSZ : 1; + tlbPort.resize(tlbPort_width); + + cuExitCallback = new CUExitCallback(this); + registerExitCallback(cuExitCallback); + + xactCasLoadMap.clear(); + lastExecCycle.resize(numSIMDs, 0); + + for (int i = 0; i < vrf.size(); ++i) { + vrf[i]->setParent(this); + } + + numVecRegsPerSimd = vrf[0]->numRegs(); +} + +ComputeUnit::~ComputeUnit() +{ + // Delete wavefront slots + + for (int j = 0; j < numSIMDs; ++j) + for (int i = 0; i < shader->n_wf; ++i) { + delete wfList[j][i]; + } + + readyList.clear(); + waveStatusList.clear(); + dispatchList.clear(); + vectorAluInstAvail.clear(); + delete cuExitCallback; + delete ldsPort; +} + +void +ComputeUnit::FillKernelState(Wavefront *w, NDRange *ndr) +{ + w->resizeRegFiles(ndr->q.cRegCount, ndr->q.sRegCount, ndr->q.dRegCount); + + w->workgroupsz[0] = ndr->q.wgSize[0]; + w->workgroupsz[1] = ndr->q.wgSize[1]; + w->workgroupsz[2] = ndr->q.wgSize[2]; + w->wg_sz = w->workgroupsz[0] * w->workgroupsz[1] * w->workgroupsz[2]; + w->gridsz[0] = ndr->q.gdSize[0]; + w->gridsz[1] = ndr->q.gdSize[1]; + w->gridsz[2] = ndr->q.gdSize[2]; + w->kernelArgs = ndr->q.args; + w->privSizePerItem = ndr->q.privMemPerItem; + w->spillSizePerItem = ndr->q.spillMemPerItem; + w->roBase = ndr->q.roMemStart; + w->roSize = ndr->q.roMemTotal; +} + +void +ComputeUnit::InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt, + int trueWgSize[], int trueWgSizeTotal, + LdsChunk *ldsChunk, uint64_t origSpillMemStart) +{ + wfCtx->cnt = cnt; + + VectorMask init_mask; + init_mask.reset(); + + for (int k = 0; k < VSZ; ++k) { + if (k + cnt * VSZ < trueWgSizeTotal) + init_mask[k] = 1; + } + + wfCtx->init_mask = init_mask.to_ullong(); + wfCtx->exec_mask = init_mask.to_ullong(); + + for (int i = 0; i < VSZ; ++i) { + wfCtx->bar_cnt[i] = 0; + } + + wfCtx->max_bar_cnt = 0; + wfCtx->old_barrier_cnt = 0; + wfCtx->barrier_cnt = 0; + + wfCtx->privBase = ndr->q.privMemStart; + ndr->q.privMemStart += ndr->q.privMemPerItem * VSZ; + + wfCtx->spillBase = ndr->q.spillMemStart; + ndr->q.spillMemStart += ndr->q.spillMemPerItem * VSZ; + + wfCtx->pc = 0; + wfCtx->rpc = UINT32_MAX; + + // set the wavefront context to have a pointer to this section of the LDS + wfCtx->ldsChunk = ldsChunk; + + // WG state + wfCtx->wg_id = ndr->globalWgId; + wfCtx->barrier_id = barrier_id; + + // Kernel wide state + wfCtx->ndr = ndr; +} + +void +ComputeUnit::updateEvents() { + + if (!timestampVec.empty()) { + uint32_t vecSize = timestampVec.size(); + uint32_t i = 0; + while (i < vecSize) { + if (timestampVec[i] <= shader->tick_cnt) { + std::pair<uint32_t, 
uint32_t> regInfo = regIdxVec[i]; + vrf[regInfo.first]->markReg(regInfo.second, sizeof(uint32_t), + statusVec[i]); + timestampVec.erase(timestampVec.begin() + i); + regIdxVec.erase(regIdxVec.begin() + i); + statusVec.erase(statusVec.begin() + i); + --vecSize; + --i; + } + ++i; + } + } + + for (int i = 0; i< numSIMDs; ++i) { + vrf[i]->updateEvents(); + } +} + + +void +ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[], + int trueWgSizeTotal) +{ + static int _n_wave = 0; + int cnt = wfCtx->cnt; + NDRange *ndr = wfCtx->ndr; + + // Fill in Kernel state + FillKernelState(w, ndr); + + w->kern_id = ndr->dispatchId; + w->dynwaveid = cnt; + w->init_mask = wfCtx->init_mask; + + for (int k = 0; k < VSZ; ++k) { + w->workitemid[0][k] = (k+cnt*VSZ) % trueWgSize[0]; + w->workitemid[1][k] = ((k + cnt * VSZ) / trueWgSize[0]) % trueWgSize[1]; + w->workitemid[2][k] = (k + cnt * VSZ) / (trueWgSize[0] * trueWgSize[1]); + + w->workitemFlatId[k] = w->workitemid[2][k] * trueWgSize[0] * + trueWgSize[1] + w->workitemid[1][k] * trueWgSize[0] + + w->workitemid[0][k]; + } + + w->old_barrier_cnt = wfCtx->old_barrier_cnt; + w->barrier_cnt = wfCtx->barrier_cnt; + w->barrier_slots = divCeil(trueWgSizeTotal, VSZ); + + for (int i = 0; i < VSZ; ++i) { + w->bar_cnt[i] = wfCtx->bar_cnt[i]; + } + + w->max_bar_cnt = wfCtx->max_bar_cnt; + w->privBase = wfCtx->privBase; + w->spillBase = wfCtx->spillBase; + + w->pushToReconvergenceStack(wfCtx->pc, wfCtx->rpc, wfCtx->exec_mask); + + // WG state + w->wg_id = wfCtx->wg_id; + w->dispatchid = wfCtx->ndr->dispatchId; + w->workgroupid[0] = w->wg_id % ndr->numWg[0]; + w->workgroupid[1] = (w->wg_id / ndr->numWg[0]) % ndr->numWg[1]; + w->workgroupid[2] = w->wg_id / (ndr->numWg[0] * ndr->numWg[1]); + + w->barrier_id = wfCtx->barrier_id; + w->stalledAtBarrier = false; + + // move this from the context into the actual wavefront + w->ldsChunk = wfCtx->ldsChunk; + + int32_t refCount M5_VAR_USED = + lds.increaseRefCounter(w->dispatchid, w->wg_id); + DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n", + cu_id, w->wg_id, refCount); + + w->instructionBuffer.clear(); + + if (w->pendingFetch) + w->dropFetch = true; + + // is this the last wavefront in the workgroup + // if set the spillWidth to be the remaining work-items + // so that the vector access is correct + if ((cnt + 1) * VSZ >= trueWgSizeTotal) { + w->spillWidth = trueWgSizeTotal - (cnt * VSZ); + } else { + w->spillWidth = VSZ; + } + + DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: " + "WF[%d][%d]\n", _n_wave, barrier_id, cu_id, w->simdId, w->wfSlotId); + + w->start(++_n_wave, ndr->q.code_ptr); +} + +void +ComputeUnit::StartWorkgroup(NDRange *ndr) +{ + // reserve the LDS capacity allocated to the work group + // disambiguated by the dispatch ID and workgroup ID, which should be + // globally unique + LdsChunk *ldsChunk = lds.reserveSpace(ndr->dispatchId, ndr->globalWgId, + ndr->q.ldsSize); + + // Send L1 cache acquire + // isKernel + isAcquire = Kernel Begin + if (shader->impl_kern_boundary_sync) { + GPUDynInstPtr gpuDynInst = std::make_shared<GPUDynInst>(nullptr, + nullptr, + nullptr, 0); + + gpuDynInst->useContinuation = false; + gpuDynInst->memoryOrder = Enums::MEMORY_ORDER_SC_ACQUIRE; + gpuDynInst->scope = Enums::MEMORY_SCOPE_SYSTEM; + injectGlobalMemFence(gpuDynInst, true); + } + + // Get true size of workgroup (after clamping to grid size) + int trueWgSize[3]; + int trueWgSizeTotal = 1; + + for (int d = 0; d < 3; ++d) { + trueWgSize[d] = std::min(ndr->q.wgSize[d], ndr->q.gdSize[d] - + 
ndr->wgId[d] * ndr->q.wgSize[d]); + + trueWgSizeTotal *= trueWgSize[d]; + } + + uint64_t origSpillMemStart = ndr->q.spillMemStart; + // calculate the number of 32-bit vector registers required by wavefront + int vregDemand = ndr->q.sRegCount + (2 * ndr->q.dRegCount); + int cnt = 0; + + // Assign WFs by spreading them across SIMDs, 1 WF per SIMD at a time + for (int m = 0; m < shader->n_wf * numSIMDs; ++m) { + Wavefront *w = wfList[m % numSIMDs][m / numSIMDs]; + // Check if this wavefront slot is available: + // It must be stopped and not waiting + // for a release to complete S_RETURNING + if (w->status == Wavefront::S_STOPPED) { + // if we have scheduled all work items then stop + // scheduling wavefronts + if (cnt * VSZ >= trueWgSizeTotal) + break; + + // reserve vector registers for the scheduled wavefront + assert(vectorRegsReserved[m % numSIMDs] <= numVecRegsPerSimd); + uint32_t normSize = 0; + + w->startVgprIndex = vrf[m % numSIMDs]->manager-> + allocateRegion(vregDemand, &normSize); + + w->reservedVectorRegs = normSize; + vectorRegsReserved[m % numSIMDs] += w->reservedVectorRegs; + + WFContext wfCtx; + + InitializeWFContext(&wfCtx, ndr, cnt, trueWgSize, trueWgSizeTotal, + ldsChunk, origSpillMemStart); + + StartWF(w, &wfCtx, trueWgSize, trueWgSizeTotal); + ++cnt; + } + } + ++barrier_id; +} + +int +ComputeUnit::ReadyWorkgroup(NDRange *ndr) +{ + // Get true size of workgroup (after clamping to grid size) + int trueWgSize[3]; + int trueWgSizeTotal = 1; + + for (int d = 0; d < 3; ++d) { + trueWgSize[d] = std::min(ndr->q.wgSize[d], ndr->q.gdSize[d] - + ndr->wgId[d] * ndr->q.wgSize[d]); + + trueWgSizeTotal *= trueWgSize[d]; + DPRINTF(GPUDisp, "trueWgSize[%d] = %d\n", d, trueWgSize[d]); + } + + DPRINTF(GPUDisp, "trueWgSizeTotal = %d\n", trueWgSizeTotal); + + // calculate the number of 32-bit vector registers required by each + // work item of the work group + int vregDemandPerWI = ndr->q.sRegCount + (2 * ndr->q.dRegCount); + bool vregAvail = true; + int numWfs = (trueWgSizeTotal + VSZ - 1) / VSZ; + int freeWfSlots = 0; + // check if the total number of VGPRs required by all WFs of the WG + // fit in the VRFs of all SIMD units + assert((numWfs * vregDemandPerWI) <= (numSIMDs * numVecRegsPerSimd)); + int numMappedWfs = 0; + std::vector<int> numWfsPerSimd; + numWfsPerSimd.resize(numSIMDs, 0); + // find how many free WF slots we have across all SIMDs + for (int j = 0; j < shader->n_wf; ++j) { + for (int i = 0; i < numSIMDs; ++i) { + if (wfList[i][j]->status == Wavefront::S_STOPPED) { + // count the number of free WF slots + ++freeWfSlots; + if (numMappedWfs < numWfs) { + // count the WFs to be assigned per SIMD + numWfsPerSimd[i]++; + } + numMappedWfs++; + } + } + } + + // if there are enough free WF slots then find if there are enough + // free VGPRs per SIMD based on the WF->SIMD mapping + if (freeWfSlots >= numWfs) { + for (int j = 0; j < numSIMDs; ++j) { + // find if there are enough free VGPR regions in the SIMD's VRF + // to accommodate the WFs of the new WG that would be mapped to + // this SIMD unit + vregAvail = vrf[j]->manager->canAllocate(numWfsPerSimd[j], + vregDemandPerWI); + + // stop searching if there is at least one SIMD + // whose VRF does not have enough free VGPR pools. 
+ // This is because a WG is scheduled only if ALL + // of its WFs can be scheduled + if (!vregAvail) + break; + } + } + + DPRINTF(GPUDisp, "Free WF slots = %d, VGPR Availability = %d\n", + freeWfSlots, vregAvail); + + if (!vregAvail) { + ++numTimesWgBlockedDueVgprAlloc; + } + + // Return true if enough WF slots to submit workgroup and if there are + // enough VGPRs to schedule all WFs to their SIMD units + if (!lds.canReserve(ndr->q.ldsSize)) { + wgBlockedDueLdsAllocation++; + } + + // Return true if (a) there are enough free WF slots to submit + // workgrounp and (b) if there are enough VGPRs to schedule all WFs to their + // SIMD units and (c) if there is enough space in LDS + return freeWfSlots >= numWfs && vregAvail && lds.canReserve(ndr->q.ldsSize); +} + +int +ComputeUnit::AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots) +{ + DPRINTF(GPUSync, "CU%d: Checking for All At Barrier\n", cu_id); + int ccnt = 0; + + for (int i_simd = 0; i_simd < numSIMDs; ++i_simd) { + for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf) { + Wavefront *w = wfList[i_simd][i_wf]; + + if (w->status == Wavefront::S_RUNNING) { + DPRINTF(GPUSync, "Checking WF[%d][%d]\n", i_simd, i_wf); + + DPRINTF(GPUSync, "wf->barrier_id = %d, _barrier_id = %d\n", + w->barrier_id, _barrier_id); + + DPRINTF(GPUSync, "wf->barrier_cnt %d, bcnt = %d\n", + w->barrier_cnt, bcnt); + } + + if (w->status == Wavefront::S_RUNNING && + w->barrier_id == _barrier_id && w->barrier_cnt == bcnt && + !w->outstanding_reqs) { + ++ccnt; + + DPRINTF(GPUSync, "WF[%d][%d] at barrier, increment ccnt to " + "%d\n", i_simd, i_wf, ccnt); + } + } + } + + DPRINTF(GPUSync, "CU%d: returning allAtBarrier ccnt = %d, bslots = %d\n", + cu_id, ccnt, bslots); + + return ccnt == bslots; +} + +// Check if the current wavefront is blocked on additional resources. +bool +ComputeUnit::cedeSIMD(int simdId, int wfSlotId) +{ + bool cede = false; + + // If --xact-cas-mode option is enabled in run.py, then xact_cas_ld + // magic instructions will impact the scheduling of wavefronts + if (xact_cas_mode) { + /* + * When a wavefront calls xact_cas_ld, it adds itself to a per address + * queue. All per address queues are managed by the xactCasLoadMap. + * + * A wavefront is not blocked if: it is not in ANY per address queue or + * if it is at the head of a per address queue. + */ + for (auto itMap : xactCasLoadMap) { + std::list<waveIdentifier> curWaveIDQueue = itMap.second.waveIDQueue; + + if (!curWaveIDQueue.empty()) { + for (auto it : curWaveIDQueue) { + waveIdentifier cur_wave = it; + + if (cur_wave.simdId == simdId && + cur_wave.wfSlotId == wfSlotId) { + // 2 possibilities + // 1: this WF has a green light + // 2: another WF has a green light + waveIdentifier owner_wave = curWaveIDQueue.front(); + + if (owner_wave.simdId != cur_wave.simdId || + owner_wave.wfSlotId != cur_wave.wfSlotId) { + // possibility 2 + cede = true; + break; + } else { + // possibility 1 + break; + } + } + } + } + } + } + + return cede; +} + +// Execute one clock worth of work on the ComputeUnit. 
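
A design note before exec(), which follows: the stages are invoked in reverse pipeline order (memory pipes first, fetch last), so that each stage consumes what its upstream neighbor produced in the previous cycle rather than values produced earlier in the same call; that is the usual reading of the "simulate the pipeline latency" comment in the code, which is the only statement of intent given here. A toy illustration of that evaluation order (not gem5 code; all names are made up):

    // Two-latch toy pipeline: reading each latch before it is overwritten
    // gives every stage a one-cycle-old view of its producer.
    struct ToyPipeline
    {
        int latch01 = 0;   // between stage 0 and stage 1
        int latch12 = 0;   // between stage 1 and stage 2

        void retire(int /* result */) { /* sink */ }

        void
        execOneCycle(int input)
        {
            retire(latch12);     // stage 2 uses stage 1's output from last cycle
            latch12 = latch01;   // stage 1 forwards stage 0's output from last cycle
            latch01 = input;     // stage 0 accepts this cycle's new work
        }
    };

The actual ComputeUnit::exec() follows.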
+void +ComputeUnit::exec() +{ + updateEvents(); + // Execute pipeline stages in reverse order to simulate + // the pipeline latency + globalMemoryPipe.exec(); + localMemoryPipe.exec(); + execStage.exec(); + scheduleStage.exec(); + scoreboardCheckStage.exec(); + fetchStage.exec(); + + totalCycles++; +} + +void +ComputeUnit::init() +{ + // Initialize CU Bus models + glbMemToVrfBus.init(&shader->tick_cnt, 1); + locMemToVrfBus.init(&shader->tick_cnt, 1); + nextGlbMemBus = 0; + nextLocMemBus = 0; + fatal_if(numGlbMemUnits > 1, + "No support for multiple Global Memory Pipelines exists!!!"); + vrfToGlobalMemPipeBus.resize(numGlbMemUnits); + for (int j = 0; j < numGlbMemUnits; ++j) { + vrfToGlobalMemPipeBus[j] = WaitClass(); + vrfToGlobalMemPipeBus[j].init(&shader->tick_cnt, 1); + } + + fatal_if(numLocMemUnits > 1, + "No support for multiple Local Memory Pipelines exists!!!"); + vrfToLocalMemPipeBus.resize(numLocMemUnits); + for (int j = 0; j < numLocMemUnits; ++j) { + vrfToLocalMemPipeBus[j] = WaitClass(); + vrfToLocalMemPipeBus[j].init(&shader->tick_cnt, 1); + } + vectorRegsReserved.resize(numSIMDs, 0); + aluPipe.resize(numSIMDs); + wfWait.resize(numSIMDs + numLocMemUnits + numGlbMemUnits); + + for (int i = 0; i < numSIMDs + numLocMemUnits + numGlbMemUnits; ++i) { + wfWait[i] = WaitClass(); + wfWait[i].init(&shader->tick_cnt, 1); + } + + for (int i = 0; i < numSIMDs; ++i) { + aluPipe[i] = WaitClass(); + aluPipe[i].init(&shader->tick_cnt, 1); + } + + // Setup space for call args + for (int j = 0; j < numSIMDs; ++j) { + for (int i = 0; i < shader->n_wf; ++i) { + wfList[j][i]->initCallArgMem(shader->funcargs_size); + } + } + + // Initializing pipeline resources + readyList.resize(numSIMDs + numGlbMemUnits + numLocMemUnits); + waveStatusList.resize(numSIMDs); + + for (int j = 0; j < numSIMDs; ++j) { + for (int i = 0; i < shader->n_wf; ++i) { + waveStatusList[j].push_back( + std::make_pair(wfList[j][i], BLOCKED)); + } + } + + for (int j = 0; j < (numSIMDs + numGlbMemUnits + numLocMemUnits); ++j) { + dispatchList.push_back(std::make_pair((Wavefront*)nullptr, EMPTY)); + } + + fetchStage.init(this); + scoreboardCheckStage.init(this); + scheduleStage.init(this); + execStage.init(this); + globalMemoryPipe.init(this); + localMemoryPipe.init(this); + // initialize state for statistics calculation + vectorAluInstAvail.resize(numSIMDs, false); + shrMemInstAvail = 0; + glbMemInstAvail = 0; +} + +bool +ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt) +{ + // Ruby has completed the memory op. 
Schedule the mem_resp_event at the + // appropriate cycle to process the timing memory response + // This delay represents the pipeline delay + SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState); + int index = sender_state->port_index; + GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst; + + // Is the packet returned a Kernel End or Barrier + if (pkt->req->isKernel() && pkt->req->isRelease()) { + Wavefront *w = + computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId]; + + // Check if we are waiting on Kernel End Release + if (w->status == Wavefront::S_RETURNING) { + DPRINTF(GPUDisp, "CU%d: WF[%d][%d][wv=%d]: WG id completed %d\n", + computeUnit->cu_id, w->simdId, w->wfSlotId, + w->wfDynId, w->kern_id); + + computeUnit->shader->dispatcher->notifyWgCompl(w); + w->status = Wavefront::S_STOPPED; + } else { + w->outstanding_reqs--; + } + + DPRINTF(GPUSync, "CU%d: WF[%d][%d]: barrier_cnt = %d\n", + computeUnit->cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, w->barrier_cnt); + + if (gpuDynInst->useContinuation) { + assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE); + gpuDynInst->execContinuation(gpuDynInst->staticInstruction(), + gpuDynInst); + } + + delete pkt->senderState; + delete pkt->req; + delete pkt; + return true; + } else if (pkt->req->isKernel() && pkt->req->isAcquire()) { + if (gpuDynInst->useContinuation) { + assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE); + gpuDynInst->execContinuation(gpuDynInst->staticInstruction(), + gpuDynInst); + } + + delete pkt->senderState; + delete pkt->req; + delete pkt; + return true; + } + + ComputeUnit::DataPort::MemRespEvent *mem_resp_event = + new ComputeUnit::DataPort::MemRespEvent(computeUnit->memPort[index], + pkt); + + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x received!\n", + computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, + index, pkt->req->getPaddr()); + + computeUnit->schedule(mem_resp_event, + curTick() + computeUnit->resp_tick_latency); + return true; +} + +void +ComputeUnit::DataPort::recvReqRetry() +{ + int len = retries.size(); + + assert(len > 0); + + for (int i = 0; i < len; ++i) { + PacketPtr pkt = retries.front().first; + GPUDynInstPtr gpuDynInst M5_VAR_USED = retries.front().second; + DPRINTF(GPUMem, "CU%d: WF[%d][%d]: retry mem inst addr %#x\n", + computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, + pkt->req->getPaddr()); + + /** Currently Ruby can return false due to conflicts for the particular + * cache block or address. Thus other requests should be allowed to + * pass and the data port should expect multiple retries. 
*/ + if (!sendTimingReq(pkt)) { + DPRINTF(GPUMem, "failed again!\n"); + break; + } else { + DPRINTF(GPUMem, "successful!\n"); + retries.pop_front(); + } + } +} + +bool +ComputeUnit::SQCPort::recvTimingResp(PacketPtr pkt) +{ + computeUnit->fetchStage.processFetchReturn(pkt); + + return true; +} + +void +ComputeUnit::SQCPort::recvReqRetry() +{ + int len = retries.size(); + + assert(len > 0); + + for (int i = 0; i < len; ++i) { + PacketPtr pkt = retries.front().first; + Wavefront *wavefront M5_VAR_USED = retries.front().second; + DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: retrying FETCH addr %#x\n", + computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, + pkt->req->getPaddr()); + if (!sendTimingReq(pkt)) { + DPRINTF(GPUFetch, "failed again!\n"); + break; + } else { + DPRINTF(GPUFetch, "successful!\n"); + retries.pop_front(); + } + } +} + +void +ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt) +{ + // There must be a way around this check to do the globalMemStart... + Addr tmp_vaddr = pkt->req->getVaddr(); + + updatePageDivergenceDist(tmp_vaddr); + + pkt->req->setVirt(pkt->req->getAsid(), tmp_vaddr, pkt->req->getSize(), + pkt->req->getFlags(), pkt->req->masterId(), + pkt->req->getPC()); + + // figure out the type of the request to set read/write + BaseTLB::Mode TLB_mode; + assert(pkt->isRead() || pkt->isWrite()); + + // Check write before read for atomic operations + // since atomic operations should use BaseTLB::Write + if (pkt->isWrite()){ + TLB_mode = BaseTLB::Write; + } else if (pkt->isRead()) { + TLB_mode = BaseTLB::Read; + } else { + fatal("pkt is not a read nor a write\n"); + } + + tlbCycles -= curTick(); + ++tlbRequests; + + int tlbPort_index = perLaneTLB ? index : 0; + + if (shader->timingSim) { + if (debugSegFault) { + Process *p = shader->gpuTc->getProcessPtr(); + Addr vaddr = pkt->req->getVaddr(); + unsigned size = pkt->getSize(); + + if ((vaddr + size - 1) % 64 < vaddr % 64) { + panic("CU%d: WF[%d][%d]: Access to addr %#x is unaligned!\n", + cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, vaddr); + } + + Addr paddr; + + if (!p->pTable->translate(vaddr, paddr)) { + if (!p->fixupStackFault(vaddr)) { + panic("CU%d: WF[%d][%d]: Fault on addr %#x!\n", + cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, + vaddr); + } + } + } + + // This is the SenderState needed upon return + pkt->senderState = new DTLBPort::SenderState(gpuDynInst, index); + + // This is the senderState needed by the TLB hierarchy to function + TheISA::GpuTLB::TranslationState *translation_state = + new TheISA::GpuTLB::TranslationState(TLB_mode, shader->gpuTc, false, + pkt->senderState); + + pkt->senderState = translation_state; + + if (functionalTLB) { + tlbPort[tlbPort_index]->sendFunctional(pkt); + + // update the hitLevel distribution + int hit_level = translation_state->hitLevel; + assert(hit_level != -1); + hitsPerTLBLevel[hit_level]++; + + // New SenderState for the memory access + X86ISA::GpuTLB::TranslationState *sender_state = + safe_cast<X86ISA::GpuTLB::TranslationState*>(pkt->senderState); + + delete sender_state->tlbEntry; + delete sender_state->saved; + delete sender_state; + + assert(pkt->req->hasPaddr()); + assert(pkt->req->hasSize()); + + uint8_t *tmpData = pkt->getPtr<uint8_t>(); + + // this is necessary because the GPU TLB receives packets instead + // of requests. when the translation is complete, all relevent + // fields in the request will be populated, but not in the packet. + // here we create the new packet so we can set the size, addr, + // and proper flags. 
+ PacketPtr oldPkt = pkt; + pkt = new Packet(oldPkt->req, oldPkt->cmd); + delete oldPkt; + pkt->dataStatic(tmpData); + + + // New SenderState for the memory access + pkt->senderState = new ComputeUnit::DataPort::SenderState(gpuDynInst, + index, nullptr); + + gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index); + gpuDynInst->tlbHitLevel[index] = hit_level; + + + // translation is done. Schedule the mem_req_event at the + // appropriate cycle to send the timing memory request to ruby + ComputeUnit::DataPort::MemReqEvent *mem_req_event = + new ComputeUnit::DataPort::MemReqEvent(memPort[index], pkt); + + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data " + "scheduled\n", cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, index, pkt->req->getPaddr()); + + schedule(mem_req_event, curTick() + req_tick_latency); + } else if (tlbPort[tlbPort_index]->isStalled()) { + assert(tlbPort[tlbPort_index]->retries.size() > 0); + + DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x " + "failed!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, + tmp_vaddr); + + tlbPort[tlbPort_index]->retries.push_back(pkt); + } else if (!tlbPort[tlbPort_index]->sendTimingReq(pkt)) { + // Stall the data port; + // No more packet will be issued till + // ruby indicates resources are freed by + // a recvReqRetry() call back on this port. + tlbPort[tlbPort_index]->stallPort(); + + DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x " + "failed!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, + tmp_vaddr); + + tlbPort[tlbPort_index]->retries.push_back(pkt); + } else { + DPRINTF(GPUTLB, + "CU%d: WF[%d][%d]: Translation for addr %#x sent!\n", + cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, tmp_vaddr); + } + } else { + if (pkt->cmd == MemCmd::MemFenceReq) { + gpuDynInst->statusBitVector = VectorMask(0); + } else { + gpuDynInst->statusBitVector &= (~(1ll << index)); + } + + // New SenderState for the memory access + delete pkt->senderState; + + // Because it's atomic operation, only need TLB translation state + pkt->senderState = new TheISA::GpuTLB::TranslationState(TLB_mode, + shader->gpuTc); + + tlbPort[tlbPort_index]->sendFunctional(pkt); + + // the addr of the packet is not modified, so we need to create a new + // packet, or otherwise the memory access will have the old virtual + // address sent in the translation packet, instead of the physical + // address returned by the translation. + PacketPtr new_pkt = new Packet(pkt->req, pkt->cmd); + new_pkt->dataStatic(pkt->getPtr<uint8_t>()); + + // Translation is done. It is safe to send the packet to memory. 
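// [Illustrative note, not part of this patch] Earlier in this function the
// port's SenderState is wrapped inside the TLB's TranslationState via its
// saved pointer, and the response path (see DTLBPort::recvTimingResp below)
// unwinds that chain one layer at a time. A minimal standalone sketch of the
// sender-state "stack" pattern, with simplified stand-in classes:
#if 0
#include <iostream>

struct SenderState {
    SenderState *saved;                   // state of the layer below this one
    explicit SenderState(SenderState *prev = nullptr) : saved(prev) { }
    virtual ~SenderState() { }
};

// stands in for the CU port's per-lane state (e.g. DTLBPort::SenderState)
struct PortState : SenderState {
    int laneIndex;
    explicit PortState(int lane) : SenderState(nullptr), laneIndex(lane) { }
};

// stands in for the TLB layer's state, wrapping whatever was there before
struct TranslationState : SenderState {
    bool isWrite;
    TranslationState(bool w, SenderState *prev)
        : SenderState(prev), isWrite(w) { }
};

struct Packet { SenderState *senderState = nullptr; };

int main()
{
    Packet pkt;
    pkt.senderState = new PortState(3);                              // CU layer
    pkt.senderState = new TranslationState(false, pkt.senderState);  // TLB layer

    // response path: unwind in reverse order
    auto *ts = static_cast<TranslationState*>(pkt.senderState);
    pkt.senderState = ts->saved;          // restore the CU's state
    delete ts;

    auto *ps = static_cast<PortState*>(pkt.senderState);
    std::cout << "lane " << ps->laneIndex << "\n";                   // lane 3
    delete ps;
}
#endif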
+ memPort[0]->sendFunctional(new_pkt); + + DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index %d: addr %#x\n", cu_id, + gpuDynInst->simdId, gpuDynInst->wfSlotId, index, + new_pkt->req->getPaddr()); + + // safe_cast the senderState + TheISA::GpuTLB::TranslationState *sender_state = + safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState); + + delete sender_state->tlbEntry; + delete new_pkt; + delete pkt->senderState; + delete pkt->req; + delete pkt; + } +} + +void +ComputeUnit::sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt) +{ + ComputeUnit::DataPort::MemReqEvent *mem_req_event = + new ComputeUnit::DataPort::MemReqEvent(memPort[index], pkt); + + + // New SenderState for the memory access + pkt->senderState = new ComputeUnit::DataPort::SenderState(gpuDynInst, index, + nullptr); + + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n", + cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, index, + pkt->req->getPaddr()); + + schedule(mem_req_event, curTick() + req_tick_latency); +} + +void +ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst, bool kernelLaunch, + Request* req) +{ + if (!req) { + req = new Request(0, 0, 0, 0, masterId(), 0, gpuDynInst->wfDynId, -1); + } + req->setPaddr(0); + if (kernelLaunch) { + req->setFlags(Request::KERNEL); + } + + gpuDynInst->s_type = SEG_GLOBAL; + + // for non-kernel MemFence operations, memorder flags are set depending + // on which type of request is currently being sent, so this + // should be set by the caller (e.g. if an inst has acq-rel + // semantics, it will send one acquire req an one release req) + gpuDynInst->setRequestFlags(req, kernelLaunch); + + // a mem fence must correspond to an acquire/release request + assert(req->isAcquire() || req->isRelease()); + + // create packet + PacketPtr pkt = new Packet(req, MemCmd::MemFenceReq); + + // set packet's sender state + pkt->senderState = + new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr); + + // send the packet + sendSyncRequest(gpuDynInst, 0, pkt); +} + +const char* +ComputeUnit::DataPort::MemRespEvent::description() const +{ + return "ComputeUnit memory response event"; +} + +void +ComputeUnit::DataPort::MemRespEvent::process() +{ + DataPort::SenderState *sender_state = + safe_cast<DataPort::SenderState*>(pkt->senderState); + + GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst; + ComputeUnit *compute_unit = dataPort->computeUnit; + + assert(gpuDynInst); + + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Response for addr %#x, index %d\n", + compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, + pkt->req->getPaddr(), dataPort->index); + + Addr paddr = pkt->req->getPaddr(); + + if (pkt->cmd != MemCmd::MemFenceResp) { + int index = gpuDynInst->memStatusVector[paddr].back(); + + DPRINTF(GPUMem, "Response for addr %#x, index %d\n", + pkt->req->getPaddr(), index); + + gpuDynInst->memStatusVector[paddr].pop_back(); + gpuDynInst->pAddr = pkt->req->getPaddr(); + + if (pkt->isRead() || pkt->isWrite()) { + + if (gpuDynInst->n_reg <= MAX_REGS_FOR_NON_VEC_MEM_INST) { + gpuDynInst->statusBitVector &= (~(1ULL << index)); + } else { + assert(gpuDynInst->statusVector[index] > 0); + gpuDynInst->statusVector[index]--; + + if (!gpuDynInst->statusVector[index]) + gpuDynInst->statusBitVector &= (~(1ULL << index)); + } + + DPRINTF(GPUMem, "bitvector is now %#x\n", + gpuDynInst->statusBitVector); + + if (gpuDynInst->statusBitVector == VectorMask(0)) { + auto iter = gpuDynInst->memStatusVector.begin(); + auto end = gpuDynInst->memStatusVector.end(); + + while 
(iter != end) { + assert(iter->second.empty()); + ++iter; + } + + gpuDynInst->memStatusVector.clear(); + + if (gpuDynInst->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST) + gpuDynInst->statusVector.clear(); + + if (gpuDynInst->m_op == Enums::MO_LD || MO_A(gpuDynInst->m_op) + || MO_ANR(gpuDynInst->m_op)) { + assert(compute_unit->globalMemoryPipe.isGMLdRespFIFOWrRdy()); + + compute_unit->globalMemoryPipe.getGMLdRespFIFO() + .push(gpuDynInst); + } else { + assert(compute_unit->globalMemoryPipe.isGMStRespFIFOWrRdy()); + + compute_unit->globalMemoryPipe.getGMStRespFIFO() + .push(gpuDynInst); + } + + DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n", + compute_unit->cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId); + + // after clearing the status vectors, + // see if there is a continuation to perform + // the continuation may generate more work for + // this memory request + if (gpuDynInst->useContinuation) { + assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE); + gpuDynInst->execContinuation(gpuDynInst->staticInstruction(), + gpuDynInst); + } + } + } + } else { + gpuDynInst->statusBitVector = VectorMask(0); + + if (gpuDynInst->useContinuation) { + assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE); + gpuDynInst->execContinuation(gpuDynInst->staticInstruction(), + gpuDynInst); + } + } + + delete pkt->senderState; + delete pkt->req; + delete pkt; +} + +ComputeUnit* +ComputeUnitParams::create() +{ + return new ComputeUnit(this); +} + +bool +ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt) +{ + Addr line = pkt->req->getPaddr(); + + DPRINTF(GPUTLB, "CU%d: DTLBPort received %#x->%#x\n", computeUnit->cu_id, + pkt->req->getVaddr(), line); + + assert(pkt->senderState); + computeUnit->tlbCycles += curTick(); + + // pop off the TLB translation state + TheISA::GpuTLB::TranslationState *translation_state = + safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState); + + // no PageFaults are permitted for data accesses + if (!translation_state->tlbEntry->valid) { + DTLBPort::SenderState *sender_state = + safe_cast<DTLBPort::SenderState*>(translation_state->saved); + + Wavefront *w M5_VAR_USED = + computeUnit->wfList[sender_state->_gpuDynInst->simdId] + [sender_state->_gpuDynInst->wfSlotId]; + + DPRINTFN("Wave %d couldn't tranlate vaddr %#x\n", w->wfDynId, + pkt->req->getVaddr()); + } + + assert(translation_state->tlbEntry->valid); + + // update the hitLevel distribution + int hit_level = translation_state->hitLevel; + computeUnit->hitsPerTLBLevel[hit_level]++; + + delete translation_state->tlbEntry; + assert(!translation_state->ports.size()); + pkt->senderState = translation_state->saved; + + // for prefetch pkt + BaseTLB::Mode TLB_mode = translation_state->tlbMode; + + delete translation_state; + + // use the original sender state to know how to close this transaction + DTLBPort::SenderState *sender_state = + safe_cast<DTLBPort::SenderState*>(pkt->senderState); + + GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst; + int mp_index = sender_state->portIndex; + Addr vaddr = pkt->req->getVaddr(); + gpuDynInst->memStatusVector[line].push_back(mp_index); + gpuDynInst->tlbHitLevel[mp_index] = hit_level; + + MemCmd requestCmd; + + if (pkt->cmd == MemCmd::ReadResp) { + requestCmd = MemCmd::ReadReq; + } else if (pkt->cmd == MemCmd::WriteResp) { + requestCmd = MemCmd::WriteReq; + } else if (pkt->cmd == MemCmd::SwapResp) { + requestCmd = MemCmd::SwapReq; + } else { + panic("unsupported response to request conversion %s\n", + pkt->cmd.toString()); + } + + if (computeUnit->prefetchDepth) { + int 
simdId = gpuDynInst->simdId; + int wfSlotId = gpuDynInst->wfSlotId; + Addr last = 0; + + switch(computeUnit->prefetchType) { + case Enums::PF_CU: + last = computeUnit->lastVaddrCU[mp_index]; + break; + case Enums::PF_PHASE: + last = computeUnit->lastVaddrPhase[simdId][mp_index]; + break; + case Enums::PF_WF: + last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index]; + default: + break; + } + + DPRINTF(GPUPrefetch, "CU[%d][%d][%d][%d]: %#x was last\n", + computeUnit->cu_id, simdId, wfSlotId, mp_index, last); + + int stride = last ? (roundDown(vaddr, TheISA::PageBytes) - + roundDown(last, TheISA::PageBytes)) >> TheISA::PageShift + : 0; + + DPRINTF(GPUPrefetch, "Stride is %d\n", stride); + + computeUnit->lastVaddrCU[mp_index] = vaddr; + computeUnit->lastVaddrPhase[simdId][mp_index] = vaddr; + computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr; + + stride = (computeUnit->prefetchType == Enums::PF_STRIDE) ? + computeUnit->prefetchStride: stride; + + DPRINTF(GPUPrefetch, "%#x to: CU[%d][%d][%d][%d]\n", vaddr, + computeUnit->cu_id, simdId, wfSlotId, mp_index); + + DPRINTF(GPUPrefetch, "Prefetching from %#x:", vaddr); + + // Prefetch Next few pages atomically + for (int pf = 1; pf <= computeUnit->prefetchDepth; ++pf) { + DPRINTF(GPUPrefetch, "%d * %d: %#x\n", pf, stride, + vaddr+stride*pf*TheISA::PageBytes); + + if (!stride) + break; + + Request *prefetch_req = new Request(0, vaddr + stride * pf * + TheISA::PageBytes, + sizeof(uint8_t), 0, + computeUnit->masterId(), + 0, 0, 0); + + PacketPtr prefetch_pkt = new Packet(prefetch_req, requestCmd); + uint8_t foo = 0; + prefetch_pkt->dataStatic(&foo); + + // Because it's atomic operation, only need TLB translation state + prefetch_pkt->senderState = + new TheISA::GpuTLB::TranslationState(TLB_mode, + computeUnit->shader->gpuTc, + true); + + // Currently prefetches are zero-latency, hence the sendFunctional + sendFunctional(prefetch_pkt); + + /* safe_cast the senderState */ + TheISA::GpuTLB::TranslationState *tlb_state = + safe_cast<TheISA::GpuTLB::TranslationState*>( + prefetch_pkt->senderState); + + + delete tlb_state->tlbEntry; + delete tlb_state; + delete prefetch_pkt->req; + delete prefetch_pkt; + } + } + + // First we must convert the response cmd back to a request cmd so that + // the request can be sent through the cu's master port + PacketPtr new_pkt = new Packet(pkt->req, requestCmd); + new_pkt->dataStatic(pkt->getPtr<uint8_t>()); + delete pkt->senderState; + delete pkt; + + // New SenderState for the memory access + new_pkt->senderState = + new ComputeUnit::DataPort::SenderState(gpuDynInst, mp_index, + nullptr); + + // translation is done. 
Schedule the mem_req_event at the appropriate
+    // cycle to send the timing memory request to ruby
+    ComputeUnit::DataPort::MemReqEvent *mem_req_event =
+        new ComputeUnit::DataPort::MemReqEvent(computeUnit->memPort[mp_index],
+                                               new_pkt);
+
+    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n",
+            computeUnit->cu_id, gpuDynInst->simdId,
+            gpuDynInst->wfSlotId, mp_index, new_pkt->req->getPaddr());
+
+    computeUnit->schedule(mem_req_event, curTick() +
+                          computeUnit->req_tick_latency);
+
+    return true;
+}
+
+const char*
+ComputeUnit::DataPort::MemReqEvent::description() const
+{
+    return "ComputeUnit memory request event";
+}
+
+void
+ComputeUnit::DataPort::MemReqEvent::process()
+{
+    SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
+    GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
+    ComputeUnit *compute_unit M5_VAR_USED = dataPort->computeUnit;
+
+    if (!(dataPort->sendTimingReq(pkt))) {
+        dataPort->retries.push_back(std::make_pair(pkt, gpuDynInst));
+
+        DPRINTF(GPUPort,
+                "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
+                compute_unit->cu_id, gpuDynInst->simdId,
+                gpuDynInst->wfSlotId, dataPort->index,
+                pkt->req->getPaddr());
+    } else {
+        DPRINTF(GPUPort,
+                "CU%d: WF[%d][%d]: index %d, addr %#x data req sent!\n",
+                compute_unit->cu_id, gpuDynInst->simdId,
+                gpuDynInst->wfSlotId, dataPort->index,
+                pkt->req->getPaddr());
+    }
+}
+
+/*
+ * The initial translation request could have been rejected if the
+ * <retries> queue is not empty. Retry sending the translation
+ * request. sendRetry() is called from the peer port whenever
+ * a translation completes.
+ */
+void
+ComputeUnit::DTLBPort::recvReqRetry()
+{
+    int len = retries.size();
+
+    DPRINTF(GPUTLB, "CU%d: DTLB recvReqRetry - %d pending requests\n",
+            computeUnit->cu_id, len);
+
+    assert(len > 0);
+    assert(isStalled());
+    // recvReqRetry is an indication that the resource this port was
+    // stalling on has been freed, so remove the stall first
+    unstallPort();
+
+    for (int i = 0; i < len; ++i) {
+        PacketPtr pkt = retries.front();
+        Addr vaddr M5_VAR_USED = pkt->req->getVaddr();
+        DPRINTF(GPUTLB, "CU%d: retrying D-translation for addr %#x",
+                computeUnit->cu_id, vaddr);
+
+        if (!sendTimingReq(pkt)) {
+            // Stall port
+            stallPort();
+            DPRINTF(GPUTLB, ": failed again\n");
+            break;
+        } else {
+            DPRINTF(GPUTLB, ": successful\n");
+            retries.pop_front();
+        }
+    }
+}
+
+bool
+ComputeUnit::ITLBPort::recvTimingResp(PacketPtr pkt)
+{
+    Addr line M5_VAR_USED = pkt->req->getPaddr();
+    DPRINTF(GPUTLB, "CU%d: ITLBPort received %#x->%#x\n",
+            computeUnit->cu_id, pkt->req->getVaddr(), line);
+
+    assert(pkt->senderState);
+
+    // pop off the TLB translation state
+    TheISA::GpuTLB::TranslationState *translation_state =
+        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
+
+    bool success = translation_state->tlbEntry->valid;
+    delete translation_state->tlbEntry;
+    assert(!translation_state->ports.size());
+    pkt->senderState = translation_state->saved;
+    delete translation_state;
+
+    // use the original sender state to know how to close this transaction
+    ITLBPort::SenderState *sender_state =
+        safe_cast<ITLBPort::SenderState*>(pkt->senderState);
+
+    // get the wavefront associated with this translation request
+    Wavefront *wavefront = sender_state->wavefront;
+    delete pkt->senderState;
+
+    if (success) {
+        // pkt is reused in fetch(), don't delete it here.
However, we must + // reset the command to be a request so that it can be sent through + // the cu's master port + assert(pkt->cmd == MemCmd::ReadResp); + pkt->cmd = MemCmd::ReadReq; + + computeUnit->fetchStage.fetch(pkt, wavefront); + } else { + if (wavefront->dropFetch) { + assert(wavefront->instructionBuffer.empty()); + wavefront->dropFetch = false; + } + + wavefront->pendingFetch = 0; + } + + return true; +} + +/* + * The initial translation request could have been rejected, if + * <retries> queue is not empty. Retry sending the translation + * request. sendRetry() is called from the peer port whenever + * a translation completes. + */ +void +ComputeUnit::ITLBPort::recvReqRetry() +{ + + int len = retries.size(); + DPRINTF(GPUTLB, "CU%d: ITLB recvReqRetry - %d pending requests\n", len); + + assert(len > 0); + assert(isStalled()); + + // recvReqRetry is an indication that the resource on which this + // port was stalling on is freed. So, remove the stall first + unstallPort(); + + for (int i = 0; i < len; ++i) { + PacketPtr pkt = retries.front(); + Addr vaddr M5_VAR_USED = pkt->req->getVaddr(); + DPRINTF(GPUTLB, "CU%d: retrying I-translaton for address%#x", vaddr); + + if (!sendTimingReq(pkt)) { + stallPort(); // Stall port + DPRINTF(GPUTLB, ": failed again\n"); + break; + } else { + DPRINTF(GPUTLB, ": successful\n"); + retries.pop_front(); + } + } +} + +void +ComputeUnit::regStats() +{ + tlbCycles + .name(name() + ".tlb_cycles") + .desc("total number of cycles for all uncoalesced requests") + ; + + tlbRequests + .name(name() + ".tlb_requests") + .desc("number of uncoalesced requests") + ; + + tlbLatency + .name(name() + ".avg_translation_latency") + .desc("Avg. translation latency for data translations") + ; + + tlbLatency = tlbCycles / tlbRequests; + + hitsPerTLBLevel + .init(4) + .name(name() + ".TLB_hits_distribution") + .desc("TLB hits distribution (0 for page table, x for Lx-TLB") + ; + + // fixed number of TLB levels + for (int i = 0; i < 4; ++i) { + if (!i) + hitsPerTLBLevel.subname(i,"page_table"); + else + hitsPerTLBLevel.subname(i, csprintf("L%d_TLB",i)); + } + + execRateDist + .init(0, 10, 2) + .name(name() + ".inst_exec_rate") + .desc("Instruction Execution Rate: Number of executed vector " + "instructions per cycle") + ; + + ldsBankConflictDist + .init(0, VSZ, 2) + .name(name() + ".lds_bank_conflicts") + .desc("Number of bank conflicts per LDS memory packet") + ; + + ldsBankAccesses + .name(name() + ".lds_bank_access_cnt") + .desc("Total number of LDS bank accesses") + ; + + pageDivergenceDist + // A wavefront can touch 1 to VSZ pages per memory instruction. + // The number of pages per bin can be configured (here it's 4). + .init(1, VSZ, 4) + .name(name() + ".page_divergence_dist") + .desc("pages touched per wf (over all mem. instr.)") + ; + + controlFlowDivergenceDist + .init(1, VSZ, 4) + .name(name() + ".warp_execution_dist") + .desc("number of lanes active per instruction (oval all instructions)") + ; + + activeLanesPerGMemInstrDist + .init(1, VSZ, 4) + .name(name() + ".gmem_lanes_execution_dist") + .desc("number of active lanes per global memory instruction") + ; + + activeLanesPerLMemInstrDist + .init(1, VSZ, 4) + .name(name() + ".lmem_lanes_execution_dist") + .desc("number of active lanes per local memory instruction") + ; + + numInstrExecuted + .name(name() + ".num_instr_executed") + .desc("number of instructions executed") + ; + + numVecOpsExecuted + .name(name() + ".num_vec_ops_executed") + .desc("number of vec ops executed (e.g. 
VSZ/inst)") + ; + + totalCycles + .name(name() + ".num_total_cycles") + .desc("number of cycles the CU ran for") + ; + + ipc + .name(name() + ".ipc") + .desc("Instructions per cycle (this CU only)") + ; + + vpc + .name(name() + ".vpc") + .desc("Vector Operations per cycle (this CU only)") + ; + + numALUInstsExecuted + .name(name() + ".num_alu_insts_executed") + .desc("Number of dynamic non-GM memory insts executed") + ; + + wgBlockedDueLdsAllocation + .name(name() + ".wg_blocked_due_lds_alloc") + .desc("Workgroup blocked due to LDS capacity") + ; + + ipc = numInstrExecuted / totalCycles; + vpc = numVecOpsExecuted / totalCycles; + + numTimesWgBlockedDueVgprAlloc + .name(name() + ".times_wg_blocked_due_vgpr_alloc") + .desc("Number of times WGs are blocked due to VGPR allocation per SIMD") + ; + + dynamicGMemInstrCnt + .name(name() + ".global_mem_instr_cnt") + .desc("dynamic global memory instructions count") + ; + + dynamicLMemInstrCnt + .name(name() + ".local_mem_instr_cnt") + .desc("dynamic local memory intruction count") + ; + + numALUInstsExecuted = numInstrExecuted - dynamicGMemInstrCnt - + dynamicLMemInstrCnt; + + completedWfs + .name(name() + ".num_completed_wfs") + .desc("number of completed wavefronts") + ; + + numCASOps + .name(name() + ".num_CAS_ops") + .desc("number of compare and swap operations") + ; + + numFailedCASOps + .name(name() + ".num_failed_CAS_ops") + .desc("number of compare and swap operations that failed") + ; + + // register stats of pipeline stages + fetchStage.regStats(); + scoreboardCheckStage.regStats(); + scheduleStage.regStats(); + execStage.regStats(); + + // register stats of memory pipeline + globalMemoryPipe.regStats(); + localMemoryPipe.regStats(); +} + +void +ComputeUnit::updatePageDivergenceDist(Addr addr) +{ + Addr virt_page_addr = roundDown(addr, TheISA::PageBytes); + + if (!pagesTouched.count(virt_page_addr)) + pagesTouched[virt_page_addr] = 1; + else + pagesTouched[virt_page_addr]++; +} + +void +ComputeUnit::CUExitCallback::process() +{ + if (computeUnit->countPages) { + std::ostream *page_stat_file = + simout.create(computeUnit->name().c_str()); + + *page_stat_file << "page, wavefront accesses, workitem accesses" << + std::endl; + + for (auto iter : computeUnit->pageAccesses) { + *page_stat_file << std::hex << iter.first << ","; + *page_stat_file << std::dec << iter.second.first << ","; + *page_stat_file << std::dec << iter.second.second << std::endl; + } + } + } + +bool +ComputeUnit::isDone() const +{ + for (int i = 0; i < numSIMDs; ++i) { + if (!isSimdDone(i)) { + return false; + } + } + + bool glbMemBusRdy = true; + for (int j = 0; j < numGlbMemUnits; ++j) { + glbMemBusRdy &= vrfToGlobalMemPipeBus[j].rdy(); + } + bool locMemBusRdy = true; + for (int j = 0; j < numLocMemUnits; ++j) { + locMemBusRdy &= vrfToLocalMemPipeBus[j].rdy(); + } + + if (!globalMemoryPipe.isGMLdRespFIFOWrRdy() || + !globalMemoryPipe.isGMStRespFIFOWrRdy() || + !globalMemoryPipe.isGMReqFIFOWrRdy() || !localMemoryPipe.isLMReqFIFOWrRdy() + || !localMemoryPipe.isLMRespFIFOWrRdy() || !locMemToVrfBus.rdy() || + !glbMemToVrfBus.rdy() || !locMemBusRdy || !glbMemBusRdy) { + return false; + } + + return true; +} + +int32_t +ComputeUnit::getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const +{ + return lds.getRefCounter(dispatchId, wgId); +} + +bool +ComputeUnit::isSimdDone(uint32_t simdId) const +{ + assert(simdId < numSIMDs); + + for (int i=0; i < numGlbMemUnits; ++i) { + if (!vrfToGlobalMemPipeBus[i].rdy()) + return false; + } + for (int i=0; i < numLocMemUnits; ++i) 
{ + if (!vrfToLocalMemPipeBus[i].rdy()) + return false; + } + if (!aluPipe[simdId].rdy()) { + return false; + } + + for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf){ + if (wfList[simdId][i_wf]->status != Wavefront::S_STOPPED) { + return false; + } + } + + return true; +} + +/** + * send a general request to the LDS + * make sure to look at the return value here as your request might be + * NACK'd and returning false means that you have to have some backup plan + */ +bool +ComputeUnit::sendToLds(GPUDynInstPtr gpuDynInst) +{ + // this is just a request to carry the GPUDynInstPtr + // back and forth + Request *newRequest = new Request(); + newRequest->setPaddr(0x0); + + // ReadReq is not evaluted by the LDS but the Packet ctor requires this + PacketPtr newPacket = new Packet(newRequest, MemCmd::ReadReq); + + // This is the SenderState needed upon return + newPacket->senderState = new LDSPort::SenderState(gpuDynInst); + + return ldsPort->sendTimingReq(newPacket); +} + +/** + * get the result of packets sent to the LDS when they return + */ +bool +ComputeUnit::LDSPort::recvTimingResp(PacketPtr packet) +{ + const ComputeUnit::LDSPort::SenderState *senderState = + dynamic_cast<ComputeUnit::LDSPort::SenderState *>(packet->senderState); + + fatal_if(!senderState, "did not get the right sort of sender state"); + + GPUDynInstPtr gpuDynInst = senderState->getMemInst(); + + delete packet->senderState; + delete packet->req; + delete packet; + + computeUnit->localMemoryPipe.getLMRespFIFO().push(gpuDynInst); + return true; +} + +/** + * attempt to send this packet, either the port is already stalled, the request + * is nack'd and must stall or the request goes through + * when a request cannot be sent, add it to the retries queue + */ +bool +ComputeUnit::LDSPort::sendTimingReq(PacketPtr pkt) +{ + ComputeUnit::LDSPort::SenderState *sender_state = + dynamic_cast<ComputeUnit::LDSPort::SenderState*>(pkt->senderState); + fatal_if(!sender_state, "packet without a valid sender state"); + + GPUDynInstPtr gpuDynInst M5_VAR_USED = sender_state->getMemInst(); + + if (isStalled()) { + fatal_if(retries.empty(), "must have retries waiting to be stalled"); + + retries.push(pkt); + + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: LDS send failed!\n", + computeUnit->cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId); + return false; + } else if (!MasterPort::sendTimingReq(pkt)) { + // need to stall the LDS port until a recvReqRetry() is received + // this indicates that there is more space + stallPort(); + retries.push(pkt); + + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req failed!\n", + computeUnit->cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, pkt->req->getPaddr()); + return false; + } else { + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req sent!\n", + computeUnit->cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, pkt->req->getPaddr()); + return true; + } +} + +/** + * the bus is telling the port that there is now space so retrying stalled + * requests should work now + * this allows the port to have a request be nack'd and then have the receiver + * say when there is space, rather than simply retrying the send every cycle + */ +void +ComputeUnit::LDSPort::recvReqRetry() +{ + auto queueSize = retries.size(); + + DPRINTF(GPUPort, "CU%d: LDSPort recvReqRetry - %d pending requests\n", + computeUnit->cu_id, queueSize); + + fatal_if(queueSize < 1, + "why was there a recvReqRetry() with no pending reqs?"); + fatal_if(!isStalled(), + "recvReqRetry() happened when the port was not stalled"); + + unstallPort(); + + while 
(!retries.empty()) { + PacketPtr packet = retries.front(); + + DPRINTF(GPUPort, "CU%d: retrying LDS send\n", computeUnit->cu_id); + + if (!MasterPort::sendTimingReq(packet)) { + // Stall port + stallPort(); + DPRINTF(GPUPort, ": LDS send failed again\n"); + break; + } else { + DPRINTF(GPUTLB, ": LDS send successful\n"); + retries.pop(); + } + } +} diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh new file mode 100644 index 000000000..f47c27a0a --- /dev/null +++ b/src/gpu-compute/compute_unit.hh @@ -0,0 +1,767 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: John Kalamatianos, Anthony Gutierrez + */ + +#ifndef __COMPUTE_UNIT_HH__ +#define __COMPUTE_UNIT_HH__ + +#include <deque> +#include <map> +#include <unordered_map> +#include <vector> + +#include "base/callback.hh" +#include "base/statistics.hh" +#include "base/types.hh" +#include "enums/PrefetchType.hh" +#include "gpu-compute/exec_stage.hh" +#include "gpu-compute/fetch_stage.hh" +#include "gpu-compute/global_memory_pipeline.hh" +#include "gpu-compute/local_memory_pipeline.hh" +#include "gpu-compute/qstruct.hh" +#include "gpu-compute/schedule_stage.hh" +#include "gpu-compute/scoreboard_check_stage.hh" +#include "mem/mem_object.hh" +#include "mem/port.hh" + +static const int MAX_REGS_FOR_NON_VEC_MEM_INST = 1; +static const int MAX_WIDTH_FOR_MEM_INST = 32; + +class NDRange; +class Shader; +class VectorRegisterFile; + +struct ComputeUnitParams; + +enum EXEC_POLICY +{ + OLDEST = 0, + RR +}; + +// List of execution units +enum EXEC_UNIT +{ + SIMD0 = 0, + SIMD1, + SIMD2, + SIMD3, + GLBMEM_PIPE, + LDSMEM_PIPE, + NUM_UNITS +}; + +enum TLB_CACHE +{ + TLB_MISS_CACHE_MISS = 0, + TLB_MISS_CACHE_HIT, + TLB_HIT_CACHE_MISS, + TLB_HIT_CACHE_HIT +}; + +class ComputeUnit : public MemObject +{ + public: + FetchStage fetchStage; + ScoreboardCheckStage scoreboardCheckStage; + ScheduleStage scheduleStage; + ExecStage execStage; + GlobalMemPipeline globalMemoryPipe; + LocalMemPipeline localMemoryPipe; + + // Buffers used to communicate between various pipeline stages + + // List of waves which are ready to be scheduled. + // Each execution resource has a ready list. readyList is + // used to communicate between scoreboardCheck stage and + // schedule stage + // TODO: make enum to index readyList + std::vector<std::vector<Wavefront*>> readyList; + + // Stores the status of waves. A READY implies the + // wave is ready to be scheduled this cycle and + // is already present in the readyList. waveStatusList is + // used to communicate between scoreboardCheck stage and + // schedule stage + // TODO: convert std::pair to a class to increase readability + std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>> waveStatusList; + + // List of waves which will be dispatched to + // each execution resource. A FILLED implies + // dispatch list is non-empty and + // execution unit has something to execute + // this cycle. Currently, the dispatch list of + // an execution resource can hold only one wave because + // an execution resource can execute only one wave in a cycle. 
+ // dispatchList is used to communicate between schedule + // and exec stage + // TODO: convert std::pair to a class to increase readability + std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> dispatchList; + + int rrNextMemID; // used by RR WF exec policy to cycle through WF's + int rrNextALUWp; + typedef ComputeUnitParams Params; + std::vector<std::vector<Wavefront*>> wfList; + int cu_id; + + // array of vector register files, one per SIMD + std::vector<VectorRegisterFile*> vrf; + // Number of vector ALU units (SIMDs) in CU + int numSIMDs; + // number of pipe stages for bypassing data to next dependent single + // precision vector instruction inside the vector ALU pipeline + int spBypassPipeLength; + // number of pipe stages for bypassing data to next dependent double + // precision vector instruction inside the vector ALU pipeline + int dpBypassPipeLength; + // number of cycles per issue period + int issuePeriod; + + // Number of global and local memory execution resources in CU + int numGlbMemUnits; + int numLocMemUnits; + // tracks the last cycle a vector instruction was executed on a SIMD + std::vector<uint64_t> lastExecCycle; + + // true if we allow a separate TLB per lane + bool perLaneTLB; + // if 0, TLB prefetching is off. + int prefetchDepth; + // if fixed-stride prefetching, this is the stride. + int prefetchStride; + + class LastVaddrWave + { + public: + Addr vaddrs[VSZ]; + Addr& operator[](int idx) { + return vaddrs[idx]; + } + + LastVaddrWave() { + for (int i = 0; i < VSZ; ++i) + vaddrs[i] = 0; + } + }; + + LastVaddrWave lastVaddrCU; + std::vector<LastVaddrWave> lastVaddrPhase; + std::vector<std::vector<std::vector<Addr>>> lastVaddrWF; + Enums::PrefetchType prefetchType; + EXEC_POLICY exec_policy; + + bool xact_cas_mode; + bool debugSegFault; + bool functionalTLB; + bool localMemBarrier; + + /* + * for Counting page accesses + * + * cuExitCallback inherits from Callback. When you register a callback + * function as an exit callback, it will get added to an exit callback + * queue, such that on simulation exit, all callbacks in the callback + * queue will have their process() function called. 
+ */ + bool countPages; + + Shader *shader; + uint32_t barrier_id; + // vector of Vector ALU (MACC) pipelines + std::vector<WaitClass> aluPipe; + // minimum issue period per SIMD unit (in cycles) + std::vector<WaitClass> wfWait; + + // Resource control for Vector Register File->Global Memory pipe buses + std::vector<WaitClass> vrfToGlobalMemPipeBus; + // Resource control for Vector Register File->Local Memory pipe buses + std::vector<WaitClass> vrfToLocalMemPipeBus; + int nextGlbMemBus; + int nextLocMemBus; + // Resource control for global memory to VRF data/address bus + WaitClass glbMemToVrfBus; + // Resource control for local memory to VRF data/address bus + WaitClass locMemToVrfBus; + + uint32_t vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes + uint32_t coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes + uint32_t numCyclesPerStoreTransfer; // number of cycles per vector store + uint32_t numCyclesPerLoadTransfer; // number of cycles per vector load + + Tick req_tick_latency; + Tick resp_tick_latency; + + // number of vector registers being reserved for each SIMD unit + std::vector<int> vectorRegsReserved; + // number of vector registers per SIMD unit + uint32_t numVecRegsPerSimd; + // Support for scheduling VGPR status update events + std::vector<std::pair<uint32_t, uint32_t> > regIdxVec; + std::vector<uint64_t> timestampVec; + std::vector<uint8_t> statusVec; + + void + registerEvent(uint32_t simdId, + uint32_t regIdx, + uint32_t operandSize, + uint64_t when, + uint8_t newStatus) { + regIdxVec.push_back(std::make_pair(simdId, regIdx)); + timestampVec.push_back(when); + statusVec.push_back(newStatus); + if (operandSize > 4) { + regIdxVec.push_back(std::make_pair(simdId, + ((regIdx + 1) % + numVecRegsPerSimd))); + timestampVec.push_back(when); + statusVec.push_back(newStatus); + } + } + + void updateEvents(); + + // this hash map will keep track of page divergence + // per memory instruction per wavefront. The hash map + // is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc. 
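// [Illustrative note, not part of this patch] The pagesTouched map declared
// just below is what implements the page-divergence bookkeeping: each lane
// address is rounded down to its page frame and counted, and the number of
// distinct pages per memory instruction later feeds pageDivergenceDist.
// A standalone sketch of that counting, with simplified types:
#if 0
#include <cstdint>
#include <iostream>
#include <map>
#include <vector>

using Addr = uint64_t;
static const Addr PageBytes = 4096;

static Addr roundDownToPage(Addr a) { return a & ~(PageBytes - 1); }

int main()
{
    // per-lane addresses of one (hypothetical) vector memory instruction
    std::vector<Addr> laneAddrs = {0x1000, 0x1040, 0x2010, 0x2f00, 0x5008};

    std::map<Addr, int> pagesTouched;     // page frame -> access count
    for (Addr a : laneAddrs)
        ++pagesTouched[roundDownToPage(a)];

    // 0x1000/0x1040 share a page, 0x2010/0x2f00 share a page, 0x5008 is alone
    std::cout << "pages touched: " << pagesTouched.size() << "\n";   // 3
}
#endif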
+ std::map<Addr, int> pagesTouched; + + ComputeUnit(const Params *p); + ~ComputeUnit(); + int spBypassLength() { return spBypassPipeLength; }; + int dpBypassLength() { return dpBypassPipeLength; }; + int storeBusLength() { return numCyclesPerStoreTransfer; }; + int loadBusLength() { return numCyclesPerLoadTransfer; }; + int wfSize() const { return wavefrontSize; }; + + void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs); + void exec(); + void initiateFetch(Wavefront *wavefront); + void fetch(PacketPtr pkt, Wavefront *wavefront); + void FillKernelState(Wavefront *w, NDRange *ndr); + + void StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[], + int trueWgSizeTotal); + + void InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt, + int trueWgSize[], int trueWgSizeTotal, + LdsChunk *ldsChunk, uint64_t origSpillMemStart); + + void StartWorkgroup(NDRange *ndr); + int ReadyWorkgroup(NDRange *ndr); + + bool isVecAlu(int unitId) { return unitId >= SIMD0 && unitId <= SIMD3; } + bool isGlbMem(int unitId) { return unitId == GLBMEM_PIPE; } + bool isShrMem(int unitId) { return unitId == LDSMEM_PIPE; } + int GlbMemUnitId() { return GLBMEM_PIPE; } + int ShrMemUnitId() { return LDSMEM_PIPE; } + int nextGlbRdBus() { return (++nextGlbMemBus) % numGlbMemUnits; } + int nextLocRdBus() { return (++nextLocMemBus) % numLocMemUnits; } + /* This function cycles through all the wavefronts in all the phases to see + * if all of the wavefronts which should be associated with one barrier + * (denoted with _barrier_id), are all at the same barrier in the program + * (denoted by bcnt). When the number at the barrier matches bslots, then + * return true. + */ + int AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots); + bool cedeSIMD(int simdId, int wfSlotId); + + template<typename c0, typename c1> void doSmReturn(GPUDynInstPtr gpuDynInst); + virtual void init(); + void sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt); + void sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt); + void injectGlobalMemFence(GPUDynInstPtr gpuDynInst, + bool kernelLaunch=true, + RequestPtr req=nullptr); + void handleMemPacket(PacketPtr pkt, int memport_index); + bool processTimingPacket(PacketPtr pkt); + void processFetchReturn(PacketPtr pkt); + void updatePageDivergenceDist(Addr addr); + + MasterID masterId() { return _masterId; } + + bool isDone() const; + bool isSimdDone(uint32_t) const; + + protected: + MasterID _masterId; + + LdsState &lds; + + public: + // the following stats compute the avg. TLB accesslatency per + // uncoalesced request (only for data) + Stats::Scalar tlbRequests; + Stats::Scalar tlbCycles; + Stats::Formula tlbLatency; + // hitsPerTLBLevel[x] are the hits in Level x TLB. x = 0 is the page table. + Stats::Vector hitsPerTLBLevel; + + Stats::Scalar ldsBankAccesses; + Stats::Distribution ldsBankConflictDist; + + // over all memory instructions executed over all wavefronts + // how many touched 0-4 pages, 4-8, ..., 60-64 pages + Stats::Distribution pageDivergenceDist; + Stats::Scalar dynamicGMemInstrCnt; + Stats::Scalar dynamicLMemInstrCnt; + + Stats::Scalar wgBlockedDueLdsAllocation; + // Number of instructions executed, i.e. 
if 64 (or 32 or 7) lanes are active + // when the instruction is committed, this number is still incremented by 1 + Stats::Scalar numInstrExecuted; + // Number of cycles among successive instruction executions across all + // wavefronts of the same CU + Stats::Distribution execRateDist; + // number of individual vector operations executed + Stats::Scalar numVecOpsExecuted; + // Total cycles that something is running on the GPU + Stats::Scalar totalCycles; + Stats::Formula vpc; // vector ops per cycle + Stats::Formula ipc; // vector instructions per cycle + Stats::Distribution controlFlowDivergenceDist; + Stats::Distribution activeLanesPerGMemInstrDist; + Stats::Distribution activeLanesPerLMemInstrDist; + // number of vector ALU instructions received + Stats::Formula numALUInstsExecuted; + // number of times a WG can not start due to lack of free VGPRs in SIMDs + Stats::Scalar numTimesWgBlockedDueVgprAlloc; + Stats::Scalar numCASOps; + Stats::Scalar numFailedCASOps; + Stats::Scalar completedWfs; + // flag per vector SIMD unit that is set when there is at least one + // WV that has a vector ALU instruction as the oldest in its + // Instruction Buffer: Defined in the Scoreboard stage, consumed + // by the Execute stage. + std::vector<bool> vectorAluInstAvail; + // number of available (oldest) LDS instructions that could have + // been issued to the LDS at a specific issue slot + int shrMemInstAvail; + // number of available Global memory instructions that could have + // been issued to TCP at a specific issue slot + int glbMemInstAvail; + + void + regStats(); + + LdsState & + getLds() const + { + return lds; + } + + int32_t + getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const; + + bool + sendToLds(GPUDynInstPtr gpuDynInst) __attribute__((warn_unused_result)); + + typedef std::unordered_map<Addr, std::pair<int, int>> pageDataStruct; + pageDataStruct pageAccesses; + + class CUExitCallback : public Callback + { + private: + ComputeUnit *computeUnit; + + public: + virtual ~CUExitCallback() { } + + CUExitCallback(ComputeUnit *_cu) + { + computeUnit = _cu; + } + + virtual void + process(); + }; + + CUExitCallback *cuExitCallback; + + /** Data access Port **/ + class DataPort : public MasterPort + { + public: + DataPort(const std::string &_name, ComputeUnit *_cu, PortID _index) + : MasterPort(_name, _cu), computeUnit(_cu), + index(_index) { } + + bool snoopRangeSent; + + struct SenderState : public Packet::SenderState + { + GPUDynInstPtr _gpuDynInst; + int port_index; + Packet::SenderState *saved; + + SenderState(GPUDynInstPtr gpuDynInst, PortID _port_index, + Packet::SenderState *sender_state=nullptr) + : _gpuDynInst(gpuDynInst), + port_index(_port_index), + saved(sender_state) { } + }; + + class MemReqEvent : public Event + { + private: + DataPort *dataPort; + PacketPtr pkt; + + public: + MemReqEvent(DataPort *_data_port, PacketPtr _pkt) + : Event(), dataPort(_data_port), pkt(_pkt) + { + setFlags(Event::AutoDelete); + } + + void process(); + const char *description() const; + }; + + class MemRespEvent : public Event + { + private: + DataPort *dataPort; + PacketPtr pkt; + + public: + MemRespEvent(DataPort *_data_port, PacketPtr _pkt) + : Event(), dataPort(_data_port), pkt(_pkt) + { + setFlags(Event::AutoDelete); + } + + void process(); + const char *description() const; + }; + + std::deque<std::pair<PacketPtr, GPUDynInstPtr>> retries; + + protected: + ComputeUnit *computeUnit; + int index; + + virtual bool recvTimingResp(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt) { 
return 0; } + virtual void recvFunctional(PacketPtr pkt) { } + virtual void recvRangeChange() { } + virtual void recvReqRetry(); + + virtual void + getDeviceAddressRanges(AddrRangeList &resp, bool &snoop) + { + resp.clear(); + snoop = true; + } + + }; + + // Instruction cache access port + class SQCPort : public MasterPort + { + public: + SQCPort(const std::string &_name, ComputeUnit *_cu, PortID _index) + : MasterPort(_name, _cu), computeUnit(_cu), + index(_index) { } + + bool snoopRangeSent; + + struct SenderState : public Packet::SenderState + { + Wavefront *wavefront; + Packet::SenderState *saved; + + SenderState(Wavefront *_wavefront, Packet::SenderState + *sender_state=nullptr) + : wavefront(_wavefront), saved(sender_state) { } + }; + + std::deque<std::pair<PacketPtr, Wavefront*>> retries; + + protected: + ComputeUnit *computeUnit; + int index; + + virtual bool recvTimingResp(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt) { return 0; } + virtual void recvFunctional(PacketPtr pkt) { } + virtual void recvRangeChange() { } + virtual void recvReqRetry(); + + virtual void + getDeviceAddressRanges(AddrRangeList &resp, bool &snoop) + { + resp.clear(); + snoop = true; + } + }; + + /** Data TLB port **/ + class DTLBPort : public MasterPort + { + public: + DTLBPort(const std::string &_name, ComputeUnit *_cu, PortID _index) + : MasterPort(_name, _cu), computeUnit(_cu), + index(_index), stalled(false) + { } + + bool isStalled() { return stalled; } + void stallPort() { stalled = true; } + void unstallPort() { stalled = false; } + + /** + * here we queue all the translation requests that were + * not successfully sent. + */ + std::deque<PacketPtr> retries; + + /** SenderState is information carried along with the packet + * throughout the TLB hierarchy + */ + struct SenderState: public Packet::SenderState + { + // the memInst that this is associated with + GPUDynInstPtr _gpuDynInst; + + // the lane in the memInst this is associated with, so we send + // the memory request down the right port + int portIndex; + + // constructor used for packets involved in timing accesses + SenderState(GPUDynInstPtr gpuDynInst, PortID port_index) + : _gpuDynInst(gpuDynInst), portIndex(port_index) { } + + }; + + protected: + ComputeUnit *computeUnit; + int index; + bool stalled; + + virtual bool recvTimingResp(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt) { return 0; } + virtual void recvFunctional(PacketPtr pkt) { } + virtual void recvRangeChange() { } + virtual void recvReqRetry(); + }; + + class ITLBPort : public MasterPort + { + public: + ITLBPort(const std::string &_name, ComputeUnit *_cu) + : MasterPort(_name, _cu), computeUnit(_cu), stalled(false) { } + + + bool isStalled() { return stalled; } + void stallPort() { stalled = true; } + void unstallPort() { stalled = false; } + + /** + * here we queue all the translation requests that were + * not successfully sent. 
+ */ + std::deque<PacketPtr> retries; + + /** SenderState is information carried along with the packet + * throughout the TLB hierarchy + */ + struct SenderState: public Packet::SenderState + { + // The wavefront associated with this request + Wavefront *wavefront; + + SenderState(Wavefront *_wavefront) : wavefront(_wavefront) { } + }; + + protected: + ComputeUnit *computeUnit; + bool stalled; + + virtual bool recvTimingResp(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt) { return 0; } + virtual void recvFunctional(PacketPtr pkt) { } + virtual void recvRangeChange() { } + virtual void recvReqRetry(); + }; + + /** + * the port intended to communicate between the CU and its LDS + */ + class LDSPort : public MasterPort + { + public: + LDSPort(const std::string &_name, ComputeUnit *_cu, PortID _id) + : MasterPort(_name, _cu, _id), computeUnit(_cu) + { + } + + bool isStalled() const { return stalled; } + void stallPort() { stalled = true; } + void unstallPort() { stalled = false; } + + /** + * here we queue all the requests that were + * not successfully sent. + */ + std::queue<PacketPtr> retries; + + /** + * SenderState is information carried along with the packet, esp. the + * GPUDynInstPtr + */ + class SenderState: public Packet::SenderState + { + protected: + // The actual read/write/atomic request that goes with this command + GPUDynInstPtr _gpuDynInst = nullptr; + + public: + SenderState(GPUDynInstPtr gpuDynInst): + _gpuDynInst(gpuDynInst) + { + } + + GPUDynInstPtr + getMemInst() const + { + return _gpuDynInst; + } + }; + + virtual bool + sendTimingReq(PacketPtr pkt); + + protected: + + bool stalled = false; ///< whether or not it is stalled + + ComputeUnit *computeUnit; + + virtual bool + recvTimingResp(PacketPtr pkt); + + virtual Tick + recvAtomic(PacketPtr pkt) { return 0; } + + virtual void + recvFunctional(PacketPtr pkt) + { + } + + virtual void + recvRangeChange() + { + } + + virtual void + recvReqRetry(); + }; + + /** The port to access the Local Data Store + * Can be connected to a LDS object + */ + LDSPort *ldsPort = nullptr; + + LDSPort * + getLdsPort() const + { + return ldsPort; + } + + /** The memory port for SIMD data accesses. + * Can be connected to PhysMem for Ruby for timing simulations + */ + std::vector<DataPort*> memPort; + // port to the TLB hierarchy (i.e., the L1 TLB) + std::vector<DTLBPort*> tlbPort; + // port to the SQC (i.e. 
the I-cache) + SQCPort *sqcPort; + // port to the SQC TLB (there's a separate TLB for each I-cache) + ITLBPort *sqcTLBPort; + + virtual BaseMasterPort& + getMasterPort(const std::string &if_name, PortID idx) + { + if (if_name == "memory_port") { + memPort[idx] = new DataPort(csprintf("%s-port%d", name(), idx), + this, idx); + return *memPort[idx]; + } else if (if_name == "translation_port") { + tlbPort[idx] = new DTLBPort(csprintf("%s-port%d", name(), idx), + this, idx); + return *tlbPort[idx]; + } else if (if_name == "sqc_port") { + sqcPort = new SQCPort(csprintf("%s-port%d", name(), idx), + this, idx); + return *sqcPort; + } else if (if_name == "sqc_tlb_port") { + sqcTLBPort = new ITLBPort(csprintf("%s-port", name()), this); + return *sqcTLBPort; + } else if (if_name == "ldsPort") { + if (ldsPort) { + fatal("an LDS port was already allocated"); + } + ldsPort = new LDSPort(csprintf("%s-port", name()), this, idx); + return *ldsPort; + } else { + panic("incorrect port name"); + } + } + + // xact_cas_load() + class waveIdentifier + { + public: + waveIdentifier() { } + waveIdentifier(int _simdId, int _wfSlotId) + : simdId(_simdId), wfSlotId(_wfSlotId) { } + + int simdId; + int wfSlotId; + }; + + class waveQueue + { + public: + std::list<waveIdentifier> waveIDQueue; + }; + std::map<unsigned, waveQueue> xactCasLoadMap; + + uint64_t getAndIncSeqNum() { return globalSeqNum++; } + + private: + uint64_t globalSeqNum; + int wavefrontSize; +}; + +#endif // __COMPUTE_UNIT_HH__ diff --git a/src/gpu-compute/condition_register_state.cc b/src/gpu-compute/condition_register_state.cc new file mode 100644 index 000000000..f3f2d2927 --- /dev/null +++ b/src/gpu-compute/condition_register_state.cc @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: John Kalamatianos + */ + +#include "gpu-compute/condition_register_state.hh" + +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_static_inst.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/wavefront.hh" + +ConditionRegisterState::ConditionRegisterState() +{ + computeUnit = nullptr; + c_reg.clear(); + busy.clear(); +} + +void +ConditionRegisterState::setParent(ComputeUnit *_computeUnit) +{ + computeUnit = _computeUnit; + _name = computeUnit->name() + ".CondRegState"; +} + +void +ConditionRegisterState::init(uint32_t _size) +{ + c_reg.resize(_size); + busy.resize(_size, 0); +} + +void +ConditionRegisterState::exec(GPUStaticInst *ii, Wavefront *w) +{ + // iterate over all operands + for (auto i = 0; i < ii->getNumOperands(); ++i) { + // is this a condition register destination operand? + if (ii->isCondRegister(i) && ii->isDstOperand(i)) { + // mark the register as busy + markReg(ii->getRegisterIndex(i), 1); + uint32_t pipeLen = w->computeUnit->spBypassLength(); + + // schedule an event for marking the register as ready + w->computeUnit-> + registerEvent(w->simdId, ii->getRegisterIndex(i), + ii->getOperandSize(i), + w->computeUnit->shader->tick_cnt + + w->computeUnit->shader->ticks(pipeLen), 0); + } + } +} diff --git a/src/gpu-compute/condition_register_state.hh b/src/gpu-compute/condition_register_state.hh new file mode 100644 index 000000000..139874a66 --- /dev/null +++ b/src/gpu-compute/condition_register_state.hh @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: John Kalamatianos + */ + +#ifndef __CONDITION_REGISTER_STATE_HH__ +#define __CONDITION_REGISTER_STATE_HH__ + +#include <string> +#include <vector> + +#include "gpu-compute/misc.hh" + +class ComputeUnit; +class GPUStaticInst; +class Shader; +class Wavefront; + +// Condition Register State (used only when executing HSAIL) +class ConditionRegisterState +{ + public: + ConditionRegisterState(); + void init(uint32_t _size); + const std::string name() const { return _name; } + void setParent(ComputeUnit *_computeUnit); + void regStats() { } + + template<typename T> + T + read(int regIdx, int threadId) + { + bool tmp = c_reg[regIdx][threadId]; + T *p0 = (T*)(&tmp); + + return *p0; + } + + template<typename T> + void + write(int regIdx, int threadId, T value) + { + c_reg[regIdx][threadId] = (bool)(value & 0x01); + } + + void + markReg(int regIdx, uint8_t value) + { + busy.at(regIdx) = value; + } + + uint8_t + regBusy(int idx) + { + uint8_t status = busy.at(idx); + return status; + } + + int numRegs() { return c_reg.size(); } + void exec(GPUStaticInst *ii, Wavefront *w); + + private: + ComputeUnit* computeUnit; + std::string _name; + // Condition Register state + std::vector<VectorMask> c_reg; + // flag indicating if a register is busy + std::vector<uint8_t> busy; +}; + +#endif diff --git a/src/gpu-compute/dispatcher.cc b/src/gpu-compute/dispatcher.cc new file mode 100644 index 000000000..55e4be72a --- /dev/null +++ b/src/gpu-compute/dispatcher.cc @@ -0,0 +1,394 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Brad Beckmann, Marc Orr + */ + + +#include "gpu-compute/dispatcher.hh" + +#include "cpu/base.hh" +#include "debug/GPUDisp.hh" +#include "gpu-compute/cl_driver.hh" +#include "gpu-compute/cl_event.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/wavefront.hh" +#include "mem/packet_access.hh" + +GpuDispatcher *GpuDispatcher::instance = nullptr; + +GpuDispatcher::GpuDispatcher(const Params *p) + : DmaDevice(p), _masterId(p->system->getMasterId(name() + ".disp")), + pioAddr(p->pio_addr), pioSize(4096), pioDelay(p->pio_latency), + dispatchCount(0), dispatchActive(false), cpu(p->cpu), + shader(p->shader_pointer), driver(p->cl_driver), tickEvent(this) +{ + shader->handshake(this); + driver->handshake(this); + + ndRange.wg_disp_rem = false; + ndRange.globalWgId = 0; + + schedule(&tickEvent, 0); + + // translation port for the dispatcher + tlbPort = new TLBPort(csprintf("%s-port%d", name()), this); + + num_kernelLaunched + .name(name() + ".num_kernel_launched") + .desc("number of kernel launched") + ; +} + +GpuDispatcher *GpuDispatcherParams::create() +{ + GpuDispatcher *dispatcher = new GpuDispatcher(this); + GpuDispatcher::setInstance(dispatcher); + + return GpuDispatcher::getInstance(); +} + +void +GpuDispatcher::serialize(CheckpointOut &cp) const +{ + Tick event_tick = 0; + + if (ndRange.wg_disp_rem) + fatal("Checkpointing not supported during active workgroup execution"); + + if (tickEvent.scheduled()) + event_tick = tickEvent.when(); + + SERIALIZE_SCALAR(event_tick); + +} + +void +GpuDispatcher::unserialize(CheckpointIn &cp) +{ + Tick event_tick; + + if (tickEvent.scheduled()) + deschedule(&tickEvent); + + UNSERIALIZE_SCALAR(event_tick); + + if (event_tick) + schedule(&tickEvent, event_tick); +} + +AddrRangeList +GpuDispatcher::getAddrRanges() const +{ + AddrRangeList ranges; + + DPRINTF(GPUDisp, "dispatcher registering addr range at %#x size %#x\n", + pioAddr, pioSize); + + ranges.push_back(RangeSize(pioAddr, pioSize)); + + return ranges; +} + +Tick +GpuDispatcher::read(PacketPtr pkt) +{ + assert(pkt->getAddr() >= pioAddr); + assert(pkt->getAddr() < pioAddr + pioSize); + + int offset = pkt->getAddr() - pioAddr; + pkt->allocate(); + + DPRINTF(GPUDisp, " read register %#x size=%d\n", offset, pkt->getSize()); + + if (offset < 8) { + assert(!offset); + assert(pkt->getSize() == 8); + + uint64_t retval = dispatchActive; + pkt->set(retval); + } else { + offset -= 8; + assert(offset + pkt->getSize() < sizeof(HsaQueueEntry)); + char *curTaskPtr = (char*)&curTask; + + memcpy(pkt->getPtr<const void*>(), curTaskPtr + offset, pkt->getSize()); + } + + pkt->makeAtomicResponse(); + + return pioDelay; +} + +Tick +GpuDispatcher::write(PacketPtr pkt) +{ + assert(pkt->getAddr() >= pioAddr); + assert(pkt->getAddr() < pioAddr + pioSize); + + int offset = pkt->getAddr() - pioAddr; + +#if TRACING_ON + uint64_t data_val = 0; + + switch (pkt->getSize()) { + case 1: + data_val = pkt->get<uint8_t>(); + break; + case 2: + data_val = pkt->get<uint16_t>(); + break; + case 4: + data_val = pkt->get<uint32_t>(); + break; + case 8: + data_val = pkt->get<uint64_t>(); + break; + default: + DPRINTF(GPUDisp, "bad size %d\n", pkt->getSize()); + } + + DPRINTF(GPUDisp, "write register %#x value %#x size=%d\n", offset, data_val, + pkt->getSize()); +#endif + if (!offset) { + static int nextId = 0; + + // The depends field of the qstruct, which was previously unused, is + // used to communicate with simulated application. 
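+            // Editor's note (descriptive, not part of the original patch): if
+            // the host attached a dependent cl_event, the dispatcher reads the
+            // HostState struct back from simulated memory and stamps the
+            // event's start time before launching the kernel. The division by
+            // 1000 below converts ticks to nanoseconds, assuming the default
+            // 1 ps tick resolution.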
+ if (curTask.depends) { + HostState hs; + shader->ReadMem((uint64_t)(curTask.depends), &hs, + sizeof(HostState), 0); + + // update event start time (in nano-seconds) + uint64_t start = curTick() / 1000; + + shader->WriteMem((uint64_t)(&((_cl_event*)hs.event)->start), + &start, sizeof(uint64_t), 0); + } + + // launch kernel + ++num_kernelLaunched; + + NDRange *ndr = &(ndRangeMap[nextId]); + // copy dispatch info + ndr->q = curTask; + + // update the numDispTask polled by the runtime + accessUserVar(cpu, (uint64_t)(curTask.numDispLeft), 0, 1); + + ndr->numWgTotal = 1; + + for (int i = 0; i < 3; ++i) { + ndr->wgId[i] = 0; + ndr->numWg[i] = divCeil(curTask.gdSize[i], curTask.wgSize[i]); + ndr->numWgTotal *= ndr->numWg[i]; + } + + ndr->numWgCompleted = 0; + ndr->globalWgId = 0; + ndr->wg_disp_rem = true; + ndr->execDone = false; + ndr->addrToNotify = (volatile bool*)curTask.addrToNotify; + ndr->numDispLeft = (volatile uint32_t*)curTask.numDispLeft; + ndr->dispatchId = nextId; + ndr->curTid = pkt->req->threadId(); + DPRINTF(GPUDisp, "launching kernel %d\n",nextId); + execIds.push(nextId); + ++nextId; + + dispatchActive = true; + + if (!tickEvent.scheduled()) { + schedule(&tickEvent, curTick() + shader->ticks(1)); + } + } else { + // populate current task struct + // first 64 bits are launch reg + offset -= 8; + assert(offset < sizeof(HsaQueueEntry)); + char *curTaskPtr = (char*)&curTask; + memcpy(curTaskPtr + offset, pkt->getPtr<const void*>(), pkt->getSize()); + } + + pkt->makeAtomicResponse(); + + return pioDelay; +} + + +BaseMasterPort& +GpuDispatcher::getMasterPort(const std::string &if_name, PortID idx) +{ + if (if_name == "translation_port") { + return *tlbPort; + } + + return DmaDevice::getMasterPort(if_name, idx); +} + +void +GpuDispatcher::exec() +{ + int fail_count = 0; + + // There are potentially multiple outstanding kernel launches. + // It is possible that the workgroups in a different kernel + // can fit on the GPU even if another kernel's workgroups cannot + DPRINTF(GPUDisp, "Launching %d Kernels\n", execIds.size()); + + while (execIds.size() > fail_count) { + int execId = execIds.front(); + + while (ndRangeMap[execId].wg_disp_rem) { + //update the thread context + shader->updateThreadContext(ndRangeMap[execId].curTid); + + // attempt to dispatch_workgroup + if (!shader->dispatch_workgroups(&ndRangeMap[execId])) { + // if we failed try the next kernel, + // it may have smaller workgroups. 
+ // put it on the queue to rety latter + DPRINTF(GPUDisp, "kernel %d failed to launch\n", execId); + execIds.push(execId); + ++fail_count; + break; + } + } + // let's try the next kernel_id + execIds.pop(); + } + + DPRINTF(GPUDisp, "Returning %d Kernels\n", doneIds.size()); + + if (doneIds.size() && cpu) { + shader->hostWakeUp(cpu); + } + + while (doneIds.size()) { + // wakeup the CPU if any Kernels completed this cycle + DPRINTF(GPUDisp, "WorkGroup %d completed\n", doneIds.front()); + doneIds.pop(); + } +} + +void +GpuDispatcher::notifyWgCompl(Wavefront *w) +{ + int kern_id = w->kern_id; + DPRINTF(GPUDisp, "notify WgCompl %d\n",kern_id); + assert(ndRangeMap[kern_id].dispatchId == kern_id); + ndRangeMap[kern_id].numWgCompleted++; + + if (ndRangeMap[kern_id].numWgCompleted == ndRangeMap[kern_id].numWgTotal) { + ndRangeMap[kern_id].execDone = true; + doneIds.push(kern_id); + + if (ndRangeMap[kern_id].addrToNotify) { + accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].addrToNotify), 1, + 0); + } + + accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].numDispLeft), 0, -1); + + // update event end time (in nano-seconds) + if (ndRangeMap[kern_id].q.depends) { + HostState *host_state = (HostState*)ndRangeMap[kern_id].q.depends; + uint64_t event; + shader->ReadMem((uint64_t)(&host_state->event), &event, + sizeof(uint64_t), 0); + + uint64_t end = curTick() / 1000; + + shader->WriteMem((uint64_t)(&((_cl_event*)event)->end), &end, + sizeof(uint64_t), 0); + } + } + + if (!tickEvent.scheduled()) { + schedule(&tickEvent, curTick() + shader->ticks(1)); + } +} + +void +GpuDispatcher::scheduleDispatch() +{ + if (!tickEvent.scheduled()) + schedule(&tickEvent, curTick() + shader->ticks(1)); +} + +void +GpuDispatcher::accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off) +{ + if (cpu) { + if (off) { + shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::ReadReq, + true); + val += off; + } + + shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::WriteReq, true); + } else { + panic("Cannot find host"); + } +} + +GpuDispatcher::TickEvent::TickEvent(GpuDispatcher *_dispatcher) + : Event(CPU_Tick_Pri), dispatcher(_dispatcher) +{ +} + +void +GpuDispatcher::TickEvent::process() +{ + dispatcher->exec(); +} + +const char* +GpuDispatcher::TickEvent::description() const +{ + return "GPU Dispatcher tick"; +} + +// helper functions for driver to retrieve GPU attributes +int +GpuDispatcher::getNumCUs() +{ + return shader->cuList.size(); +} + +void +GpuDispatcher::setFuncargsSize(int funcargs_size) +{ + shader->funcargs_size = funcargs_size; +} diff --git a/src/gpu-compute/dispatcher.hh b/src/gpu-compute/dispatcher.hh new file mode 100644 index 000000000..76f932655 --- /dev/null +++ b/src/gpu-compute/dispatcher.hh @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Brad Beckmann, Marc Orr + */ + +#ifndef __GPU_DISPATCHER_HH__ +#define __GPU_DISPATCHER_HH__ + +#include <queue> +#include <vector> + +#include "base/statistics.hh" +#include "dev/dma_device.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/ndrange.hh" +#include "gpu-compute/qstruct.hh" +#include "mem/port.hh" +#include "params/GpuDispatcher.hh" + +class BaseCPU; +class Shader; + +class GpuDispatcher : public DmaDevice +{ + public: + typedef GpuDispatcherParams Params; + + class TickEvent : public Event + { + private: + GpuDispatcher *dispatcher; + + public: + TickEvent(GpuDispatcher *); + void process(); + const char *description() const; + }; + + MasterID masterId() { return _masterId; } + + protected: + MasterID _masterId; + + // Base and length of PIO register space + Addr pioAddr; + Addr pioSize; + Tick pioDelay; + + HsaQueueEntry curTask; + + std::unordered_map<int, NDRange> ndRangeMap; + NDRange ndRange; + + // list of kernel_ids to launch + std::queue<int> execIds; + // list of kernel_ids that have finished + std::queue<int> doneIds; + + uint64_t dispatchCount; + // is there a kernel in execution? + bool dispatchActive; + + BaseCPU *cpu; + Shader *shader; + ClDriver *driver; + TickEvent tickEvent; + + static GpuDispatcher *instance; + + // sycall emulation mode can have only 1 application running(?) + // else we have to do some pid based tagging + // unused + typedef std::unordered_map<uint64_t, uint64_t> TranslationBuffer; + TranslationBuffer tlb; + + public: + /*statistics*/ + Stats::Scalar num_kernelLaunched; + GpuDispatcher(const Params *p); + + ~GpuDispatcher() { } + + void exec(); + virtual void serialize(CheckpointOut &cp) const; + virtual void unserialize(CheckpointIn &cp); + void notifyWgCompl(Wavefront *w); + void scheduleDispatch(); + void accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off); + + // using singleton so that glue code can pass pointer locations + // to the dispatcher. 
when there are multiple dispatchers, we can + // call something like getInstance(index) + static void + setInstance(GpuDispatcher *_instance) + { + instance = _instance; + } + + static GpuDispatcher* getInstance() { return instance; } + + class TLBPort : public MasterPort + { + public: + + TLBPort(const std::string &_name, GpuDispatcher *_dispatcher) + : MasterPort(_name, _dispatcher), dispatcher(_dispatcher) { } + + protected: + GpuDispatcher *dispatcher; + + virtual bool recvTimingResp(PacketPtr pkt) { return true; } + virtual Tick recvAtomic(PacketPtr pkt) { return 0; } + virtual void recvFunctional(PacketPtr pkt) { } + virtual void recvRangeChange() { } + virtual void recvReqRetry() { } + + }; + + TLBPort *tlbPort; + + virtual BaseMasterPort& getMasterPort(const std::string &if_name, + PortID idx); + + AddrRangeList getAddrRanges() const; + Tick read(PacketPtr pkt); + Tick write(PacketPtr pkt); + + // helper functions to retrieve/set GPU attributes + int getNumCUs(); + void setFuncargsSize(int funcargs_size); +}; + +#endif // __GPU_DISPATCHER_HH__ diff --git a/src/gpu-compute/exec_stage.cc b/src/gpu-compute/exec_stage.cc new file mode 100644 index 000000000..c2b95f85e --- /dev/null +++ b/src/gpu-compute/exec_stage.cc @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: John Kalamatianos, Sooraj Puthoor + */ + +#include "gpu-compute/exec_stage.hh" + +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/wavefront.hh" + +ExecStage::ExecStage(const ComputeUnitParams *p) : numSIMDs(p->num_SIMDs), + numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes), + vectorAluInstAvail(nullptr), glbMemInstAvail(nullptr), + shrMemInstAvail(nullptr), lastTimeInstExecuted(false), + thisTimeInstExecuted(false), instrExecuted (false), + executionResourcesUsed(0) +{ + numTransActiveIdle = 0; + idle_dur = 0; +} + +void +ExecStage::init(ComputeUnit *cu) +{ + computeUnit = cu; + _name = computeUnit->name() + ".ExecStage"; + dispatchList = &computeUnit->dispatchList; + vectorAluInstAvail = &(computeUnit->vectorAluInstAvail); + glbMemInstAvail= &(computeUnit->glbMemInstAvail); + shrMemInstAvail= &(computeUnit->shrMemInstAvail); + idle_dur = 0; +} + +void +ExecStage::collectStatistics(enum STAT_STATUS stage, int unitId) { + if (stage == IdleExec) { + // count cycles of no vector ALU instruction executed + // even if one was the oldest in a WV of that vector SIMD unit + if (computeUnit->isVecAlu(unitId) && vectorAluInstAvail->at(unitId)) { + numCyclesWithNoInstrTypeIssued[unitId]++; + } + + // count cycles of no global memory (vector) instruction executed + // even if one was the oldest in a WV of that vector SIMD unit + if (computeUnit->isGlbMem(unitId) && *glbMemInstAvail > 0) { + numCyclesWithNoInstrTypeIssued[unitId]++; + (*glbMemInstAvail)--; + } + + // count cycles of no shared memory (vector) instruction executed + // even if one was the oldest in a WV of that vector SIMD unit + if (computeUnit->isShrMem(unitId) && *shrMemInstAvail > 0) { + numCyclesWithNoInstrTypeIssued[unitId]++; + (*shrMemInstAvail)--; + } + } else if (stage == BusyExec) { + // count the number of cycles an instruction to a specific unit + // was issued + numCyclesWithInstrTypeIssued[unitId]++; + thisTimeInstExecuted = true; + instrExecuted = true; + ++executionResourcesUsed; + } else if (stage == PostExec) { + // count the number of transitions from active to idle + if (lastTimeInstExecuted && !thisTimeInstExecuted) { + ++numTransActiveIdle; + } + + if (!lastTimeInstExecuted && thisTimeInstExecuted) { + idleDur.sample(idle_dur); + idle_dur = 0; + } else if (!thisTimeInstExecuted) { + idle_dur++; + } + + lastTimeInstExecuted = thisTimeInstExecuted; + // track the number of cycles we either issued one vector instruction + // or issued no instructions at all + if (instrExecuted) { + numCyclesWithInstrIssued++; + } else { + numCyclesWithNoIssue++; + } + + spc.sample(executionResourcesUsed); + } +} + +void +ExecStage::initStatistics() +{ + instrExecuted = false; + executionResourcesUsed = 0; + thisTimeInstExecuted = false; +} + +void +ExecStage::exec() +{ + initStatistics(); + + for (int unitId = 0; unitId < (numSIMDs + numMemUnits); ++unitId) { + // if dispatch list for this execution resource is empty, + // skip this execution resource this cycle + if (dispatchList->at(unitId).second == EMPTY) { + collectStatistics(IdleExec, unitId); + continue; + } + + collectStatistics(BusyExec, unitId); + // execute an instruction for the WF + dispatchList->at(unitId).first->exec(); + // clear the dispatch list entry + dispatchList->at(unitId).second = EMPTY; + dispatchList->at(unitId).first = (Wavefront*)nullptr; + } + + collectStatistics(PostExec, 0); +} + +void +ExecStage::regStats() +{ + numTransActiveIdle + .name(name() + ".num_transitions_active_to_idle") + .desc("number of CU 
transitions from active to idle") + ; + + numCyclesWithNoIssue + .name(name() + ".num_cycles_with_no_issue") + .desc("number of cycles the CU issues nothing") + ; + + numCyclesWithInstrIssued + .name(name() + ".num_cycles_with_instr_issued") + .desc("number of cycles the CU issued at least one instruction") + ; + + spc + .init(0, numSIMDs + numMemUnits, 1) + .name(name() + ".spc") + .desc("Execution units active per cycle (Exec unit=SIMD,MemPipe)") + ; + + idleDur + .init(0,75,5) + .name(name() + ".idle_duration_in_cycles") + .desc("duration of idle periods in cycles") + ; + + numCyclesWithInstrTypeIssued + .init(numSIMDs + numMemUnits) + .name(name() + ".num_cycles_with_instrtype_issue") + .desc("Number of cycles at least one instruction of specific type " + "issued") + ; + + numCyclesWithNoInstrTypeIssued + .init(numSIMDs + numMemUnits) + .name(name() + ".num_cycles_with_instr_type_no_issue") + .desc("Number of cycles no instruction of specific type issued") + ; + + for (int i = 0; i < numSIMDs; ++i) { + numCyclesWithInstrTypeIssued.subname(i, csprintf("ALU%d",i)); + numCyclesWithNoInstrTypeIssued.subname(i, csprintf("ALU%d",i)); + } + + numCyclesWithInstrTypeIssued.subname(numSIMDs, csprintf("GM")); + numCyclesWithNoInstrTypeIssued.subname(numSIMDs, csprintf("GM")); + numCyclesWithInstrTypeIssued.subname(numSIMDs + 1, csprintf("LM")); + numCyclesWithNoInstrTypeIssued.subname(numSIMDs + 1, csprintf("LM")); +} diff --git a/src/gpu-compute/exec_stage.hh b/src/gpu-compute/exec_stage.hh new file mode 100644 index 000000000..2de74366b --- /dev/null +++ b/src/gpu-compute/exec_stage.hh @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: John Kalamatianos, Sooraj Puthoor + */ + +#ifndef __EXEC_STAGE_HH__ +#define __EXEC_STAGE_HH__ + +#include <string> +#include <utility> +#include <vector> + +#include "sim/stats.hh" + +class ComputeUnit; +class Wavefront; +struct ComputeUnitParams; + +enum STAT_STATUS +{ + IdleExec, + BusyExec, + PostExec +}; + +enum DISPATCH_STATUS +{ + EMPTY = 0, + FILLED +}; + +// Execution stage. +// Each execution resource executes the +// wave which is in its dispatch list. +// The schedule stage is responsible for +// adding a wave into each execution resource's +// dispatch list. + +class ExecStage +{ + public: + ExecStage(const ComputeUnitParams* params); + ~ExecStage() { } + void init(ComputeUnit *cu); + void exec(); + + std::string name() { return _name; } + void regStats(); + // number of idle cycles + Stats::Scalar numCyclesWithNoIssue; + // number of busy cycles + Stats::Scalar numCyclesWithInstrIssued; + // number of cycles (per execution unit) during which at least one + // instruction was issued to that unit + Stats::Vector numCyclesWithInstrTypeIssued; + // number of idle cycles (per execution unit) during which the unit issued + // no instruction targeting that unit, even though there is at least one + // Wavefront with such an instruction as the oldest + Stats::Vector numCyclesWithNoInstrTypeIssued; + // SIMDs active per cycle + Stats::Distribution spc; + + private: + void collectStatistics(enum STAT_STATUS stage, int unitId); + void initStatistics(); + ComputeUnit *computeUnit; + uint32_t numSIMDs; + + // Number of memory execution resources; + // both global and local memory execution resources in CU + uint32_t numMemUnits; + + // List of waves which will be dispatched to + // each execution resource. A FILLED implies + // dispatch list is non-empty and + // execution unit has something to execute + // this cycle. Currently, the dispatch list of + // an execution resource can hold only one wave because + // an execution resource can execute only one wave in a cycle. + // dispatchList is used to communicate between schedule + // and exec stage + std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> *dispatchList; + // flag per vector SIMD unit that is set when there is at least one + // WV that has a vector ALU instruction as the oldest in its + // Instruction Buffer + std::vector<bool> *vectorAluInstAvail; + int *glbMemInstAvail; + int *shrMemInstAvail; + bool lastTimeInstExecuted; + bool thisTimeInstExecuted; + bool instrExecuted; + Stats::Scalar numTransActiveIdle; + Stats::Distribution idleDur; + uint32_t executionResourcesUsed; + uint64_t idle_dur; + std::string _name; +}; + +#endif // __EXEC_STAGE_HH__ diff --git a/src/gpu-compute/fetch_stage.cc b/src/gpu-compute/fetch_stage.cc new file mode 100644 index 000000000..1f5e6ded3 --- /dev/null +++ b/src/gpu-compute/fetch_stage.cc @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez, Sooraj Puthoor + */ + +#include "gpu-compute/fetch_stage.hh" + +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/wavefront.hh" + +FetchStage::FetchStage(const ComputeUnitParams* p) : numSIMDs(p->num_SIMDs), + computeUnit(nullptr) +{ + for (int j = 0; j < numSIMDs; ++j) { + FetchUnit newFetchUnit(p); + fetchUnit.push_back(newFetchUnit); + } +} + +FetchStage::~FetchStage() +{ + fetchUnit.clear(); +} + +void +FetchStage::init(ComputeUnit *cu) +{ + computeUnit = cu; + _name = computeUnit->name() + ".FetchStage"; + + for (int j = 0; j < numSIMDs; ++j) { + fetchUnit[j].bindWaveList(&computeUnit->wfList[j]); + fetchUnit[j].init(computeUnit); + } +} + +void +FetchStage::exec() +{ + for (int j = 0; j < numSIMDs; ++j) { + fetchUnit[j].exec(); + } +} + +void +FetchStage::processFetchReturn(PacketPtr pkt) +{ + ComputeUnit::SQCPort::SenderState *sender_state = + safe_cast<ComputeUnit::SQCPort::SenderState*>(pkt->senderState); + + Wavefront *wavefront = sender_state->wavefront; + + const unsigned num_instructions = pkt->req->getSize() / + sizeof(TheGpuISA::RawMachInst); + + instFetchInstReturned.sample(num_instructions); + uint32_t simdId = wavefront->simdId; + fetchUnit[simdId].processFetchReturn(pkt); +} + +void +FetchStage::fetch(PacketPtr pkt, Wavefront *wavefront) +{ + fetchUnit[wavefront->simdId].fetch(pkt, wavefront); +} + +void +FetchStage::regStats() +{ + instFetchInstReturned + .init(1, 32, 1) + .name(name() + ".inst_fetch_instr_returned") + .desc("For each instruction fetch request recieved record how many " + "instructions you got from it") + ; +} diff --git a/src/gpu-compute/fetch_stage.hh b/src/gpu-compute/fetch_stage.hh new file mode 100644 index 000000000..ce7faa8ac --- /dev/null +++ b/src/gpu-compute/fetch_stage.hh @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez, Sooraj Puthoor + */ + +#ifndef __FETCH_STAGE_HH__ +#define __FETCH_STAGE_HH__ + +#include <string> +#include <vector> + +#include "gpu-compute/fetch_unit.hh" + +// Instruction fetch stage. +// All dispatched wavefronts for all SIMDS are analyzed for the +// need to fetch instructions. From the fetch eligible waves, +// one wave is selected from each SIMD and fetch is initiated +// for the selected waves. + +class ComputeUnit; +class Wavefront; + +class FetchStage +{ + public: + FetchStage(const ComputeUnitParams* params); + ~FetchStage(); + void init(ComputeUnit *cu); + void exec(); + void processFetchReturn(PacketPtr pkt); + void fetch(PacketPtr pkt, Wavefront *wave); + + // Stats related variables and methods + std::string name() { return _name; } + void regStats(); + Stats::Distribution instFetchInstReturned; + + private: + uint32_t numSIMDs; + ComputeUnit *computeUnit; + + // List of fetch units. A fetch unit is + // instantiated per SIMD + std::vector<FetchUnit> fetchUnit; + std::string _name; +}; + +#endif // __FETCH_STAGE_HH__ diff --git a/src/gpu-compute/fetch_unit.cc b/src/gpu-compute/fetch_unit.cc new file mode 100644 index 000000000..1f0a7d78e --- /dev/null +++ b/src/gpu-compute/fetch_unit.cc @@ -0,0 +1,293 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Brad Beckmann, Sooraj Puthoor + */ + +#include "gpu-compute/fetch_unit.hh" + +#include "debug/GPUFetch.hh" +#include "debug/GPUPort.hh" +#include "debug/GPUTLB.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/gpu_static_inst.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/wavefront.hh" +#include "mem/ruby/system/RubySystem.hh" + +uint32_t FetchUnit::globalFetchUnitID; + +FetchUnit::FetchUnit(const ComputeUnitParams* params) : + timingSim(true), + computeUnit(nullptr), + fetchScheduler(params), + waveList(nullptr) +{ +} + +FetchUnit::~FetchUnit() +{ + fetchQueue.clear(); + fetchStatusQueue.clear(); +} + +void +FetchUnit::init(ComputeUnit *cu) +{ + computeUnit = cu; + timingSim = computeUnit->shader->timingSim; + fetchQueue.clear(); + fetchStatusQueue.resize(computeUnit->shader->n_wf); + + for (int j = 0; j < computeUnit->shader->n_wf; ++j) { + fetchStatusQueue[j] = std::make_pair(waveList->at(j), false); + } + + fetchScheduler.bindList(&fetchQueue); +} + +void +FetchUnit::exec() +{ + // re-evaluate waves which are marked as not ready for fetch + for (int j = 0; j < computeUnit->shader->n_wf; ++j) { + // Following code assumes 64-bit opertaion and all insts are + // represented by 64-bit pointers to inst objects. + Wavefront *curWave = fetchStatusQueue[j].first; + assert (curWave); + + // The wavefront has to be active, the IB occupancy has to be + // 4 or less instructions and it can not have any branches to + // prevent speculative instruction fetches + if (!fetchStatusQueue[j].second) { + if (curWave->status == Wavefront::S_RUNNING && + curWave->instructionBuffer.size() <= 4 && + !curWave->instructionBufferHasBranch() && + !curWave->pendingFetch) { + fetchQueue.push_back(curWave); + fetchStatusQueue[j].second = true; + } + } + } + + // Fetch only if there is some wave ready to be fetched + // An empty fetchQueue will cause the schedular to panic + if (fetchQueue.size()) { + Wavefront *waveToBeFetched = fetchScheduler.chooseWave(); + waveToBeFetched->pendingFetch = true; + fetchStatusQueue[waveToBeFetched->wfSlotId].second = false; + initiateFetch(waveToBeFetched); + } +} + +void +FetchUnit::initiateFetch(Wavefront *wavefront) +{ + // calculate the virtual address to fetch from the SQC + Addr vaddr = wavefront->pc() + wavefront->instructionBuffer.size(); + vaddr = wavefront->base_ptr + vaddr * sizeof(GPUStaticInst*); + + DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Initiating fetch translation: %#x\n", + computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, vaddr); + + // Since this is an instruction prefetch, if you're split then just finish + // out the current line. 
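+    // A worked example of the split handling below, assuming Ruby's default
+    // 64-byte block size: for vaddr = 0x1038, split_addr =
+    // roundDown(0x1038 + 63, 64) = 0x1040, which is greater than vaddr, so
+    // only the remaining 8 bytes of the current line are requested; a
+    // line-aligned vaddr requests a full block.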
+ unsigned block_size = RubySystem::getBlockSizeBytes(); + // check for split accesses + Addr split_addr = roundDown(vaddr + block_size - 1, block_size); + unsigned size = block_size; + + if (split_addr > vaddr) { + // misaligned access, just grab the rest of the line + size = split_addr - vaddr; + } + + // set up virtual request + Request *req = new Request(0, vaddr, size, Request::INST_FETCH, + computeUnit->masterId(), 0, 0, 0); + + PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + // This fetchBlock is kind of faux right now - because the translations so + // far don't actually return Data + uint64_t fetchBlock; + pkt->dataStatic(&fetchBlock); + + if (timingSim) { + // SenderState needed on Return + pkt->senderState = new ComputeUnit::ITLBPort::SenderState(wavefront); + + // Sender State needed by TLB hierarchy + pkt->senderState = + new TheISA::GpuTLB::TranslationState(BaseTLB::Execute, + computeUnit->shader->gpuTc, + false, pkt->senderState); + + if (computeUnit->sqcTLBPort->isStalled()) { + assert(computeUnit->sqcTLBPort->retries.size() > 0); + + DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n", + vaddr); + + computeUnit->sqcTLBPort->retries.push_back(pkt); + } else if (!computeUnit->sqcTLBPort->sendTimingReq(pkt)) { + // Stall the data port; + // No more packet is issued till + // ruby indicates resources are freed by + // a recvReqRetry() call back on this port. + computeUnit->sqcTLBPort->stallPort(); + + DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n", + vaddr); + + computeUnit->sqcTLBPort->retries.push_back(pkt); + } else { + DPRINTF(GPUTLB, "sent FETCH translation request for %#x\n", vaddr); + } + } else { + pkt->senderState = + new TheISA::GpuTLB::TranslationState(BaseTLB::Execute, + computeUnit->shader->gpuTc); + + computeUnit->sqcTLBPort->sendFunctional(pkt); + + TheISA::GpuTLB::TranslationState *sender_state = + safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState); + + delete sender_state->tlbEntry; + delete sender_state; + // fetch the instructions from the SQC when we operate in + // functional mode only + fetch(pkt, wavefront); + } +} + +void +FetchUnit::fetch(PacketPtr pkt, Wavefront *wavefront) +{ + assert(pkt->req->hasPaddr()); + assert(pkt->req->hasSize()); + + DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch Access: %#x\n", + computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, + pkt->req->getPaddr()); + + // this is necessary because the GPU TLB receives packets instead of + // requests. when the translation is complete, all relevent fields in the + // request will be populated, but not in the packet. here we create the + // new packet so we can set the size, addr, and proper flags. + PacketPtr oldPkt = pkt; + pkt = new Packet(oldPkt->req, oldPkt->cmd); + delete oldPkt; + + TheGpuISA::RawMachInst *data = + new TheGpuISA::RawMachInst[pkt->req->getSize() / + sizeof(TheGpuISA::RawMachInst)]; + + pkt->dataDynamic<TheGpuISA::RawMachInst>(data); + + // New SenderState for the memory access + pkt->senderState = new ComputeUnit::SQCPort::SenderState(wavefront); + + if (timingSim) { + // translation is done. Send the appropriate timing memory request. 
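+        // If the SQC port cannot accept the packet this cycle, it is parked
+        // on the port's retry list (together with its wavefront) to be
+        // resent when the port receives a retry.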
+ + if (!computeUnit->sqcPort->sendTimingReq(pkt)) { + computeUnit->sqcPort->retries.push_back(std::make_pair(pkt, + wavefront)); + + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x failed!\n", + computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, + pkt->req->getPaddr()); + } else { + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x sent!\n", + computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, + pkt->req->getPaddr()); + } + } else { + computeUnit->sqcPort->sendFunctional(pkt); + processFetchReturn(pkt); + } +} + +void +FetchUnit::processFetchReturn(PacketPtr pkt) +{ + ComputeUnit::SQCPort::SenderState *sender_state = + safe_cast<ComputeUnit::SQCPort::SenderState*>(pkt->senderState); + + Wavefront *wavefront = sender_state->wavefront; + + DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch addr %#x returned " + "%d bytes, %d instructions!\n", computeUnit->cu_id, + wavefront->simdId, wavefront->wfSlotId, pkt->req->getPaddr(), + pkt->req->getSize(), pkt->req->getSize() / + sizeof(TheGpuISA::RawMachInst)); + + if (wavefront->dropFetch) { + assert(wavefront->instructionBuffer.empty()); + wavefront->dropFetch = false; + } else { + TheGpuISA::RawMachInst *inst_index_ptr = + (TheGpuISA::RawMachInst*)pkt->getPtr<uint8_t>(); + + assert(wavefront->instructionBuffer.size() <= 4); + + for (int i = 0; i < pkt->req->getSize() / + sizeof(TheGpuISA::RawMachInst); ++i) { + GPUStaticInst *inst_ptr = decoder.decode(inst_index_ptr[i]); + + assert(inst_ptr); + DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: added %s\n", + computeUnit->cu_id, wavefront->simdId, + wavefront->wfSlotId, inst_ptr->disassemble()); + + GPUDynInstPtr gpuDynInst = + std::make_shared<GPUDynInst>(computeUnit, wavefront, inst_ptr, + computeUnit->getAndIncSeqNum()); + + wavefront->instructionBuffer.push_back(gpuDynInst); + } + } + + wavefront->pendingFetch = false; + + delete pkt->senderState; + delete pkt->req; + delete pkt; +} + +void +FetchUnit::bindWaveList(std::vector<Wavefront*> *wave_list) +{ + waveList = wave_list; +} diff --git a/src/gpu-compute/fetch_unit.hh b/src/gpu-compute/fetch_unit.hh new file mode 100644 index 000000000..c7c6afb3c --- /dev/null +++ b/src/gpu-compute/fetch_unit.hh @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Brad Beckmann, Sooraj Puthoor + */ + +#ifndef __FETCH_UNIT_HH__ +#define __FETCH_UNIT_HH__ + +#include <string> +#include <utility> +#include <vector> + +#include "arch/gpu_decoder.hh" +#include "base/statistics.hh" +#include "config/the_gpu_isa.hh" +#include "gpu-compute/scheduler.hh" +#include "mem/packet.hh" + +class ComputeUnit; +class Wavefront; + +class FetchUnit +{ + public: + FetchUnit(const ComputeUnitParams* params); + ~FetchUnit(); + void init(ComputeUnit *cu); + void exec(); + void bindWaveList(std::vector<Wavefront*> *list); + void initiateFetch(Wavefront *wavefront); + void fetch(PacketPtr pkt, Wavefront *wavefront); + void processFetchReturn(PacketPtr pkt); + static uint32_t globalFetchUnitID; + + private: + bool timingSim; + ComputeUnit *computeUnit; + TheGpuISA::Decoder decoder; + + // Fetch scheduler; Selects one wave from + // the fetch queue for instruction fetching. + // The selection is made according to + // a scheduling policy + Scheduler fetchScheduler; + + // Stores the list of waves that are + // ready to be fetched this cycle + std::vector<Wavefront*> fetchQueue; + + // Stores the fetch status of all waves dispatched to this SIMD. + // TRUE implies the wave is ready to fetch and is already + // moved to fetchQueue + std::vector<std::pair<Wavefront*, bool>> fetchStatusQueue; + + // Pointer to list of waves dispatched on to this SIMD unit + std::vector<Wavefront*> *waveList; +}; + +#endif // __FETCH_UNIT_HH__ diff --git a/src/gpu-compute/global_memory_pipeline.cc b/src/gpu-compute/global_memory_pipeline.cc new file mode 100644 index 000000000..913327412 --- /dev/null +++ b/src/gpu-compute/global_memory_pipeline.cc @@ -0,0 +1,242 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos, Sooraj Puthoor + */ + +#include "gpu-compute/global_memory_pipeline.hh" + +#include "debug/GPUMem.hh" +#include "debug/GPUReg.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/vector_register_file.hh" +#include "gpu-compute/wavefront.hh" + +GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams* p) : + computeUnit(nullptr), gmQueueSize(p->global_mem_queue_size), + inflightStores(0), inflightLoads(0) +{ +} + +void +GlobalMemPipeline::init(ComputeUnit *cu) +{ + computeUnit = cu; + globalMemSize = computeUnit->shader->globalMemSize; + _name = computeUnit->name() + ".GlobalMemPipeline"; +} + +void +GlobalMemPipeline::exec() +{ + // apply any returned global memory operations + GPUDynInstPtr m = !gmReturnedLoads.empty() ? gmReturnedLoads.front() : + !gmReturnedStores.empty() ? gmReturnedStores.front() : nullptr; + + bool accessVrf = true; + // check the VRF to see if the operands of a load (or load component + // of an atomic) are accessible + if ((m) && (m->m_op==Enums::MO_LD || MO_A(m->m_op))) { + Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId]; + + accessVrf = + w->computeUnit->vrf[m->simdId]-> + vrfOperandAccessReady(m->seqNum(), w, m, + VrfAccessType::WRITE); + } + + if ((!gmReturnedStores.empty() || !gmReturnedLoads.empty()) && + m->latency.rdy() && computeUnit->glbMemToVrfBus.rdy() && + accessVrf && m->statusBitVector == VectorMask(0) && + (computeUnit->shader->coissue_return || + computeUnit->wfWait.at(m->pipeId).rdy())) { + + if (m->v_type == VT_32 && m->m_type == Enums::M_U8) + doGmReturn<uint32_t, uint8_t>(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_U16) + doGmReturn<uint32_t, uint16_t>(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_U32) + doGmReturn<uint32_t, uint32_t>(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_S8) + doGmReturn<int32_t, int8_t>(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_S16) + doGmReturn<int32_t, int16_t>(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_S32) + doGmReturn<int32_t, int32_t>(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_F16) + doGmReturn<float, Float16>(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_F32) + doGmReturn<float, float>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_U8) + doGmReturn<uint64_t, uint8_t>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_U16) + doGmReturn<uint64_t, uint16_t>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_U32) + doGmReturn<uint64_t, uint32_t>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_U64) + doGmReturn<uint64_t, uint64_t>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_S8) + doGmReturn<int64_t, int8_t>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_S16) + doGmReturn<int64_t, int16_t>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_S32) + doGmReturn<int64_t, 
int32_t>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_S64) + doGmReturn<int64_t, int64_t>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_F16) + doGmReturn<double, Float16>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_F32) + doGmReturn<double, float>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_F64) + doGmReturn<double, double>(m); + } + + // If pipeline has executed a global memory instruction + // execute global memory packets and issue global + // memory packets to DTLB + if (!gmIssuedRequests.empty()) { + GPUDynInstPtr mp = gmIssuedRequests.front(); + if (mp->m_op == Enums::MO_LD || + (mp->m_op >= Enums::MO_AAND && mp->m_op <= Enums::MO_AMIN) || + (mp->m_op >= Enums::MO_ANRAND && mp->m_op <= Enums::MO_ANRMIN)) { + + if (inflightLoads >= gmQueueSize) { + return; + } else { + ++inflightLoads; + } + } else { + if (inflightStores >= gmQueueSize) { + return; + } else { + ++inflightStores; + } + } + + mp->initiateAcc(mp); + gmIssuedRequests.pop(); + + DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = %s\n", + computeUnit->cu_id, mp->simdId, mp->wfSlotId, + Enums::MemOpTypeStrings[mp->m_op]); + } +} + +template<typename c0, typename c1> +void +GlobalMemPipeline::doGmReturn(GPUDynInstPtr m) +{ + Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId]; + + // Return data to registers + if (m->m_op == Enums::MO_LD || MO_A(m->m_op) || MO_ANR(m->m_op)) { + gmReturnedLoads.pop(); + assert(inflightLoads > 0); + --inflightLoads; + + if (m->m_op == Enums::MO_LD || MO_A(m->m_op)) { + std::vector<uint32_t> regVec; + // iterate over number of destination register operands since + // this is a load or atomic operation + for (int k = 0; k < m->n_reg; ++k) { + assert((sizeof(c1) * m->n_reg) <= MAX_WIDTH_FOR_MEM_INST); + int dst = m->dst_reg + k; + + if (m->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST) + dst = m->dst_reg_vec[k]; + // virtual->physical VGPR mapping + int physVgpr = w->remap(dst, sizeof(c0), 1); + // save the physical VGPR index + regVec.push_back(physVgpr); + c1 *p1 = &((c1*)m->d_data)[k * VSZ]; + + for (int i = 0; i < VSZ; ++i) { + if (m->exec_mask[i]) { + DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: " + "$%s%d <- %d global ld done (src = wavefront " + "ld inst)\n", w->computeUnit->cu_id, w->simdId, + w->wfSlotId, i, sizeof(c0) == 4 ? "s" : "d", + dst, *p1); + // write the value into the physical VGPR. This is a + // purely functional operation. No timing is modeled. + w->computeUnit->vrf[w->simdId]->write<c0>(physVgpr, + *p1, i); + } + ++p1; + } + } + + // Schedule the write operation of the load data on the VRF. + // This simply models the timing aspect of the VRF write operation. + // It does not modify the physical VGPR. 
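+            // Any delay cycles reported by the VRF timing model for this
+            // write (e.g., from bank conflicts among the registers in
+            // regVec) are accumulated in the loadVrfBankConflictCycles
+            // statistic below.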
+ loadVrfBankConflictCycles += + w->computeUnit->vrf[w->simdId]->exec(m->seqNum(), + w, regVec, sizeof(c0), + m->time); + } + } else { + gmReturnedStores.pop(); + assert(inflightStores > 0); + --inflightStores; + } + + // Decrement outstanding register count + computeUnit->shader->ScheduleAdd(&w->outstanding_reqs, m->time, -1); + + if (m->m_op == Enums::MO_ST || MO_A(m->m_op) || MO_ANR(m->m_op) || + MO_H(m->m_op)) { + computeUnit->shader->ScheduleAdd(&w->outstanding_reqs_wr_gm, m->time, + -1); + } + + if (m->m_op == Enums::MO_LD || MO_A(m->m_op) || MO_ANR(m->m_op)) { + computeUnit->shader->ScheduleAdd(&w->outstanding_reqs_rd_gm, m->time, + -1); + } + + // Mark write bus busy for appropriate amount of time + computeUnit->glbMemToVrfBus.set(m->time); + if (!computeUnit->shader->coissue_return) + w->computeUnit->wfWait.at(m->pipeId).set(m->time); +} + +void +GlobalMemPipeline::regStats() +{ + loadVrfBankConflictCycles + .name(name() + ".load_vrf_bank_conflict_cycles") + .desc("total number of cycles GM data are delayed before updating " + "the VRF") + ; +} diff --git a/src/gpu-compute/global_memory_pipeline.hh b/src/gpu-compute/global_memory_pipeline.hh new file mode 100644 index 000000000..ed49f6f6b --- /dev/null +++ b/src/gpu-compute/global_memory_pipeline.hh @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos, Sooraj Puthoor + */ + +#ifndef __GLOBAL_MEMORY_PIPELINE_HH__ +#define __GLOBAL_MEMORY_PIPELINE_HH__ + +#include <queue> +#include <string> + +#include "gpu-compute/misc.hh" +#include "params/ComputeUnit.hh" +#include "sim/stats.hh" + +/* + * @file global_memory_pipeline.hh + * + * The global memory pipeline issues newly created global memory packets + * from the pipeline to DTLB. The exec() method of the memory packet issues + * the packet to the DTLB if there is space available in the return fifo. 
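+ * (The available space is bounded by the global_mem_queue_size parameter,
+ * which caps the issue queue and both return FIFOs; see the
+ * isGM*FIFOWrRdy() helpers below.)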
+ * This stage also retires previously issued loads and stores that have + * returned from the memory sub-system. + */ + +class ComputeUnit; + +class GlobalMemPipeline +{ + public: + GlobalMemPipeline(const ComputeUnitParams *params); + void init(ComputeUnit *cu); + void exec(); + + template<typename c0, typename c1> void doGmReturn(GPUDynInstPtr m); + + std::queue<GPUDynInstPtr> &getGMReqFIFO() { return gmIssuedRequests; } + std::queue<GPUDynInstPtr> &getGMStRespFIFO() { return gmReturnedStores; } + std::queue<GPUDynInstPtr> &getGMLdRespFIFO() { return gmReturnedLoads; } + + bool + isGMLdRespFIFOWrRdy() const + { + return gmReturnedLoads.size() < gmQueueSize; + } + + bool + isGMStRespFIFOWrRdy() const + { + return gmReturnedStores.size() < gmQueueSize; + } + + bool + isGMReqFIFOWrRdy(uint32_t pendReqs=0) const + { + return (gmIssuedRequests.size() + pendReqs) < gmQueueSize; + } + + const std::string &name() const { return _name; } + void regStats(); + + private: + ComputeUnit *computeUnit; + std::string _name; + int gmQueueSize; + + // number of cycles of delaying the update of a VGPR that is the + // target of a load instruction (or the load component of an atomic) + // The delay is due to VRF bank conflicts + Stats::Scalar loadVrfBankConflictCycles; + // Counters to track the inflight loads and stores + // so that we can provide the proper backpressure + // on the number of inflight memory operations. + int inflightStores; + int inflightLoads; + + // The size of global memory. + int globalMemSize; + + // Global Memory Request FIFO: all global memory requests + // are issued to this FIFO from the memory pipelines + std::queue<GPUDynInstPtr> gmIssuedRequests; + + // Globa Store Response FIFO: all responses of global memory + // stores are sent to this FIFO from TCP + std::queue<GPUDynInstPtr> gmReturnedStores; + + // Global Load Response FIFO: all responses of global memory + // loads are sent to this FIFO from TCP + std::queue<GPUDynInstPtr> gmReturnedLoads; +}; + +#endif // __GLOBAL_MEMORY_PIPELINE_HH__ diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc new file mode 100644 index 000000000..83e348dbe --- /dev/null +++ b/src/gpu-compute/gpu_dyn_inst.cc @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#include "gpu-compute/gpu_dyn_inst.hh" + +#include "debug/GPUMem.hh" +#include "gpu-compute/gpu_static_inst.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/wavefront.hh" + +GPUDynInst::GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, + GPUStaticInst *_staticInst, uint64_t instSeqNum) + : GPUExecContext(_cu, _wf), m_op(Enums::MO_UNDEF), + memoryOrder(Enums::MEMORY_ORDER_NONE), useContinuation(false), + statusBitVector(0), staticInst(_staticInst), _seqNum(instSeqNum) +{ + tlbHitLevel.assign(VSZ, -1); +} + +void +GPUDynInst::execute() +{ + GPUDynInstPtr gpuDynInst = std::make_shared<GPUDynInst>(cu, wf, staticInst, + _seqNum); + staticInst->execute(gpuDynInst); +} + +int +GPUDynInst::numSrcRegOperands() +{ + return staticInst->numSrcRegOperands(); +} + +int +GPUDynInst::numDstRegOperands() +{ + return staticInst->numDstRegOperands(); +} + +int +GPUDynInst::getNumOperands() +{ + return staticInst->getNumOperands(); +} + +bool +GPUDynInst::isVectorRegister(int operandIdx) +{ + return staticInst->isVectorRegister(operandIdx); +} + +bool +GPUDynInst::isScalarRegister(int operandIdx) +{ + return staticInst->isVectorRegister(operandIdx); +} + +int +GPUDynInst::getRegisterIndex(int operandIdx) +{ + return staticInst->getRegisterIndex(operandIdx); +} + +int +GPUDynInst::getOperandSize(int operandIdx) +{ + return staticInst->getOperandSize(operandIdx); +} + +bool +GPUDynInst::isDstOperand(int operandIdx) +{ + return staticInst->isDstOperand(operandIdx); +} + +bool +GPUDynInst::isSrcOperand(int operandIdx) +{ + return staticInst->isSrcOperand(operandIdx); +} + +bool +GPUDynInst::isArgLoad() +{ + return staticInst->isArgLoad(); +} + +const std::string& +GPUDynInst::disassemble() const +{ + return staticInst->disassemble(); +} + +uint64_t +GPUDynInst::seqNum() const +{ + return _seqNum; +} + +Enums::OpType +GPUDynInst::opType() +{ + return staticInst->o_type; +} + +Enums::StorageClassType +GPUDynInst::executedAs() +{ + return staticInst->executed_as; +} + +// Process a memory instruction and (if necessary) submit timing request +void +GPUDynInst::initiateAcc(GPUDynInstPtr gpuDynInst) +{ + DPRINTF(GPUMem, "CU%d: WF[%d][%d]: mempacket status bitvector=%#x\n", + cu->cu_id, simdId, wfSlotId, exec_mask); + + staticInst->initiateAcc(gpuDynInst); + time = 0; +} + +bool +GPUDynInst::scalarOp() const +{ + return staticInst->scalarOp(); +} + +void +GPUDynInst::updateStats() +{ + if (staticInst->isLocalMem()) { + // access to LDS (shared) memory + cu->dynamicLMemInstrCnt++; + } else { + // access to global memory + + // update PageDivergence histogram + int number_pages_touched = cu->pagesTouched.size(); + assert(number_pages_touched); + cu->pageDivergenceDist.sample(number_pages_touched); + + std::pair<ComputeUnit::pageDataStruct::iterator, bool> ret; + + for (auto it : cu->pagesTouched) { + // see if this page has been touched before. if not, this also + // inserts the page into the table. 
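+ // (insert() returns a std::pair<iterator, bool>; ret.second is
+ // false when the page was already present, in which case the
+ // existing counters are updated below.)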
+ ret = cu->pageAccesses + .insert(ComputeUnit::pageDataStruct::value_type(it.first, + std::make_pair(1, it.second))); + + // if yes, then update the stats + if (!ret.second) { + ret.first->second.first++; + ret.first->second.second += it.second; + } + } + + cu->pagesTouched.clear(); + + // total number of memory instructions (dynamic) + // Atomics are counted as a single memory instruction. + // this is # memory instructions per wavefronts, not per workitem + cu->dynamicGMemInstrCnt++; + } +} diff --git a/src/gpu-compute/gpu_dyn_inst.hh b/src/gpu-compute/gpu_dyn_inst.hh new file mode 100644 index 000000000..e44d8f80d --- /dev/null +++ b/src/gpu-compute/gpu_dyn_inst.hh @@ -0,0 +1,464 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
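Editor's note: GPUDynInst::updateStats() above tracks per-page data with the map insert-or-update idiom noted in its inline comments. A standalone sketch of the same idiom follows; the container alias and field meanings are illustrative, not gem5 code.

#include <cstdio>
#include <cstdint>
#include <unordered_map>
#include <utility>

// page -> (access count, lanes touched); the pair mirrors how
// updateStats() uses pageAccesses, but this alias is illustrative only.
using PageData = std::unordered_map<uint64_t, std::pair<int, int>>;

void
recordPageTouch(PageData &pageAccesses, uint64_t page, int lanes)
{
    // try to insert a fresh (1, lanes) entry for this page
    auto ret = pageAccesses.insert({page, {1, lanes}});

    // if the page was already present, bump the existing counters instead
    if (!ret.second) {
        ret.first->second.first++;
        ret.first->second.second += lanes;
    }
}

int
main()
{
    PageData pages;
    recordPageTouch(pages, 0x1000, 16);
    recordPageTouch(pages, 0x1000, 8);
    std::printf("accesses=%d lanes=%d\n",
                pages[0x1000].first, pages[0x1000].second);
    return 0;
}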
+ * + * Author: Anthony Gutierrez + */ + +#ifndef __GPU_DYN_INST_HH__ +#define __GPU_DYN_INST_HH__ + +#include <cstdint> +#include <string> + +#include "enums/GenericMemoryOrder.hh" +#include "enums/GenericMemoryScope.hh" +#include "enums/MemOpType.hh" +#include "enums/MemType.hh" +#include "enums/OpType.hh" +#include "enums/StorageClassType.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_exec_context.hh" + +class GPUStaticInst; + +template<typename T> +class AtomicOpAnd : public TypedAtomicOpFunctor<T> +{ + public: + T a; + + AtomicOpAnd(T _a) : a(_a) { } + void execute(T *b) { *b &= a; } +}; + +template<typename T> +class AtomicOpOr : public TypedAtomicOpFunctor<T> +{ + public: + T a; + AtomicOpOr(T _a) : a(_a) { } + void execute(T *b) { *b |= a; } +}; + +template<typename T> +class AtomicOpXor : public TypedAtomicOpFunctor<T> +{ + public: + T a; + AtomicOpXor(T _a) : a(_a) {} + void execute(T *b) { *b ^= a; } +}; + +template<typename T> +class AtomicOpCAS : public TypedAtomicOpFunctor<T> +{ + public: + T c; + T s; + + ComputeUnit *computeUnit; + + AtomicOpCAS(T _c, T _s, ComputeUnit *compute_unit) + : c(_c), s(_s), computeUnit(compute_unit) { } + + void + execute(T *b) + { + computeUnit->numCASOps++; + + if (*b == c) { + *b = s; + } else { + computeUnit->numFailedCASOps++; + } + + if (computeUnit->xact_cas_mode) { + computeUnit->xactCasLoadMap.clear(); + } + } +}; + +template<typename T> +class AtomicOpExch : public TypedAtomicOpFunctor<T> +{ + public: + T a; + AtomicOpExch(T _a) : a(_a) { } + void execute(T *b) { *b = a; } +}; + +template<typename T> +class AtomicOpAdd : public TypedAtomicOpFunctor<T> +{ + public: + T a; + AtomicOpAdd(T _a) : a(_a) { } + void execute(T *b) { *b += a; } +}; + +template<typename T> +class AtomicOpSub : public TypedAtomicOpFunctor<T> +{ + public: + T a; + AtomicOpSub(T _a) : a(_a) { } + void execute(T *b) { *b -= a; } +}; + +template<typename T> +class AtomicOpInc : public TypedAtomicOpFunctor<T> +{ + public: + AtomicOpInc() { } + void execute(T *b) { *b += 1; } +}; + +template<typename T> +class AtomicOpDec : public TypedAtomicOpFunctor<T> +{ + public: + AtomicOpDec() {} + void execute(T *b) { *b -= 1; } +}; + +template<typename T> +class AtomicOpMax : public TypedAtomicOpFunctor<T> +{ + public: + T a; + AtomicOpMax(T _a) : a(_a) { } + + void + execute(T *b) + { + if (a > *b) + *b = a; + } +}; + +template<typename T> +class AtomicOpMin : public TypedAtomicOpFunctor<T> +{ + public: + T a; + AtomicOpMin(T _a) : a(_a) {} + + void + execute(T *b) + { + if (a < *b) + *b = a; + } +}; + +#define MO_A(a) ((a)>=Enums::MO_AAND && (a)<=Enums::MO_AMIN) +#define MO_ANR(a) ((a)>=Enums::MO_ANRAND && (a)<=Enums::MO_ANRMIN) +#define MO_H(a) ((a)>=Enums::MO_HAND && (a)<=Enums::MO_HMIN) + +typedef enum +{ + VT_32, + VT_64, +} vgpr_type; + +typedef enum +{ + SEG_PRIVATE, + SEG_SPILL, + SEG_GLOBAL, + SEG_SHARED, + SEG_READONLY, + SEG_FLAT +} seg_type; + +class GPUDynInst : public GPUExecContext +{ + public: + GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *_staticInst, + uint64_t instSeqNum); + + void execute(); + int numSrcRegOperands(); + int numDstRegOperands(); + int getNumOperands(); + bool isVectorRegister(int operandIdx); + bool isScalarRegister(int operandIdx); + int getRegisterIndex(int operandIdx); + int getOperandSize(int operandIdx); + bool isDstOperand(int operandIdx); + bool isSrcOperand(int operandIdx); + bool isArgLoad(); + + const std::string &disassemble() const; + + uint64_t seqNum() const; + + Enums::OpType opType(); + 
Enums::StorageClassType executedAs(); + + // The address of the memory operation + Addr addr[VSZ]; + Addr pAddr; + + // The data to get written + uint8_t d_data[VSZ * 16]; + // Additional data (for atomics) + uint8_t a_data[VSZ * 8]; + // Additional data (for atomics) + uint8_t x_data[VSZ * 8]; + // The execution mask + VectorMask exec_mask; + + // The memory type (M_U32, M_S32, ...) + Enums::MemType m_type; + // The memory operation (MO_LD, MO_ST, ...) + Enums::MemOpType m_op; + Enums::GenericMemoryOrder memoryOrder; + + // Scope of the request + Enums::GenericMemoryScope scope; + // The memory segment (SEG_SHARED, SEG_GLOBAL, ...) + seg_type s_type; + // The equivalency class + int equiv; + // The return VGPR type (VT_32 or VT_64) + vgpr_type v_type; + // Number of VGPR's accessed (1, 2, or 4) + int n_reg; + // The return VGPR index + int dst_reg; + // There can be max 4 dest regs> + int dst_reg_vec[4]; + // SIMD where the WF of the memory instruction has been mapped to + int simdId; + // unique id of the WF where the memory instruction belongs to + int wfDynId; + // The kernel id of the requesting wf + int kern_id; + // The CU id of the requesting wf + int cu_id; + // HW slot id where the WF is mapped to inside a SIMD unit + int wfSlotId; + // execution pipeline id where the memory instruction has been scheduled + int pipeId; + // The execution time of this operation + Tick time; + // The latency of this operation + WaitClass latency; + // A list of bank conflicts for the 4 cycles. + uint32_t bc[4]; + + // A pointer to ROM + uint8_t *rom; + // The size of the READONLY segment + int sz_rom; + + // Initiate the specified memory operation, by creating a + // memory request and sending it off to the memory system. + void initiateAcc(GPUDynInstPtr gpuDynInst); + + void updateStats(); + + GPUStaticInst* staticInstruction() { return staticInst; } + + // Is the instruction a scalar or vector op? + bool scalarOp() const; + + /* + * Loads/stores/atomics may have acquire/release semantics associated + * withthem. Some protocols want to see the acquire/release as separate + * requests from the load/store/atomic. We implement that separation + * using continuations (i.e., a function pointer with an object associated + * with it). When, for example, the front-end generates a store with + * release semantics, we will first issue a normal store and set the + * continuation in the GPUDynInst to a function that generate a + * release request. That continuation will be called when the normal + * store completes (in ComputeUnit::DataPort::recvTimingResponse). The + * continuation will be called in the context of the same GPUDynInst + * that generated the initial store. 
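+ * Illustrative sequence (a sketch of the description above, not code
+ * from this patch): the front-end marks the store by setting
+ * useContinuation = true and storing a callable in execContinuation;
+ * the store is then issued normally; when its response returns,
+ * ComputeUnit::DataPort::recvTimingResponse calls the continuation,
+ * passing the static instruction and this same GPUDynInst, and the
+ * continuation in turn issues the release request.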
+ */ + std::function<void(GPUStaticInst*, GPUDynInstPtr)> execContinuation; + + // when true, call execContinuation when response arrives + bool useContinuation; + + template<typename c0> AtomicOpFunctor* + makeAtomicOpFunctor(c0 *reg0, c0 *reg1, Enums::MemOpType op) + { + using namespace Enums; + + switch(op) { + case MO_AAND: + case MO_ANRAND: + return new AtomicOpAnd<c0>(*reg0); + case MO_AOR: + case MO_ANROR: + return new AtomicOpOr<c0>(*reg0); + case MO_AXOR: + case MO_ANRXOR: + return new AtomicOpXor<c0>(*reg0); + case MO_ACAS: + case MO_ANRCAS: + return new AtomicOpCAS<c0>(*reg0, *reg1, cu); + case MO_AEXCH: + case MO_ANREXCH: + return new AtomicOpExch<c0>(*reg0); + case MO_AADD: + case MO_ANRADD: + return new AtomicOpAdd<c0>(*reg0); + case MO_ASUB: + case MO_ANRSUB: + return new AtomicOpSub<c0>(*reg0); + case MO_AINC: + case MO_ANRINC: + return new AtomicOpInc<c0>(); + case MO_ADEC: + case MO_ANRDEC: + return new AtomicOpDec<c0>(); + case MO_AMAX: + case MO_ANRMAX: + return new AtomicOpMax<c0>(*reg0); + case MO_AMIN: + case MO_ANRMIN: + return new AtomicOpMin<c0>(*reg0); + default: + panic("Unrecognized atomic operation"); + } + } + + void + setRequestFlags(Request *req, bool setMemOrder=true) + { + // currently these are the easy scopes to deduce + switch (s_type) { + case SEG_PRIVATE: + req->setMemSpaceConfigFlags(Request::PRIVATE_SEGMENT); + break; + case SEG_SPILL: + req->setMemSpaceConfigFlags(Request::SPILL_SEGMENT); + break; + case SEG_GLOBAL: + req->setMemSpaceConfigFlags(Request::GLOBAL_SEGMENT); + break; + case SEG_READONLY: + req->setMemSpaceConfigFlags(Request::READONLY_SEGMENT); + break; + case SEG_SHARED: + req->setMemSpaceConfigFlags(Request::GROUP_SEGMENT); + break; + case SEG_FLAT: + // TODO: translate to correct scope + assert(false); + default: + panic("Bad segment type"); + break; + } + + switch (scope) { + case Enums::MEMORY_SCOPE_NONE: + case Enums::MEMORY_SCOPE_WORKITEM: + break; + case Enums::MEMORY_SCOPE_WAVEFRONT: + req->setMemSpaceConfigFlags(Request::SCOPE_VALID | + Request::WAVEFRONT_SCOPE); + break; + case Enums::MEMORY_SCOPE_WORKGROUP: + req->setMemSpaceConfigFlags(Request::SCOPE_VALID | + Request::WORKGROUP_SCOPE); + break; + case Enums::MEMORY_SCOPE_DEVICE: + req->setMemSpaceConfigFlags(Request::SCOPE_VALID | + Request::DEVICE_SCOPE); + break; + case Enums::MEMORY_SCOPE_SYSTEM: + req->setMemSpaceConfigFlags(Request::SCOPE_VALID | + Request::SYSTEM_SCOPE); + break; + default: + panic("Bad scope type"); + break; + } + + if (setMemOrder) { + // set acquire and release flags + switch (memoryOrder){ + case Enums::MEMORY_ORDER_SC_ACQUIRE: + req->setFlags(Request::ACQUIRE); + break; + case Enums::MEMORY_ORDER_SC_RELEASE: + req->setFlags(Request::RELEASE); + break; + case Enums::MEMORY_ORDER_SC_ACQUIRE_RELEASE: + req->setFlags(Request::ACQUIRE | Request::RELEASE); + break; + default: + break; + } + } + + // set atomic type + // currently, the instruction genenerator only produces atomic return + // but a magic instruction can produce atomic no return + if (m_op == Enums::MO_AADD || m_op == Enums::MO_ASUB || + m_op == Enums::MO_AAND || m_op == Enums::MO_AOR || + m_op == Enums::MO_AXOR || m_op == Enums::MO_AMAX || + m_op == Enums::MO_AMIN || m_op == Enums::MO_AINC || + m_op == Enums::MO_ADEC || m_op == Enums::MO_AEXCH || + m_op == Enums::MO_ACAS) { + req->setFlags(Request::ATOMIC_RETURN_OP); + } else if (m_op == Enums::MO_ANRADD || m_op == Enums::MO_ANRSUB || + m_op == Enums::MO_ANRAND || m_op == Enums::MO_ANROR || + m_op == Enums::MO_ANRXOR || m_op == 
Enums::MO_ANRMAX || + m_op == Enums::MO_ANRMIN || m_op == Enums::MO_ANRINC || + m_op == Enums::MO_ANRDEC || m_op == Enums::MO_ANREXCH || + m_op == Enums::MO_ANRCAS) { + req->setFlags(Request::ATOMIC_NO_RETURN_OP); + } + } + + // Map returned packets and the addresses they satisfy with which lane they + // were requested from + typedef std::unordered_map<Addr, std::vector<int>> StatusVector; + StatusVector memStatusVector; + + // Track the status of memory requests per lane, a bit per lane + VectorMask statusBitVector; + // for ld_v# or st_v# + std::vector<int> statusVector; + std::vector<int> tlbHitLevel; + + private: + GPUStaticInst *staticInst; + uint64_t _seqNum; +}; + +#endif // __GPU_DYN_INST_HH__ diff --git a/src/gpu-compute/gpu_exec_context.cc b/src/gpu-compute/gpu_exec_context.cc new file mode 100644 index 000000000..4af69c41e --- /dev/null +++ b/src/gpu-compute/gpu_exec_context.cc @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#include "gpu-compute/gpu_exec_context.hh" + +GPUExecContext::GPUExecContext(ComputeUnit *_cu, Wavefront *_wf) + : cu(_cu), wf(_wf) +{ +} + +ComputeUnit* +GPUExecContext::computeUnit() +{ + return cu; +} + +Wavefront* +GPUExecContext::wavefront() +{ + return wf; +} diff --git a/src/gpu-compute/gpu_exec_context.hh b/src/gpu-compute/gpu_exec_context.hh new file mode 100644 index 000000000..a3deb9b8f --- /dev/null +++ b/src/gpu-compute/gpu_exec_context.hh @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#ifndef __GPU_EXEC_CONTEXT_HH__ +#define __GPU_EXEC_CONTEXT_HH__ + +class ComputeUnit; +class Wavefront; + +class GPUExecContext +{ + public: + GPUExecContext(ComputeUnit *_cu, Wavefront *_wf); + Wavefront* wavefront(); + ComputeUnit* computeUnit(); + + protected: + ComputeUnit *cu; + Wavefront *wf; +}; + +#endif // __GPU_EXEC_CONTEXT_HH__ diff --git a/src/gpu-compute/gpu_static_inst.cc b/src/gpu-compute/gpu_static_inst.cc new file mode 100644 index 000000000..bcb8a5f3d --- /dev/null +++ b/src/gpu-compute/gpu_static_inst.cc @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
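Editor's note: the AtomicOp* classes in gpu_dyn_inst.hh above all follow one pattern: capture the operand(s) in the constructor, then apply the operation in place to the memory word handed to execute(), with makeAtomicOpFunctor() selecting the functor from the memory-op enum. A self-contained sketch of that pattern follows; the interface and names below are stand-ins, not gem5's TypedAtomicOpFunctor.

#include <cassert>
#include <cstdint>
#include <memory>

// Stand-in for the TypedAtomicOpFunctor<T> interface; only a sketch.
template<typename T>
struct AtomicOpSketch
{
    virtual ~AtomicOpSketch() = default;
    virtual void execute(T *mem) = 0;   // apply the operation in place
};

// Mirrors AtomicOpAdd<T>: capture the operand, add it on execute().
template<typename T>
struct AddOp : AtomicOpSketch<T>
{
    T a;
    explicit AddOp(T _a) : a(_a) { }
    void execute(T *mem) override { *mem += a; }
};

// Mirrors the shape of makeAtomicOpFunctor(): pick a functor by op kind.
enum class Op { Add };

template<typename T>
std::unique_ptr<AtomicOpSketch<T>>
makeOp(Op op, T operand)
{
    switch (op) {
      case Op::Add:
        return std::make_unique<AddOp<T>>(operand);
    }
    return nullptr;
}

int
main()
{
    uint32_t word = 40;
    auto op = makeOp<uint32_t>(Op::Add, 2);
    op->execute(&word);     // word becomes 42
    assert(word == 42);
    return 0;
}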
+ * + * Author: Anthony Gutierrez + */ + +#include "gpu-compute/gpu_static_inst.hh" + +GPUStaticInst::GPUStaticInst(const std::string &opcode) + : o_type(Enums::OT_ALU), executed_as(Enums::SC_NONE), opcode(opcode), + _instNum(0), _scalarOp(false) +{ +} diff --git a/src/gpu-compute/gpu_static_inst.hh b/src/gpu-compute/gpu_static_inst.hh new file mode 100644 index 000000000..c1de28427 --- /dev/null +++ b/src/gpu-compute/gpu_static_inst.hh @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#ifndef __GPU_STATIC_INST_HH__ +#define __GPU_STATIC_INST_HH__ + +/* + * @file gpu_static_inst.hh + * + * Defines the base class representing static instructions for the GPU. The + * instructions are "static" because they contain no dynamic instruction + * information. GPUStaticInst corresponds to the StaticInst class for the CPU + * models. 
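+ * Each dynamic instance (GPUDynInst, see gpu_dyn_inst.hh/.cc above)
+ * keeps a pointer to one of these objects and forwards execute(),
+ * initiateAcc() and the operand queries to it.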
+ */ + +#include <cstdint> +#include <string> + +#include "enums/OpType.hh" +#include "enums/StorageClassType.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/misc.hh" + +class BaseOperand; +class BaseRegOperand; +class Wavefront; + +class GPUStaticInst +{ + public: + GPUStaticInst(const std::string &opcode); + + void instNum(int num) { _instNum = num; } + + int instNum() { return _instNum; } + + void ipdInstNum(int num) { _ipdInstNum = num; } + + int ipdInstNum() const { return _ipdInstNum; } + + virtual void execute(GPUDynInstPtr gpuDynInst) = 0; + virtual void generateDisassembly() = 0; + virtual const std::string &disassemble() = 0; + virtual int getNumOperands() = 0; + virtual bool isCondRegister(int operandIndex) = 0; + virtual bool isScalarRegister(int operandIndex) = 0; + virtual bool isVectorRegister(int operandIndex) = 0; + virtual bool isSrcOperand(int operandIndex) = 0; + virtual bool isDstOperand(int operandIndex) = 0; + virtual int getOperandSize(int operandIndex) = 0; + virtual int getRegisterIndex(int operandIndex) = 0; + virtual int numDstRegOperands() = 0; + virtual int numSrcRegOperands() = 0; + + /* + * Most instructions (including all HSAIL instructions) + * are vector ops, so _scalarOp will be false by default. + * Derived instruction objects that are scalar ops must + * set _scalarOp to true in their constructors. + */ + bool scalarOp() const { return _scalarOp; } + + virtual bool isLocalMem() const + { + fatal("calling isLocalMem() on non-memory instruction.\n"); + + return false; + } + + bool isArgLoad() { return false; } + virtual uint32_t instSize() = 0; + + // only used for memory instructions + virtual void + initiateAcc(GPUDynInstPtr gpuDynInst) + { + fatal("calling initiateAcc() on a non-memory instruction.\n"); + } + + virtual uint32_t getTargetPc() { return 0; } + + /** + * Query whether the instruction is an unconditional jump i.e., the jump + * is always executed because there is no condition to be evaluated. + * + * If the instruction is not of branch type, the result is always false. + * + * @return True if the instruction is an unconditional jump. + */ + virtual bool unconditionalJumpInstruction() { return false; } + + static uint64_t dynamic_id_count; + + Enums::OpType o_type; + // For flat memory accesses + Enums::StorageClassType executed_as; + + protected: + virtual void + execLdAcq(GPUDynInstPtr gpuDynInst) + { + fatal("calling execLdAcq() on a non-load instruction.\n"); + } + + virtual void + execSt(GPUDynInstPtr gpuDynInst) + { + fatal("calling execLdAcq() on a non-load instruction.\n"); + } + + virtual void + execAtomic(GPUDynInstPtr gpuDynInst) + { + fatal("calling execAtomic() on a non-atomic instruction.\n"); + } + + virtual void + execAtomicAcq(GPUDynInstPtr gpuDynInst) + { + fatal("calling execAtomicAcq() on a non-atomic instruction.\n"); + } + + const std::string opcode; + std::string disassembly; + int _instNum; + /** + * Identifier of the immediate post-dominator instruction. + */ + int _ipdInstNum; + + bool _scalarOp; +}; + +#endif // __GPU_STATIC_INST_HH__ diff --git a/src/gpu-compute/gpu_tlb.cc b/src/gpu-compute/gpu_tlb.cc new file mode 100644 index 000000000..de005fd04 --- /dev/null +++ b/src/gpu-compute/gpu_tlb.cc @@ -0,0 +1,1801 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Lisa Hsu + */ + +#include "gpu-compute/gpu_tlb.hh" + +#include <cmath> +#include <cstring> + +#include "arch/x86/faults.hh" +#include "arch/x86/insts/microldstop.hh" +#include "arch/x86/pagetable.hh" +#include "arch/x86/pagetable_walker.hh" +#include "arch/x86/regs/misc.hh" +#include "arch/x86/x86_traits.hh" +#include "base/bitfield.hh" +#include "base/output.hh" +#include "base/trace.hh" +#include "cpu/base.hh" +#include "cpu/thread_context.hh" +#include "debug/GPUPrefetch.hh" +#include "debug/GPUTLB.hh" +#include "mem/packet_access.hh" +#include "mem/page_table.hh" +#include "mem/request.hh" +#include "sim/process.hh" + +namespace X86ISA +{ + + GpuTLB::GpuTLB(const Params *p) + : MemObject(p), configAddress(0), size(p->size), + cleanupEvent(this, false, Event::Maximum_Pri), exitEvent(this) + { + assoc = p->assoc; + assert(assoc <= size); + numSets = size/assoc; + allocationPolicy = p->allocationPolicy; + hasMemSidePort = false; + accessDistance = p->accessDistance; + clock = p->clk_domain->clockPeriod(); + + tlb = new GpuTlbEntry[size]; + std::memset(tlb, 0, sizeof(GpuTlbEntry) * size); + + freeList.resize(numSets); + entryList.resize(numSets); + + for (int set = 0; set < numSets; ++set) { + for (int way = 0; way < assoc; ++way) { + int x = set*assoc + way; + freeList[set].push_back(&tlb[x]); + } + } + + FA = (size == assoc); + + /** + * @warning: the set-associative version assumes you have a + * fixed page size of 4KB. 
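+ * (For example, with size = 64 entries and assoc = 4, numSets is 16,
+ * setMask is 0xf, and a 4KB-page virtual address indexes set
+ * (vaddr >> 12) & 0xf; see lookupIt() below.)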
+ * If the page size is greather than 4KB (as defined in the + * TheISA::PageBytes), then there are various issues w/ the current + * implementation (you'd have the same 8KB page being replicated in + * different sets etc) + */ + setMask = numSets - 1; + + #if 0 + // GpuTLB doesn't yet support full system + walker = p->walker; + walker->setTLB(this); + #endif + + maxCoalescedReqs = p->maxOutstandingReqs; + + // Do not allow maxCoalescedReqs to be more than the TLB associativity + if (maxCoalescedReqs > assoc) { + maxCoalescedReqs = assoc; + cprintf("Forcing maxCoalescedReqs to %d (TLB assoc.) \n", assoc); + } + + outstandingReqs = 0; + hitLatency = p->hitLatency; + missLatency1 = p->missLatency1; + missLatency2 = p->missLatency2; + + // create the slave ports based on the number of connected ports + for (size_t i = 0; i < p->port_slave_connection_count; ++i) { + cpuSidePort.push_back(new CpuSidePort(csprintf("%s-port%d", + name(), i), this, i)); + } + + // create the master ports based on the number of connected ports + for (size_t i = 0; i < p->port_master_connection_count; ++i) { + memSidePort.push_back(new MemSidePort(csprintf("%s-port%d", + name(), i), this, i)); + } + } + + // fixme: this is never called? + GpuTLB::~GpuTLB() + { + // make sure all the hash-maps are empty + assert(translationReturnEvent.empty()); + + // delete the TLB + delete[] tlb; + } + + BaseSlavePort& + GpuTLB::getSlavePort(const std::string &if_name, PortID idx) + { + if (if_name == "slave") { + if (idx >= static_cast<PortID>(cpuSidePort.size())) { + panic("TLBCoalescer::getSlavePort: unknown index %d\n", idx); + } + + return *cpuSidePort[idx]; + } else { + panic("TLBCoalescer::getSlavePort: unknown port %s\n", if_name); + } + } + + BaseMasterPort& + GpuTLB::getMasterPort(const std::string &if_name, PortID idx) + { + if (if_name == "master") { + if (idx >= static_cast<PortID>(memSidePort.size())) { + panic("TLBCoalescer::getMasterPort: unknown index %d\n", idx); + } + + hasMemSidePort = true; + + return *memSidePort[idx]; + } else { + panic("TLBCoalescer::getMasterPort: unknown port %s\n", if_name); + } + } + + GpuTlbEntry* + GpuTLB::insert(Addr vpn, GpuTlbEntry &entry) + { + GpuTlbEntry *newEntry = nullptr; + + /** + * vpn holds the virtual page address + * The least significant bits are simply masked + */ + int set = (vpn >> TheISA::PageShift) & setMask; + + if (!freeList[set].empty()) { + newEntry = freeList[set].front(); + freeList[set].pop_front(); + } else { + newEntry = entryList[set].back(); + entryList[set].pop_back(); + } + + *newEntry = entry; + newEntry->vaddr = vpn; + entryList[set].push_front(newEntry); + + return newEntry; + } + + GpuTLB::EntryList::iterator + GpuTLB::lookupIt(Addr va, bool update_lru) + { + int set = (va >> TheISA::PageShift) & setMask; + + if (FA) { + assert(!set); + } + + auto entry = entryList[set].begin(); + for (; entry != entryList[set].end(); ++entry) { + int page_size = (*entry)->size(); + + if ((*entry)->vaddr <= va && (*entry)->vaddr + page_size > va) { + DPRINTF(GPUTLB, "Matched vaddr %#x to entry starting at %#x " + "with size %#x.\n", va, (*entry)->vaddr, page_size); + + if (update_lru) { + entryList[set].push_front(*entry); + entryList[set].erase(entry); + entry = entryList[set].begin(); + } + + break; + } + } + + return entry; + } + + GpuTlbEntry* + GpuTLB::lookup(Addr va, bool update_lru) + { + int set = (va >> TheISA::PageShift) & setMask; + + auto entry = lookupIt(va, update_lru); + + if (entry == entryList[set].end()) + return nullptr; + else + return *entry; + 
} + + void + GpuTLB::invalidateAll() + { + DPRINTF(GPUTLB, "Invalidating all entries.\n"); + + for (int i = 0; i < numSets; ++i) { + while (!entryList[i].empty()) { + GpuTlbEntry *entry = entryList[i].front(); + entryList[i].pop_front(); + freeList[i].push_back(entry); + } + } + } + + void + GpuTLB::setConfigAddress(uint32_t addr) + { + configAddress = addr; + } + + void + GpuTLB::invalidateNonGlobal() + { + DPRINTF(GPUTLB, "Invalidating all non global entries.\n"); + + for (int i = 0; i < numSets; ++i) { + for (auto entryIt = entryList[i].begin(); + entryIt != entryList[i].end();) { + if (!(*entryIt)->global) { + freeList[i].push_back(*entryIt); + entryList[i].erase(entryIt++); + } else { + ++entryIt; + } + } + } + } + + void + GpuTLB::demapPage(Addr va, uint64_t asn) + { + + int set = (va >> TheISA::PageShift) & setMask; + auto entry = lookupIt(va, false); + + if (entry != entryList[set].end()) { + freeList[set].push_back(*entry); + entryList[set].erase(entry); + } + } + + Fault + GpuTLB::translateInt(RequestPtr req, ThreadContext *tc) + { + DPRINTF(GPUTLB, "Addresses references internal memory.\n"); + Addr vaddr = req->getVaddr(); + Addr prefix = (vaddr >> 3) & IntAddrPrefixMask; + + if (prefix == IntAddrPrefixCPUID) { + panic("CPUID memory space not yet implemented!\n"); + } else if (prefix == IntAddrPrefixMSR) { + vaddr = vaddr >> 3; + req->setFlags(Request::MMAPPED_IPR); + Addr regNum = 0; + + switch (vaddr & ~IntAddrPrefixMask) { + case 0x10: + regNum = MISCREG_TSC; + break; + case 0x1B: + regNum = MISCREG_APIC_BASE; + break; + case 0xFE: + regNum = MISCREG_MTRRCAP; + break; + case 0x174: + regNum = MISCREG_SYSENTER_CS; + break; + case 0x175: + regNum = MISCREG_SYSENTER_ESP; + break; + case 0x176: + regNum = MISCREG_SYSENTER_EIP; + break; + case 0x179: + regNum = MISCREG_MCG_CAP; + break; + case 0x17A: + regNum = MISCREG_MCG_STATUS; + break; + case 0x17B: + regNum = MISCREG_MCG_CTL; + break; + case 0x1D9: + regNum = MISCREG_DEBUG_CTL_MSR; + break; + case 0x1DB: + regNum = MISCREG_LAST_BRANCH_FROM_IP; + break; + case 0x1DC: + regNum = MISCREG_LAST_BRANCH_TO_IP; + break; + case 0x1DD: + regNum = MISCREG_LAST_EXCEPTION_FROM_IP; + break; + case 0x1DE: + regNum = MISCREG_LAST_EXCEPTION_TO_IP; + break; + case 0x200: + regNum = MISCREG_MTRR_PHYS_BASE_0; + break; + case 0x201: + regNum = MISCREG_MTRR_PHYS_MASK_0; + break; + case 0x202: + regNum = MISCREG_MTRR_PHYS_BASE_1; + break; + case 0x203: + regNum = MISCREG_MTRR_PHYS_MASK_1; + break; + case 0x204: + regNum = MISCREG_MTRR_PHYS_BASE_2; + break; + case 0x205: + regNum = MISCREG_MTRR_PHYS_MASK_2; + break; + case 0x206: + regNum = MISCREG_MTRR_PHYS_BASE_3; + break; + case 0x207: + regNum = MISCREG_MTRR_PHYS_MASK_3; + break; + case 0x208: + regNum = MISCREG_MTRR_PHYS_BASE_4; + break; + case 0x209: + regNum = MISCREG_MTRR_PHYS_MASK_4; + break; + case 0x20A: + regNum = MISCREG_MTRR_PHYS_BASE_5; + break; + case 0x20B: + regNum = MISCREG_MTRR_PHYS_MASK_5; + break; + case 0x20C: + regNum = MISCREG_MTRR_PHYS_BASE_6; + break; + case 0x20D: + regNum = MISCREG_MTRR_PHYS_MASK_6; + break; + case 0x20E: + regNum = MISCREG_MTRR_PHYS_BASE_7; + break; + case 0x20F: + regNum = MISCREG_MTRR_PHYS_MASK_7; + break; + case 0x250: + regNum = MISCREG_MTRR_FIX_64K_00000; + break; + case 0x258: + regNum = MISCREG_MTRR_FIX_16K_80000; + break; + case 0x259: + regNum = MISCREG_MTRR_FIX_16K_A0000; + break; + case 0x268: + regNum = MISCREG_MTRR_FIX_4K_C0000; + break; + case 0x269: + regNum = MISCREG_MTRR_FIX_4K_C8000; + break; + case 0x26A: + regNum = 
MISCREG_MTRR_FIX_4K_D0000; + break; + case 0x26B: + regNum = MISCREG_MTRR_FIX_4K_D8000; + break; + case 0x26C: + regNum = MISCREG_MTRR_FIX_4K_E0000; + break; + case 0x26D: + regNum = MISCREG_MTRR_FIX_4K_E8000; + break; + case 0x26E: + regNum = MISCREG_MTRR_FIX_4K_F0000; + break; + case 0x26F: + regNum = MISCREG_MTRR_FIX_4K_F8000; + break; + case 0x277: + regNum = MISCREG_PAT; + break; + case 0x2FF: + regNum = MISCREG_DEF_TYPE; + break; + case 0x400: + regNum = MISCREG_MC0_CTL; + break; + case 0x404: + regNum = MISCREG_MC1_CTL; + break; + case 0x408: + regNum = MISCREG_MC2_CTL; + break; + case 0x40C: + regNum = MISCREG_MC3_CTL; + break; + case 0x410: + regNum = MISCREG_MC4_CTL; + break; + case 0x414: + regNum = MISCREG_MC5_CTL; + break; + case 0x418: + regNum = MISCREG_MC6_CTL; + break; + case 0x41C: + regNum = MISCREG_MC7_CTL; + break; + case 0x401: + regNum = MISCREG_MC0_STATUS; + break; + case 0x405: + regNum = MISCREG_MC1_STATUS; + break; + case 0x409: + regNum = MISCREG_MC2_STATUS; + break; + case 0x40D: + regNum = MISCREG_MC3_STATUS; + break; + case 0x411: + regNum = MISCREG_MC4_STATUS; + break; + case 0x415: + regNum = MISCREG_MC5_STATUS; + break; + case 0x419: + regNum = MISCREG_MC6_STATUS; + break; + case 0x41D: + regNum = MISCREG_MC7_STATUS; + break; + case 0x402: + regNum = MISCREG_MC0_ADDR; + break; + case 0x406: + regNum = MISCREG_MC1_ADDR; + break; + case 0x40A: + regNum = MISCREG_MC2_ADDR; + break; + case 0x40E: + regNum = MISCREG_MC3_ADDR; + break; + case 0x412: + regNum = MISCREG_MC4_ADDR; + break; + case 0x416: + regNum = MISCREG_MC5_ADDR; + break; + case 0x41A: + regNum = MISCREG_MC6_ADDR; + break; + case 0x41E: + regNum = MISCREG_MC7_ADDR; + break; + case 0x403: + regNum = MISCREG_MC0_MISC; + break; + case 0x407: + regNum = MISCREG_MC1_MISC; + break; + case 0x40B: + regNum = MISCREG_MC2_MISC; + break; + case 0x40F: + regNum = MISCREG_MC3_MISC; + break; + case 0x413: + regNum = MISCREG_MC4_MISC; + break; + case 0x417: + regNum = MISCREG_MC5_MISC; + break; + case 0x41B: + regNum = MISCREG_MC6_MISC; + break; + case 0x41F: + regNum = MISCREG_MC7_MISC; + break; + case 0xC0000080: + regNum = MISCREG_EFER; + break; + case 0xC0000081: + regNum = MISCREG_STAR; + break; + case 0xC0000082: + regNum = MISCREG_LSTAR; + break; + case 0xC0000083: + regNum = MISCREG_CSTAR; + break; + case 0xC0000084: + regNum = MISCREG_SF_MASK; + break; + case 0xC0000100: + regNum = MISCREG_FS_BASE; + break; + case 0xC0000101: + regNum = MISCREG_GS_BASE; + break; + case 0xC0000102: + regNum = MISCREG_KERNEL_GS_BASE; + break; + case 0xC0000103: + regNum = MISCREG_TSC_AUX; + break; + case 0xC0010000: + regNum = MISCREG_PERF_EVT_SEL0; + break; + case 0xC0010001: + regNum = MISCREG_PERF_EVT_SEL1; + break; + case 0xC0010002: + regNum = MISCREG_PERF_EVT_SEL2; + break; + case 0xC0010003: + regNum = MISCREG_PERF_EVT_SEL3; + break; + case 0xC0010004: + regNum = MISCREG_PERF_EVT_CTR0; + break; + case 0xC0010005: + regNum = MISCREG_PERF_EVT_CTR1; + break; + case 0xC0010006: + regNum = MISCREG_PERF_EVT_CTR2; + break; + case 0xC0010007: + regNum = MISCREG_PERF_EVT_CTR3; + break; + case 0xC0010010: + regNum = MISCREG_SYSCFG; + break; + case 0xC0010016: + regNum = MISCREG_IORR_BASE0; + break; + case 0xC0010017: + regNum = MISCREG_IORR_BASE1; + break; + case 0xC0010018: + regNum = MISCREG_IORR_MASK0; + break; + case 0xC0010019: + regNum = MISCREG_IORR_MASK1; + break; + case 0xC001001A: + regNum = MISCREG_TOP_MEM; + break; + case 0xC001001D: + regNum = MISCREG_TOP_MEM2; + break; + case 0xC0010114: + regNum = 
MISCREG_VM_CR; + break; + case 0xC0010115: + regNum = MISCREG_IGNNE; + break; + case 0xC0010116: + regNum = MISCREG_SMM_CTL; + break; + case 0xC0010117: + regNum = MISCREG_VM_HSAVE_PA; + break; + default: + return std::make_shared<GeneralProtection>(0); + } + //The index is multiplied by the size of a MiscReg so that + //any memory dependence calculations will not see these as + //overlapping. + req->setPaddr(regNum * sizeof(MiscReg)); + return NoFault; + } else if (prefix == IntAddrPrefixIO) { + // TODO If CPL > IOPL or in virtual mode, check the I/O permission + // bitmap in the TSS. + + Addr IOPort = vaddr & ~IntAddrPrefixMask; + // Make sure the address fits in the expected 16 bit IO address + // space. + assert(!(IOPort & ~0xFFFF)); + + if (IOPort == 0xCF8 && req->getSize() == 4) { + req->setFlags(Request::MMAPPED_IPR); + req->setPaddr(MISCREG_PCI_CONFIG_ADDRESS * sizeof(MiscReg)); + } else if ((IOPort & ~mask(2)) == 0xCFC) { + req->setFlags(Request::UNCACHEABLE); + + Addr configAddress = + tc->readMiscRegNoEffect(MISCREG_PCI_CONFIG_ADDRESS); + + if (bits(configAddress, 31, 31)) { + req->setPaddr(PhysAddrPrefixPciConfig | + mbits(configAddress, 30, 2) | + (IOPort & mask(2))); + } else { + req->setPaddr(PhysAddrPrefixIO | IOPort); + } + } else { + req->setFlags(Request::UNCACHEABLE); + req->setPaddr(PhysAddrPrefixIO | IOPort); + } + return NoFault; + } else { + panic("Access to unrecognized internal address space %#x.\n", + prefix); + } + } + + /** + * TLB_lookup will only perform a TLB lookup returning true on a TLB hit + * and false on a TLB miss. + * Many of the checks about different modes have been converted to + * assertions, since these parts of the code are not really used. + * On a hit it will update the LRU stack. + */ + bool + GpuTLB::tlbLookup(RequestPtr req, ThreadContext *tc, bool update_stats) + { + bool tlb_hit = false; + #ifndef NDEBUG + uint32_t flags = req->getFlags(); + int seg = flags & SegmentFlagMask; + #endif + + assert(seg != SEGMENT_REG_MS); + Addr vaddr = req->getVaddr(); + DPRINTF(GPUTLB, "TLB Lookup for vaddr %#x.\n", vaddr); + HandyM5Reg m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG); + + if (m5Reg.prot) { + DPRINTF(GPUTLB, "In protected mode.\n"); + // make sure we are in 64-bit mode + assert(m5Reg.mode == LongMode); + + // If paging is enabled, do the translation. + if (m5Reg.paging) { + DPRINTF(GPUTLB, "Paging enabled.\n"); + //update LRU stack on a hit + GpuTlbEntry *entry = lookup(vaddr, true); + + if (entry) + tlb_hit = true; + + if (!update_stats) { + // functional tlb access for memory initialization + // i.e., memory seeding or instr. seeding -> don't update + // TLB and stats + return tlb_hit; + } + + localNumTLBAccesses++; + + if (!entry) { + localNumTLBMisses++; + } else { + localNumTLBHits++; + } + } + } + + return tlb_hit; + } + + Fault + GpuTLB::translate(RequestPtr req, ThreadContext *tc, + Translation *translation, Mode mode, + bool &delayedResponse, bool timing, int &latency) + { + uint32_t flags = req->getFlags(); + int seg = flags & SegmentFlagMask; + bool storeCheck = flags & (StoreCheck << FlagShift); + + // If this is true, we're dealing with a request + // to a non-memory address space. + if (seg == SEGMENT_REG_MS) { + return translateInt(req, tc); + } + + delayedResponse = false; + Addr vaddr = req->getVaddr(); + DPRINTF(GPUTLB, "Translating vaddr %#x.\n", vaddr); + + HandyM5Reg m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG); + + // If protected mode has been enabled... 
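+ // Broadly, three outcomes follow for the physical address: protected
+ // mode with paging enabled goes through the TLB (and the page table
+ // on a miss); protected mode with paging disabled and real mode both
+ // use the segmented virtual address as the physical address.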
+ if (m5Reg.prot) { + DPRINTF(GPUTLB, "In protected mode.\n"); + // If we're not in 64-bit mode, do protection/limit checks + if (m5Reg.mode != LongMode) { + DPRINTF(GPUTLB, "Not in long mode. Checking segment " + "protection.\n"); + + // Check for a null segment selector. + if (!(seg == SEGMENT_REG_TSG || seg == SYS_SEGMENT_REG_IDTR || + seg == SEGMENT_REG_HS || seg == SEGMENT_REG_LS) + && !tc->readMiscRegNoEffect(MISCREG_SEG_SEL(seg))) { + return std::make_shared<GeneralProtection>(0); + } + + bool expandDown = false; + SegAttr attr = tc->readMiscRegNoEffect(MISCREG_SEG_ATTR(seg)); + + if (seg >= SEGMENT_REG_ES && seg <= SEGMENT_REG_HS) { + if (!attr.writable && (mode == BaseTLB::Write || + storeCheck)) + return std::make_shared<GeneralProtection>(0); + + if (!attr.readable && mode == BaseTLB::Read) + return std::make_shared<GeneralProtection>(0); + + expandDown = attr.expandDown; + + } + + Addr base = tc->readMiscRegNoEffect(MISCREG_SEG_BASE(seg)); + Addr limit = tc->readMiscRegNoEffect(MISCREG_SEG_LIMIT(seg)); + // This assumes we're not in 64 bit mode. If we were, the + // default address size is 64 bits, overridable to 32. + int size = 32; + bool sizeOverride = (flags & (AddrSizeFlagBit << FlagShift)); + SegAttr csAttr = tc->readMiscRegNoEffect(MISCREG_CS_ATTR); + + if ((csAttr.defaultSize && sizeOverride) || + (!csAttr.defaultSize && !sizeOverride)) { + size = 16; + } + + Addr offset = bits(vaddr - base, size - 1, 0); + Addr endOffset = offset + req->getSize() - 1; + + if (expandDown) { + DPRINTF(GPUTLB, "Checking an expand down segment.\n"); + warn_once("Expand down segments are untested.\n"); + + if (offset <= limit || endOffset <= limit) + return std::make_shared<GeneralProtection>(0); + } else { + if (offset > limit || endOffset > limit) + return std::make_shared<GeneralProtection>(0); + } + } + + // If paging is enabled, do the translation. + if (m5Reg.paging) { + DPRINTF(GPUTLB, "Paging enabled.\n"); + // The vaddr already has the segment base applied. + GpuTlbEntry *entry = lookup(vaddr); + localNumTLBAccesses++; + + if (!entry) { + localNumTLBMisses++; + if (timing) { + latency = missLatency1; + } + + if (FullSystem) { + fatal("GpuTLB doesn't support full-system mode\n"); + } else { + DPRINTF(GPUTLB, "Handling a TLB miss for address %#x " + "at pc %#x.\n", vaddr, tc->instAddr()); + + Process *p = tc->getProcessPtr(); + GpuTlbEntry newEntry; + bool success = p->pTable->lookup(vaddr, newEntry); + + if (!success && mode != BaseTLB::Execute) { + // penalize a "page fault" more + if (timing) { + latency += missLatency2; + } + + if (p->fixupStackFault(vaddr)) + success = p->pTable->lookup(vaddr, newEntry); + } + + if (!success) { + return std::make_shared<PageFault>(vaddr, true, + mode, true, + false); + } else { + newEntry.valid = success; + Addr alignedVaddr = p->pTable->pageAlign(vaddr); + + DPRINTF(GPUTLB, "Mapping %#x to %#x\n", + alignedVaddr, newEntry.pageStart()); + + entry = insert(alignedVaddr, newEntry); + } + + DPRINTF(GPUTLB, "Miss was serviced.\n"); + } + } else { + localNumTLBHits++; + + if (timing) { + latency = hitLatency; + } + } + + // Do paging protection checks. + bool inUser = (m5Reg.cpl == 3 && + !(flags & (CPL0FlagBit << FlagShift))); + + CR0 cr0 = tc->readMiscRegNoEffect(MISCREG_CR0); + bool badWrite = (!entry->writable && (inUser || cr0.wp)); + + if ((inUser && !entry->user) || (mode == BaseTLB::Write && + badWrite)) { + // The page must have been present to get into the TLB in + // the first place. 
We'll assume the reserved bits are + // fine even though we're not checking them. + return std::make_shared<PageFault>(vaddr, true, mode, + inUser, false); + } + + if (storeCheck && badWrite) { + // This would fault if this were a write, so return a page + // fault that reflects that happening. + return std::make_shared<PageFault>(vaddr, true, + BaseTLB::Write, + inUser, false); + } + + + DPRINTF(GPUTLB, "Entry found with paddr %#x, doing protection " + "checks.\n", entry->paddr); + + int page_size = entry->size(); + Addr paddr = entry->paddr | (vaddr & (page_size - 1)); + DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, paddr); + req->setPaddr(paddr); + + if (entry->uncacheable) + req->setFlags(Request::UNCACHEABLE); + } else { + //Use the address which already has segmentation applied. + DPRINTF(GPUTLB, "Paging disabled.\n"); + DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, vaddr); + req->setPaddr(vaddr); + } + } else { + // Real mode + DPRINTF(GPUTLB, "In real mode.\n"); + DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, vaddr); + req->setPaddr(vaddr); + } + + // Check for an access to the local APIC + if (FullSystem) { + LocalApicBase localApicBase = + tc->readMiscRegNoEffect(MISCREG_APIC_BASE); + + Addr baseAddr = localApicBase.base * PageBytes; + Addr paddr = req->getPaddr(); + + if (baseAddr <= paddr && baseAddr + PageBytes > paddr) { + // Force the access to be uncacheable. + req->setFlags(Request::UNCACHEABLE); + req->setPaddr(x86LocalAPICAddress(tc->contextId(), + paddr - baseAddr)); + } + } + + return NoFault; + }; + + Fault + GpuTLB::translateAtomic(RequestPtr req, ThreadContext *tc, Mode mode, + int &latency) + { + bool delayedResponse; + + return GpuTLB::translate(req, tc, nullptr, mode, delayedResponse, false, + latency); + } + + void + GpuTLB::translateTiming(RequestPtr req, ThreadContext *tc, + Translation *translation, Mode mode, int &latency) + { + bool delayedResponse; + assert(translation); + + Fault fault = GpuTLB::translate(req, tc, translation, mode, + delayedResponse, true, latency); + + if (!delayedResponse) + translation->finish(fault, req, tc, mode); + } + + Walker* + GpuTLB::getWalker() + { + return walker; + } + + + void + GpuTLB::serialize(CheckpointOut &cp) const + { + } + + void + GpuTLB::unserialize(CheckpointIn &cp) + { + } + + void + GpuTLB::regStats() + { + localNumTLBAccesses + .name(name() + ".local_TLB_accesses") + .desc("Number of TLB accesses") + ; + + localNumTLBHits + .name(name() + ".local_TLB_hits") + .desc("Number of TLB hits") + ; + + localNumTLBMisses + .name(name() + ".local_TLB_misses") + .desc("Number of TLB misses") + ; + + localTLBMissRate + .name(name() + ".local_TLB_miss_rate") + .desc("TLB miss rate") + ; + + accessCycles + .name(name() + ".access_cycles") + .desc("Cycles spent accessing this TLB level") + ; + + pageTableCycles + .name(name() + ".page_table_cycles") + .desc("Cycles spent accessing the page table") + ; + + localTLBMissRate = 100 * localNumTLBMisses / localNumTLBAccesses; + + numUniquePages + .name(name() + ".unique_pages") + .desc("Number of unique pages touched") + ; + + localCycles + .name(name() + ".local_cycles") + .desc("Number of cycles spent in queue for all incoming reqs") + ; + + localLatency + .name(name() + ".local_latency") + .desc("Avg. 
latency over incoming coalesced reqs") + ; + + localLatency = localCycles / localNumTLBAccesses; + + globalNumTLBAccesses + .name(name() + ".global_TLB_accesses") + .desc("Number of TLB accesses") + ; + + globalNumTLBHits + .name(name() + ".global_TLB_hits") + .desc("Number of TLB hits") + ; + + globalNumTLBMisses + .name(name() + ".global_TLB_misses") + .desc("Number of TLB misses") + ; + + globalTLBMissRate + .name(name() + ".global_TLB_miss_rate") + .desc("TLB miss rate") + ; + + globalTLBMissRate = 100 * globalNumTLBMisses / globalNumTLBAccesses; + + avgReuseDistance + .name(name() + ".avg_reuse_distance") + .desc("avg. reuse distance over all pages (in ticks)") + ; + + } + + /** + * Do the TLB lookup for this coalesced request and schedule + * another event <TLB access latency> cycles later. + */ + + void + GpuTLB::issueTLBLookup(PacketPtr pkt) + { + assert(pkt); + assert(pkt->senderState); + + Addr virt_page_addr = roundDown(pkt->req->getVaddr(), + TheISA::PageBytes); + + TranslationState *sender_state = + safe_cast<TranslationState*>(pkt->senderState); + + bool update_stats = !sender_state->prefetch; + ThreadContext * tmp_tc = sender_state->tc; + + DPRINTF(GPUTLB, "Translation req. for virt. page addr %#x\n", + virt_page_addr); + + int req_cnt = sender_state->reqCnt.back(); + + if (update_stats) { + accessCycles -= (curTick() * req_cnt); + localCycles -= curTick(); + updatePageFootprint(virt_page_addr); + globalNumTLBAccesses += req_cnt; + } + + tlbOutcome lookup_outcome = TLB_MISS; + RequestPtr tmp_req = pkt->req; + + // Access the TLB and figure out if it's a hit or a miss. + bool success = tlbLookup(tmp_req, tmp_tc, update_stats); + + if (success) { + lookup_outcome = TLB_HIT; + // Put the entry in SenderState + GpuTlbEntry *entry = lookup(tmp_req->getVaddr(), false); + assert(entry); + + sender_state->tlbEntry = + new GpuTlbEntry(0, entry->vaddr, entry->paddr, entry->valid); + + if (update_stats) { + // the reqCnt has an entry per level, so its size tells us + // which level we are in + sender_state->hitLevel = sender_state->reqCnt.size(); + globalNumTLBHits += req_cnt; + } + } else { + if (update_stats) + globalNumTLBMisses += req_cnt; + } + + /* + * We now know the TLB lookup outcome (if it's a hit or a miss), as well + * as the TLB access latency. + * + * We create and schedule a new TLBEvent which will help us take the + * appropriate actions (e.g., update TLB on a hit, send request to lower + * level TLB on a miss, or start a page walk if this was the last-level + * TLB) + */ + TLBEvent *tlb_event = + new TLBEvent(this, virt_page_addr, lookup_outcome, pkt); + + if (translationReturnEvent.count(virt_page_addr)) { + panic("Virtual Page Address %#x already has a return event\n", + virt_page_addr); + } + + translationReturnEvent[virt_page_addr] = tlb_event; + assert(tlb_event); + + DPRINTF(GPUTLB, "schedule translationReturnEvent @ curTick %d\n", + curTick() + this->ticks(hitLatency)); + + schedule(tlb_event, curTick() + this->ticks(hitLatency)); + } + + GpuTLB::TLBEvent::TLBEvent(GpuTLB* _tlb, Addr _addr, tlbOutcome tlb_outcome, + PacketPtr _pkt) + : Event(CPU_Tick_Pri), tlb(_tlb), virtPageAddr(_addr), + outcome(tlb_outcome), pkt(_pkt) + { + } + + /** + * Do Paging protection checks. If we encounter a page fault, then + * an assertion is fired. 
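+ * In practice this means a user-mode access to a supervisor page, or
+ * a write (or store-check) to a non-writable page when the access is
+ * from user mode or CR0.WP is set, stops simulation here instead of
+ * raising an architectural page fault.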
+ */ + void + GpuTLB::pagingProtectionChecks(ThreadContext *tc, PacketPtr pkt, + GpuTlbEntry * tlb_entry, Mode mode) + { + HandyM5Reg m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG); + uint32_t flags = pkt->req->getFlags(); + bool storeCheck = flags & (StoreCheck << FlagShift); + + // Do paging protection checks. + bool inUser = (m5Reg.cpl == 3 && !(flags & (CPL0FlagBit << FlagShift))); + CR0 cr0 = tc->readMiscRegNoEffect(MISCREG_CR0); + + bool badWrite = (!tlb_entry->writable && (inUser || cr0.wp)); + + if ((inUser && !tlb_entry->user) || + (mode == BaseTLB::Write && badWrite)) { + // The page must have been present to get into the TLB in + // the first place. We'll assume the reserved bits are + // fine even though we're not checking them. + assert(false); + } + + if (storeCheck && badWrite) { + // This would fault if this were a write, so return a page + // fault that reflects that happening. + assert(false); + } + } + + /** + * handleTranslationReturn is called on a TLB hit, + * when a TLB miss returns or when a page fault returns. + * The latter calls handelHit with TLB miss as tlbOutcome. + */ + void + GpuTLB::handleTranslationReturn(Addr virt_page_addr, tlbOutcome tlb_outcome, + PacketPtr pkt) + { + + assert(pkt); + Addr vaddr = pkt->req->getVaddr(); + + TranslationState *sender_state = + safe_cast<TranslationState*>(pkt->senderState); + + ThreadContext *tc = sender_state->tc; + Mode mode = sender_state->tlbMode; + + GpuTlbEntry *local_entry, *new_entry; + + if (tlb_outcome == TLB_HIT) { + DPRINTF(GPUTLB, "Translation Done - TLB Hit for addr %#x\n", vaddr); + local_entry = sender_state->tlbEntry; + } else { + DPRINTF(GPUTLB, "Translation Done - TLB Miss for addr %#x\n", + vaddr); + + // We are returning either from a page walk or from a hit at a lower + // TLB level. The senderState should be "carrying" a pointer to the + // correct TLBEntry. + new_entry = sender_state->tlbEntry; + assert(new_entry); + local_entry = new_entry; + + if (allocationPolicy) { + DPRINTF(GPUTLB, "allocating entry w/ addr %#x\n", + virt_page_addr); + + local_entry = insert(virt_page_addr, *new_entry); + } + + assert(local_entry); + } + + /** + * At this point the packet carries an up-to-date tlbEntry pointer + * in its senderState. + * Next step is to do the paging protection checks. + */ + DPRINTF(GPUTLB, "Entry found with vaddr %#x, doing protection checks " + "while paddr was %#x.\n", local_entry->vaddr, + local_entry->paddr); + + pagingProtectionChecks(tc, pkt, local_entry, mode); + int page_size = local_entry->size(); + Addr paddr = local_entry->paddr | (vaddr & (page_size - 1)); + DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, paddr); + + // Since this packet will be sent through the cpu side slave port, + // it must be converted to a response pkt if it is not one already + if (pkt->isRequest()) { + pkt->makeTimingResponse(); + } + + pkt->req->setPaddr(paddr); + + if (local_entry->uncacheable) { + pkt->req->setFlags(Request::UNCACHEABLE); + } + + //send packet back to coalescer + cpuSidePort[0]->sendTimingResp(pkt); + //schedule cleanup event + cleanupQueue.push(virt_page_addr); + + // schedule this only once per cycle. 
+ // The check is required because we might have multiple translations + // returning the same cycle + // this is a maximum priority event and must be on the same cycle + // as the cleanup event in TLBCoalescer to avoid a race with + // IssueProbeEvent caused by TLBCoalescer::MemSidePort::recvReqRetry + if (!cleanupEvent.scheduled()) + schedule(cleanupEvent, curTick()); + } + + /** + * Here we take the appropriate actions based on the result of the + * TLB lookup. + */ + void + GpuTLB::translationReturn(Addr virtPageAddr, tlbOutcome outcome, + PacketPtr pkt) + { + DPRINTF(GPUTLB, "Triggered TLBEvent for addr %#x\n", virtPageAddr); + + assert(translationReturnEvent[virtPageAddr]); + assert(pkt); + + TranslationState *tmp_sender_state = + safe_cast<TranslationState*>(pkt->senderState); + + int req_cnt = tmp_sender_state->reqCnt.back(); + bool update_stats = !tmp_sender_state->prefetch; + + + if (outcome == TLB_HIT) { + handleTranslationReturn(virtPageAddr, TLB_HIT, pkt); + + if (update_stats) { + accessCycles += (req_cnt * curTick()); + localCycles += curTick(); + } + + } else if (outcome == TLB_MISS) { + + DPRINTF(GPUTLB, "This is a TLB miss\n"); + if (update_stats) { + accessCycles += (req_cnt*curTick()); + localCycles += curTick(); + } + + if (hasMemSidePort) { + // the one cyle added here represent the delay from when we get + // the reply back till when we propagate it to the coalescer + // above. + if (update_stats) { + accessCycles += (req_cnt * 1); + localCycles += 1; + } + + /** + * There is a TLB below. Send the coalesced request. + * We actually send the very first packet of all the + * pending packets for this virtual page address. + */ + if (!memSidePort[0]->sendTimingReq(pkt)) { + DPRINTF(GPUTLB, "Failed sending translation request to " + "lower level TLB for addr %#x\n", virtPageAddr); + + memSidePort[0]->retries.push_back(pkt); + } else { + DPRINTF(GPUTLB, "Sent translation request to lower level " + "TLB for addr %#x\n", virtPageAddr); + } + } else { + //this is the last level TLB. Start a page walk + DPRINTF(GPUTLB, "Last level TLB - start a page walk for " + "addr %#x\n", virtPageAddr); + + if (update_stats) + pageTableCycles -= (req_cnt*curTick()); + + TLBEvent *tlb_event = translationReturnEvent[virtPageAddr]; + assert(tlb_event); + tlb_event->updateOutcome(PAGE_WALK); + schedule(tlb_event, curTick() + ticks(missLatency2)); + } + } else if (outcome == PAGE_WALK) { + if (update_stats) + pageTableCycles += (req_cnt*curTick()); + + // Need to access the page table and update the TLB + DPRINTF(GPUTLB, "Doing a page walk for address %#x\n", + virtPageAddr); + + TranslationState *sender_state = + safe_cast<TranslationState*>(pkt->senderState); + + Process *p = sender_state->tc->getProcessPtr(); + TlbEntry newEntry; + Addr vaddr = pkt->req->getVaddr(); + #ifndef NDEBUG + Addr alignedVaddr = p->pTable->pageAlign(vaddr); + assert(alignedVaddr == virtPageAddr); + #endif + bool success; + success = p->pTable->lookup(vaddr, newEntry); + if (!success && sender_state->tlbMode != BaseTLB::Execute) { + if (p->fixupStackFault(vaddr)) { + success = p->pTable->lookup(vaddr, newEntry); + } + } + + DPRINTF(GPUTLB, "Mapping %#x to %#x\n", alignedVaddr, + newEntry.pageStart()); + + sender_state->tlbEntry = + new GpuTlbEntry(0, newEntry.vaddr, newEntry.paddr, success); + + handleTranslationReturn(virtPageAddr, TLB_MISS, pkt); + } else if (outcome == MISS_RETURN) { + /** we add an extra cycle in the return path of the translation + * requests in between the various TLB levels. 
+ */ + handleTranslationReturn(virtPageAddr, TLB_MISS, pkt); + } else { + assert(false); + } + } + + void + GpuTLB::TLBEvent::process() + { + tlb->translationReturn(virtPageAddr, outcome, pkt); + } + + const char* + GpuTLB::TLBEvent::description() const + { + return "trigger translationDoneEvent"; + } + + void + GpuTLB::TLBEvent::updateOutcome(tlbOutcome _outcome) + { + outcome = _outcome; + } + + Addr + GpuTLB::TLBEvent::getTLBEventVaddr() + { + return virtPageAddr; + } + + /* + * recvTiming receives a coalesced timing request from a TLBCoalescer + * and it calls issueTLBLookup() + * It only rejects the packet if we have exceeded the max + * outstanding number of requests for the TLB + */ + bool + GpuTLB::CpuSidePort::recvTimingReq(PacketPtr pkt) + { + if (tlb->outstandingReqs < tlb->maxCoalescedReqs) { + tlb->issueTLBLookup(pkt); + // update number of outstanding translation requests + tlb->outstandingReqs++; + return true; + } else { + DPRINTF(GPUTLB, "Reached maxCoalescedReqs number %d\n", + tlb->outstandingReqs); + return false; + } + } + + /** + * handleFuncTranslationReturn is called on a TLB hit, + * when a TLB miss returns or when a page fault returns. + * It updates LRU, inserts the TLB entry on a miss + * depending on the allocation policy and does the required + * protection checks. It does NOT create a new packet to + * update the packet's addr; this is done in hsail-gpu code. + */ + void + GpuTLB::handleFuncTranslationReturn(PacketPtr pkt, tlbOutcome tlb_outcome) + { + TranslationState *sender_state = + safe_cast<TranslationState*>(pkt->senderState); + + ThreadContext *tc = sender_state->tc; + Mode mode = sender_state->tlbMode; + Addr vaddr = pkt->req->getVaddr(); + + GpuTlbEntry *local_entry, *new_entry; + + if (tlb_outcome == TLB_HIT) { + DPRINTF(GPUTLB, "Functional Translation Done - TLB hit for addr " + "%#x\n", vaddr); + + local_entry = sender_state->tlbEntry; + } else { + DPRINTF(GPUTLB, "Functional Translation Done - TLB miss for addr " + "%#x\n", vaddr); + + // We are returning either from a page walk or from a hit at a lower + // TLB level. The senderState should be "carrying" a pointer to the + // correct TLBEntry. + new_entry = sender_state->tlbEntry; + assert(new_entry); + local_entry = new_entry; + + if (allocationPolicy) { + Addr virt_page_addr = roundDown(vaddr, TheISA::PageBytes); + + DPRINTF(GPUTLB, "allocating entry w/ addr %#x\n", + virt_page_addr); + + local_entry = insert(virt_page_addr, *new_entry); + } + + assert(local_entry); + } + + DPRINTF(GPUTLB, "Entry found with vaddr %#x, doing protection checks " + "while paddr was %#x.\n", local_entry->vaddr, + local_entry->paddr); + + // Do paging checks if it's a normal functional access. If it's for a + // prefetch, then sometimes you can try to prefetch something that won't + // pass protection. We don't actually want to fault becuase there is no + // demand access to deem this a violation. Just put it in the TLB and + // it will fault if indeed a future demand access touches it in + // violation. + if (!sender_state->prefetch && sender_state->tlbEntry->valid) + pagingProtectionChecks(tc, pkt, local_entry, mode); + + int page_size = local_entry->size(); + Addr paddr = local_entry->paddr | (vaddr & (page_size - 1)); + DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, paddr); + + pkt->req->setPaddr(paddr); + + if (local_entry->uncacheable) + pkt->req->setFlags(Request::UNCACHEABLE); + } + + // This is used for atomic translations. Need to + // make it all happen during the same cycle. 
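// [Editorial aside: illustrative sketch only, not part of this patch.]
// Both handleTranslationReturn() and handleFuncTranslationReturn() above
// build the physical address the same way: keep the page-offset bits of the
// virtual address and splice them onto the frame address held in the TLB
// entry. Assuming a page-aligned entry paddr and a power-of-two page size,
// the computation boils down to the hypothetical helper below (the name
// composePaddrSketch is made up for illustration).
static Addr
composePaddrSketch(Addr entry_paddr, Addr vaddr, int page_bytes)
{
    Addr offset_mask = (Addr)page_bytes - 1;    // e.g., 0xfff for 4KB pages
    return entry_paddr | (vaddr & offset_mask); // e.g., 0x2000 | 0xa34 = 0x2a34
}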
+ void
+ GpuTLB::CpuSidePort::recvFunctional(PacketPtr pkt)
+ {
+ TranslationState *sender_state =
+ safe_cast<TranslationState*>(pkt->senderState);
+
+ ThreadContext *tc = sender_state->tc;
+ bool update_stats = !sender_state->prefetch;
+
+ Addr virt_page_addr = roundDown(pkt->req->getVaddr(),
+ TheISA::PageBytes);
+
+ if (update_stats)
+ tlb->updatePageFootprint(virt_page_addr);
+
+ // do the TLB lookup without updating the stats
+ bool success = tlb->tlbLookup(pkt->req, tc, update_stats);
+ tlbOutcome tlb_outcome = success ? TLB_HIT : TLB_MISS;
+
+ // functional mode means no coalescing
+ // global metrics are the same as the local metrics
+ if (update_stats) {
+ tlb->globalNumTLBAccesses++;
+
+ if (success) {
+ sender_state->hitLevel = sender_state->reqCnt.size();
+ tlb->globalNumTLBHits++;
+ }
+ }
+
+ if (!success) {
+ if (update_stats)
+ tlb->globalNumTLBMisses++;
+ if (tlb->hasMemSidePort) {
+ // there is a TLB below -> propagate down the TLB hierarchy
+ tlb->memSidePort[0]->sendFunctional(pkt);
+ // If no valid translation from a prefetch, then just return
+ if (sender_state->prefetch && !pkt->req->hasPaddr())
+ return;
+ } else {
+ // Need to access the page table and update the TLB
+ DPRINTF(GPUTLB, "Doing a page walk for address %#x\n",
+ virt_page_addr);
+
+ Process *p = tc->getProcessPtr();
+ TlbEntry newEntry;
+
+ Addr vaddr = pkt->req->getVaddr();
+ #ifndef NDEBUG
+ Addr alignedVaddr = p->pTable->pageAlign(vaddr);
+ assert(alignedVaddr == virt_page_addr);
+ #endif
+
+ bool success = p->pTable->lookup(vaddr, newEntry);
+ if (!success && sender_state->tlbMode != BaseTLB::Execute) {
+ if (p->fixupStackFault(vaddr))
+ success = p->pTable->lookup(vaddr, newEntry);
+ }
+
+ if (!sender_state->prefetch) {
+ // no PageFaults are permitted after
+ // the second page table lookup
+ assert(success);
+
+ DPRINTF(GPUTLB, "Mapping %#x to %#x\n", alignedVaddr,
+ newEntry.pageStart());
+
+ sender_state->tlbEntry = new GpuTlbEntry(0, newEntry.vaddr,
+ newEntry.paddr,
+ success);
+ } else {
+ // If this was a prefetch, then do the normal thing if it
+ // was a successful translation. Otherwise, send an empty
+ // TLB entry back so that it can be figured out as empty and
+ // handled accordingly.
+ if (success) {
+ DPRINTF(GPUTLB, "Mapping %#x to %#x\n", alignedVaddr,
+ newEntry.pageStart());
+
+ sender_state->tlbEntry = new GpuTlbEntry(0,
+ newEntry.vaddr,
+ newEntry.paddr,
+ success);
+ } else {
+ DPRINTF(GPUPrefetch, "Prefetch failed %#x\n",
+ alignedVaddr);
+
+ sender_state->tlbEntry = new GpuTlbEntry();
+
+ return;
+ }
+ }
+ }
+ } else {
+ DPRINTF(GPUPrefetch, "Functional Hit for vaddr %#x\n",
+ tlb->lookup(pkt->req->getVaddr()));
+
+ GpuTlbEntry *entry = tlb->lookup(pkt->req->getVaddr(),
+ update_stats);
+
+ assert(entry);
+
+ sender_state->tlbEntry =
+ new GpuTlbEntry(0, entry->vaddr, entry->paddr, entry->valid);
+ }
+ // This is the function that would populate pkt->req with the paddr of
+ // the translation. But if no translation happens (i.e., prefetch fails)
+ // then the early returns in the above code will keep this function
+ // from executing.
+ tlb->handleFuncTranslationReturn(pkt, tlb_outcome);
+ }
+
+ void
+ GpuTLB::CpuSidePort::recvReqRetry()
+ {
+ // The CPUSidePort never sends anything but replies. No retries
+ // expected.
+ assert(false);
+ }
+
+ AddrRangeList
+ GpuTLB::CpuSidePort::getAddrRanges() const
+ {
+ // currently not checked by the master
+ AddrRangeList ranges;
+
+ return ranges;
+ }
+
+ /**
+ * MemSidePort receives the packet back.
+ * We need to call the handleTranslationReturn + * and propagate up the hierarchy. + */ + bool + GpuTLB::MemSidePort::recvTimingResp(PacketPtr pkt) + { + Addr virt_page_addr = roundDown(pkt->req->getVaddr(), + TheISA::PageBytes); + + DPRINTF(GPUTLB, "MemSidePort recvTiming for virt_page_addr %#x\n", + virt_page_addr); + + TLBEvent *tlb_event = tlb->translationReturnEvent[virt_page_addr]; + assert(tlb_event); + assert(virt_page_addr == tlb_event->getTLBEventVaddr()); + + tlb_event->updateOutcome(MISS_RETURN); + tlb->schedule(tlb_event, curTick()+tlb->ticks(1)); + + return true; + } + + void + GpuTLB::MemSidePort::recvReqRetry() + { + // No retries should reach the TLB. The retries + // should only reach the TLBCoalescer. + assert(false); + } + + void + GpuTLB::cleanup() + { + while (!cleanupQueue.empty()) { + Addr cleanup_addr = cleanupQueue.front(); + cleanupQueue.pop(); + + // delete TLBEvent + TLBEvent * old_tlb_event = translationReturnEvent[cleanup_addr]; + delete old_tlb_event; + translationReturnEvent.erase(cleanup_addr); + + // update number of outstanding requests + outstandingReqs--; + } + + /** the higher level coalescer should retry if it has + * any pending requests. + */ + for (int i = 0; i < cpuSidePort.size(); ++i) { + cpuSidePort[i]->sendRetryReq(); + } + } + + void + GpuTLB::updatePageFootprint(Addr virt_page_addr) + { + + std::pair<AccessPatternTable::iterator, bool> ret; + + AccessInfo tmp_access_info; + tmp_access_info.lastTimeAccessed = 0; + tmp_access_info.accessesPerPage = 0; + tmp_access_info.totalReuseDistance = 0; + tmp_access_info.sumDistance = 0; + tmp_access_info.meanDistance = 0; + + ret = TLBFootprint.insert(AccessPatternTable::value_type(virt_page_addr, + tmp_access_info)); + + bool first_page_access = ret.second; + + if (first_page_access) { + numUniquePages++; + } else { + int accessed_before; + accessed_before = curTick() - ret.first->second.lastTimeAccessed; + ret.first->second.totalReuseDistance += accessed_before; + } + + ret.first->second.accessesPerPage++; + ret.first->second.lastTimeAccessed = curTick(); + + if (accessDistance) { + ret.first->second.localTLBAccesses + .push_back(localNumTLBAccesses.value()); + } + } + + void + GpuTLB::exitCallback() + { + std::ostream *page_stat_file = nullptr; + + if (accessDistance) { + + // print per page statistics to a separate file (.csv format) + // simout is the gem5 output directory (default is m5out or the one + // specified with -d + page_stat_file = simout.create(name().c_str()); + + // print header + *page_stat_file << "page,max_access_distance,mean_access_distance, " + << "stddev_distance" << std::endl; + } + + // update avg. 
reuse distance footprint
+ AccessPatternTable::iterator iter, iter_begin, iter_end;
+ unsigned int sum_avg_reuse_distance_per_page = 0;
+
+ // iterate through all pages seen by this TLB
+ for (iter = TLBFootprint.begin(); iter != TLBFootprint.end(); iter++) {
+ sum_avg_reuse_distance_per_page += iter->second.totalReuseDistance /
+ iter->second.accessesPerPage;
+
+ if (accessDistance) {
+ unsigned int tmp = iter->second.localTLBAccesses[0];
+ unsigned int prev = tmp;
+
+ for (int i = 0; i < iter->second.localTLBAccesses.size(); ++i) {
+ if (i) {
+ tmp = prev + 1;
+ }
+
+ prev = iter->second.localTLBAccesses[i];
+ // update the localTLBAccesses value
+ // with the actual difference
+ iter->second.localTLBAccesses[i] -= tmp;
+ // compute the sum of AccessDistance per page
+ // used later for mean
+ iter->second.sumDistance +=
+ iter->second.localTLBAccesses[i];
+ }
+
+ iter->second.meanDistance =
+ iter->second.sumDistance / iter->second.accessesPerPage;
+
+ // compute std_dev and max (we need a second round because we
+ // need to know the mean value)
+ unsigned int max_distance = 0;
+ unsigned int stddev_distance = 0;
+
+ for (int i = 0; i < iter->second.localTLBAccesses.size(); ++i) {
+ unsigned int tmp_access_distance =
+ iter->second.localTLBAccesses[i];
+
+ if (tmp_access_distance > max_distance) {
+ max_distance = tmp_access_distance;
+ }
+
+ unsigned int diff =
+ tmp_access_distance - iter->second.meanDistance;
+ stddev_distance += pow(diff, 2);
+
+ }
+
+ stddev_distance =
+ sqrt(stddev_distance/iter->second.accessesPerPage);
+
+ if (page_stat_file) {
+ *page_stat_file << std::hex << iter->first << ",";
+ *page_stat_file << std::dec << max_distance << ",";
+ *page_stat_file << std::dec << iter->second.meanDistance
+ << ",";
+ *page_stat_file << std::dec << stddev_distance;
+ *page_stat_file << std::endl;
+ }
+
+ // clear the localTLBAccesses vector
+ iter->second.localTLBAccesses.clear();
+ }
+ }
+
+ if (!TLBFootprint.empty()) {
+ avgReuseDistance =
+ sum_avg_reuse_distance_per_page / TLBFootprint.size();
+ }
+
+ // clear the TLBFootprint map
+ TLBFootprint.clear();
+ }
+} // namespace X86ISA
+
+X86ISA::GpuTLB*
+X86GPUTLBParams::create()
+{
+ return new X86ISA::GpuTLB(this);
+}
+
diff --git a/src/gpu-compute/gpu_tlb.hh b/src/gpu-compute/gpu_tlb.hh
new file mode 100644
index 000000000..3549c598b
--- /dev/null
+++ b/src/gpu-compute/gpu_tlb.hh
@@ -0,0 +1,465 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Lisa Hsu + */ + +#ifndef __GPU_TLB_HH__ +#define __GPU_TLB_HH__ + +#include <fstream> +#include <list> +#include <queue> +#include <string> +#include <vector> + +#include "arch/generic/tlb.hh" +#include "arch/x86/pagetable.hh" +#include "arch/x86/pagetable_walker.hh" +#include "arch/x86/regs/segment.hh" +#include "base/callback.hh" +#include "base/misc.hh" +#include "base/statistics.hh" +#include "gpu-compute/compute_unit.hh" +#include "mem/mem_object.hh" +#include "mem/port.hh" +#include "mem/request.hh" +#include "params/X86GPUTLB.hh" +#include "sim/sim_object.hh" + +class BaseTLB; +class Packet; +class ThreadContext; + +namespace X86ISA +{ + class GpuTlbEntry : public TlbEntry + { + public: + GpuTlbEntry(Addr asn, Addr _vaddr, Addr _paddr, bool _valid) + : TlbEntry(asn, _vaddr, _paddr, false, false), valid(_valid) { } + + GpuTlbEntry() : TlbEntry() { } + + bool valid; + }; + + class GpuTLB : public MemObject + { + protected: + friend class Walker; + + typedef std::list<GpuTlbEntry*> EntryList; + + uint32_t configAddress; + + // TLB clock: will inherit clock from shader's clock period in terms + // of nuber of ticks of curTime (aka global simulation clock) + // The assignment of TLB clock from shader clock is done in the python + // config files. + int clock; + + public: + // clock related functions ; maps to-and-from Simulation ticks and + // object clocks. + Tick frequency() const { return SimClock::Frequency / clock; } + + Tick + ticks(int numCycles) const + { + return (Tick)clock * numCycles; + } + + Tick curCycle() const { return curTick() / clock; } + Tick tickToCycles(Tick val) const { return val / clock;} + + typedef X86GPUTLBParams Params; + GpuTLB(const Params *p); + ~GpuTLB(); + + typedef enum BaseTLB::Mode Mode; + + class Translation + { + public: + virtual ~Translation() { } + + /** + * Signal that the translation has been delayed due to a hw page + * table walk. + */ + virtual void markDelayed() = 0; + + /** + * The memory for this object may be dynamically allocated, and it + * may be responsible for cleaning itslef up which will happen in + * this function. Once it's called the object is no longer valid. + */ + virtual void finish(Fault fault, RequestPtr req, ThreadContext *tc, + Mode mode) = 0; + }; + + void dumpAll(); + GpuTlbEntry *lookup(Addr va, bool update_lru=true); + void setConfigAddress(uint32_t addr); + + protected: + EntryList::iterator lookupIt(Addr va, bool update_lru=true); + Walker *walker; + + public: + Walker *getWalker(); + void invalidateAll(); + void invalidateNonGlobal(); + void demapPage(Addr va, uint64_t asn); + + protected: + int size; + int assoc; + int numSets; + + /** + * true if this is a fully-associative TLB + */ + bool FA; + Addr setMask; + + /** + * Allocation Policy: true if we always allocate on a hit, false + * otherwise. Default is true. 
+ */ + bool allocationPolicy; + + /** + * if true, then this is not the last level TLB + */ + bool hasMemSidePort; + + /** + * Print out accessDistance stats. One stat file + * per TLB. + */ + bool accessDistance; + + GpuTlbEntry *tlb; + + /* + * It's a per-set list. As long as we have not reached + * the full capacity of the given set, grab an entry from + * the freeList. + */ + std::vector<EntryList> freeList; + + /** + * An entryList per set is the equivalent of an LRU stack; + * it's used to guide replacement decisions. The head of the list + * contains the MRU TLB entry of the given set. If the freeList + * for this set is empty, the last element of the list + * is evicted (i.e., dropped on the floor). + */ + std::vector<EntryList> entryList; + + Fault translateInt(RequestPtr req, ThreadContext *tc); + + Fault translate(RequestPtr req, ThreadContext *tc, + Translation *translation, Mode mode, bool &delayedResponse, + bool timing, int &latency); + + public: + // latencies for a TLB hit, miss and page fault + int hitLatency; + int missLatency1; + int missLatency2; + + // local_stats are as seen from the TLB + // without taking into account coalescing + Stats::Scalar localNumTLBAccesses; + Stats::Scalar localNumTLBHits; + Stats::Scalar localNumTLBMisses; + Stats::Formula localTLBMissRate; + + // global_stats are as seen from the + // CU's perspective taking into account + // all coalesced requests. + Stats::Scalar globalNumTLBAccesses; + Stats::Scalar globalNumTLBHits; + Stats::Scalar globalNumTLBMisses; + Stats::Formula globalTLBMissRate; + + // from the CU perspective (global) + Stats::Scalar accessCycles; + // from the CU perspective (global) + Stats::Scalar pageTableCycles; + Stats::Scalar numUniquePages; + // from the perspective of this TLB + Stats::Scalar localCycles; + // from the perspective of this TLB + Stats::Formula localLatency; + // I take the avg. per page and then + // the avg. over all pages. 
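// [Editorial aside: illustrative note, not part of this patch.]
// avgReuseDistance is assigned once, in exitCallback(): the per-page
// average reuse distance (totalReuseDistance / accessesPerPage) is summed
// over all pages in TLBFootprint and divided by the number of unique
// pages. For example, two pages averaging 400 and 600 ticks yield
// (400 + 600) / 2 = 500 ticks.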
+ Stats::Scalar avgReuseDistance; + + void regStats(); + void updatePageFootprint(Addr virt_page_addr); + void printAccessPattern(); + + + Fault translateAtomic(RequestPtr req, ThreadContext *tc, Mode mode, + int &latency); + + void translateTiming(RequestPtr req, ThreadContext *tc, + Translation *translation, Mode mode, + int &latency); + + Tick doMmuRegRead(ThreadContext *tc, Packet *pkt); + Tick doMmuRegWrite(ThreadContext *tc, Packet *pkt); + + GpuTlbEntry *insert(Addr vpn, GpuTlbEntry &entry); + + // Checkpointing + virtual void serialize(CheckpointOut& cp) const; + virtual void unserialize(CheckpointIn& cp); + void issueTranslation(); + enum tlbOutcome {TLB_HIT, TLB_MISS, PAGE_WALK, MISS_RETURN}; + bool tlbLookup(RequestPtr req, ThreadContext *tc, bool update_stats); + + void handleTranslationReturn(Addr addr, tlbOutcome outcome, + PacketPtr pkt); + + void handleFuncTranslationReturn(PacketPtr pkt, tlbOutcome outcome); + + void pagingProtectionChecks(ThreadContext *tc, PacketPtr pkt, + GpuTlbEntry *tlb_entry, Mode mode); + + void updatePhysAddresses(Addr virt_page_addr, GpuTlbEntry *tlb_entry, + Addr phys_page_addr); + + void issueTLBLookup(PacketPtr pkt); + + // CpuSidePort is the TLB Port closer to the CPU/CU side + class CpuSidePort : public SlavePort + { + public: + CpuSidePort(const std::string &_name, GpuTLB * gpu_TLB, + PortID _index) + : SlavePort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { } + + protected: + GpuTLB *tlb; + int index; + + virtual bool recvTimingReq(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt) { return 0; } + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRangeChange() { } + virtual void recvReqRetry(); + virtual void recvRespRetry() { assert(false); } + virtual AddrRangeList getAddrRanges() const; + }; + + /** + * MemSidePort is the TLB Port closer to the memory side + * If this is a last level TLB then this port will not be connected. + * + * Future action item: if we ever do real page walks, then this port + * should be connected to a RubyPort. + */ + class MemSidePort : public MasterPort + { + public: + MemSidePort(const std::string &_name, GpuTLB * gpu_TLB, + PortID _index) + : MasterPort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { } + + std::deque<PacketPtr> retries; + + protected: + GpuTLB *tlb; + int index; + + virtual bool recvTimingResp(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt) { return 0; } + virtual void recvFunctional(PacketPtr pkt) { } + virtual void recvRangeChange() { } + virtual void recvReqRetry(); + }; + + // TLB ports on the cpu Side + std::vector<CpuSidePort*> cpuSidePort; + // TLB ports on the memory side + std::vector<MemSidePort*> memSidePort; + + BaseMasterPort &getMasterPort(const std::string &if_name, + PortID idx=InvalidPortID); + + BaseSlavePort &getSlavePort(const std::string &if_name, + PortID idx=InvalidPortID); + + /** + * TLB TranslationState: this currently is a somewhat bastardization of + * the usage of SenderState, whereby the receiver of a packet is not + * usually supposed to need to look at the contents of the senderState, + * you're really only supposed to look at what you pushed on, pop it + * off, and send it back. + * + * However, since there is state that we want to pass to the TLBs using + * the send/recv Timing/Functional/etc. APIs, which don't allow for new + * arguments, we need a common TLB senderState to pass between TLBs, + * both "forwards" and "backwards." 
+ * + * So, basically, the rule is that any packet received by a TLB port + * (cpuside OR memside) must be safely castable to a TranslationState. + */ + + struct TranslationState : public Packet::SenderState + { + // TLB mode, read or write + Mode tlbMode; + // Thread context associated with this req + ThreadContext *tc; + + /* + * TLB entry to be populated and passed back and filled in + * previous TLBs. Equivalent to the data cache concept of + * "data return." + */ + GpuTlbEntry *tlbEntry; + // Is this a TLB prefetch request? + bool prefetch; + // When was the req for this translation issued + uint64_t issueTime; + // Remember where this came from + std::vector<SlavePort*>ports; + + // keep track of #uncoalesced reqs per packet per TLB level; + // reqCnt per level >= reqCnt higher level + std::vector<int> reqCnt; + // TLB level this packet hit in; 0 if it hit in the page table + int hitLevel; + Packet::SenderState *saved; + + TranslationState(Mode tlb_mode, ThreadContext *_tc, + bool _prefetch=false, + Packet::SenderState *_saved=nullptr) + : tlbMode(tlb_mode), tc(_tc), tlbEntry(nullptr), + prefetch(_prefetch), issueTime(0), + hitLevel(0),saved(_saved) { } + }; + + // maximum number of permitted coalesced requests per cycle + int maxCoalescedReqs; + + // Current number of outstandings coalesced requests. + // Should be <= maxCoalescedReqs + int outstandingReqs; + + /** + * A TLBEvent is scheduled after the TLB lookup and helps us take the + * appropriate actions: + * (e.g., update TLB on a hit, + * send request to lower level TLB on a miss, + * or start a page walk if this was the last-level TLB). + */ + void translationReturn(Addr virtPageAddr, tlbOutcome outcome, + PacketPtr pkt); + + class TLBEvent : public Event + { + private: + GpuTLB *tlb; + Addr virtPageAddr; + /** + * outcome can be TLB_HIT, TLB_MISS, or PAGE_WALK + */ + tlbOutcome outcome; + PacketPtr pkt; + + public: + TLBEvent(GpuTLB *_tlb, Addr _addr, tlbOutcome outcome, + PacketPtr _pkt); + + void process(); + const char *description() const; + + // updateOutcome updates the tlbOutcome of a TLBEvent + void updateOutcome(tlbOutcome _outcome); + Addr getTLBEventVaddr(); + }; + + std::unordered_map<Addr, TLBEvent*> translationReturnEvent; + + // this FIFO queue keeps track of the virt. page addresses + // that are pending cleanup + std::queue<Addr> cleanupQueue; + + // the cleanupEvent is scheduled after a TLBEvent triggers in order to + // free memory and do the required clean-up + void cleanup(); + + EventWrapper<GpuTLB, &GpuTLB::cleanup> cleanupEvent; + + /** + * This hash map will use the virtual page address as a key + * and will keep track of total number of accesses per page + */ + + struct AccessInfo + { + unsigned int lastTimeAccessed; // last access to this page + unsigned int accessesPerPage; + // need to divide it by accessesPerPage at the end + unsigned int totalReuseDistance; + + /** + * The field below will help us compute the access distance, + * that is the number of (coalesced) TLB accesses that + * happened in between each access to this page + * + * localTLBAccesses[x] is the value of localTLBNumAccesses + * when the page <Addr> was accessed for the <x>th time + */ + std::vector<unsigned int> localTLBAccesses; + unsigned int sumDistance; + unsigned int meanDistance; + }; + + typedef std::unordered_map<Addr, AccessInfo> AccessPatternTable; + AccessPatternTable TLBFootprint; + + // Called at the end of simulation to dump page access stats. 
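// [Editorial aside: illustrative sketch, not part of this patch.]
// The bookkeeping that feeds this dump happens in updatePageFootprint():
// every repeat access to a page already present in TLBFootprint does,
// roughly,
//
//     info.totalReuseDistance += curTick() - info.lastTimeAccessed;
//     info.accessesPerPage++;
//     info.lastTimeAccessed = curTick();
//
// and, when accessDistance is enabled, the running localNumTLBAccesses
// value is appended to localTLBAccesses so that exitCallback() can later
// convert the series into per-page access-distance max/mean/stddev.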
+ void exitCallback(); + + EventWrapper<GpuTLB, &GpuTLB::exitCallback> exitEvent; + }; +} + +#endif // __GPU_TLB_HH__ diff --git a/src/gpu-compute/hsa_code.hh b/src/gpu-compute/hsa_code.hh new file mode 100644 index 000000000..9f358e23c --- /dev/null +++ b/src/gpu-compute/hsa_code.hh @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#ifndef __HSA_CODE_HH__ +#define __HSA_CODE_HH__ + +#include <string> +#include <vector> + +#include "arch/gpu_types.hh" +#include "config/the_gpu_isa.hh" + +class HsaKernelInfo; + +/* @class HsaCode + * base code object for the set of HSA kernels associated + * with a single application. this class provides the common + * methods for creating, accessing, and storing information + * about kernel and variable symbols, symbol name, memory + * segment sizes, and instruction count, etc. + */ + +class HsaCode +{ + public: + HsaCode(const std::string &name) : readonly_data(nullptr), funcarg_size(0), + _name(name) + { + } + + enum class MemorySegment { + NONE, + FLAT, + GLOBAL, + READONLY, + KERNARG, + GROUP, + PRIVATE, + SPILL, + ARG, + EXTSPACE0 + }; + + const std::string& name() const { return _name; } + int numInsts() const { return _insts.size(); } + std::vector<TheGpuISA::RawMachInst>* insts() { return &_insts; } + + void + setReadonlyData(uint8_t *_readonly_data) + { + readonly_data = _readonly_data; + } + + virtual int getSize(MemorySegment segment) const = 0; + virtual void generateHsaKernelInfo(HsaKernelInfo *hsaKernelInfo) const = 0; + + uint8_t *readonly_data; + int funcarg_size; + + protected: + // An array that stores instruction indices (0 through kernel size) + // for a kernel passed to code object constructor as an argument. 
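// [Editorial aside: illustrative note, not part of this patch.]
// In the HSAIL implementation (HsailCode::init()), each decoded
// instruction is registered with the GPU ISA decoder via saveInst(),
// which returns a RawMachInst handle, and that handle is pushed into
// this vector. Looking up instruction i of the kernel later is then
// presumably just a matter of reading the handle at index i and asking
// the decoder for the cached GPUStaticInst.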
+ std::vector<TheGpuISA::RawMachInst> _insts; + + private: + const std::string _name; +}; + +#endif // __HSA_CODE_HH__ diff --git a/src/gpu-compute/hsa_kernel_info.hh b/src/gpu-compute/hsa_kernel_info.hh new file mode 100644 index 000000000..396913dac --- /dev/null +++ b/src/gpu-compute/hsa_kernel_info.hh @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#ifndef __HSA_KERNEL_INFO_HH__ +#define __HSA_KERNEL_INFO_HH__ + +// This file defines the public interface between the HSA emulated +// driver and application programs. + +#include <cstdint> + +static const int HSA_GET_SIZES = 0x4801; +static const int HSA_GET_KINFO = 0x4802; +static const int HSA_GET_STRINGS = 0x4803; +static const int HSA_GET_CODE = 0x4804; +static const int HSA_GET_READONLY_DATA = 0x4805; +static const int HSA_GET_CU_CNT = 0x4806; +static const int HSA_GET_VSZ = 0x4807; + +// Return value (via buffer ptr) for HSA_GET_SIZES +struct HsaDriverSizes +{ + uint32_t num_kernels; + uint32_t string_table_size; + uint32_t code_size; + uint32_t readonly_size; +}; + +// HSA_GET_KINFO returns an array of num_kernels of these structs +struct HsaKernelInfo +{ + // byte offset into string table + uint32_t name_offs; + // byte offset into code array + uint32_t code_offs; + uint32_t static_lds_size; + uint32_t private_mem_size; + uint32_t spill_mem_size; + // Number of s registers + uint32_t sRegCount; + // Number of d registers + uint32_t dRegCount; + // Number of c registers + uint32_t cRegCount; +}; + +#endif // __HSA_KERNEL_INFO_HH__ diff --git a/src/gpu-compute/hsa_object.cc b/src/gpu-compute/hsa_object.cc new file mode 100644 index 000000000..91dfb160e --- /dev/null +++ b/src/gpu-compute/hsa_object.cc @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#include "gpu-compute/hsa_object.hh" + +#include <fstream> + +#include "gpu-compute/brig_object.hh" + +HsaObject::HsaObject(const std::string &fname) + : readonlyData(nullptr), filename(fname) +{ +} + +HsaObject* +HsaObject::createHsaObject(const std::string &fname) +{ + HsaObject *hsaObj = nullptr; + uint8_t *file_data = nullptr; + int file_length = 0; + + std::ifstream code_file(fname, std::ifstream::ate | std::ifstream::in | + std::ifstream::binary); + + assert(code_file.is_open()); + assert(code_file.good()); + + file_length = code_file.tellg(); + code_file.seekg(0, code_file.beg); + file_data = new uint8_t[file_length]; + code_file.read((char*)file_data, file_length); + code_file.close(); + + for (const auto &tryFile : tryFileFuncs) { + if ((hsaObj = tryFile(fname, file_length, file_data))) { + return hsaObj; + } + } + + delete[] file_data; + fatal("Unknown HSA object type for file: %s.\n", fname); + + return nullptr; +} diff --git a/src/gpu-compute/hsa_object.hh b/src/gpu-compute/hsa_object.hh new file mode 100644 index 000000000..1f08f5d80 --- /dev/null +++ b/src/gpu-compute/hsa_object.hh @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Anthony Gutierrez + */ + +#ifndef __HSA_OBJECT_HH__ +#define __HSA_OBJECT_HH__ + +#include <functional> +#include <string> +#include <vector> + +class HsaCode; + +/* @class HsaObject + * base loader object for HSA kernels. this class provides + * the base method definitions for loading, storing, and + * accessing HSA kernel objects into the simulator. + */ + +class HsaObject +{ + public: + HsaObject(const std::string &fileName); + + static HsaObject* createHsaObject(const std::string &fname); + static std::vector<std::function<HsaObject*(const std::string&, int, + uint8_t*)>> tryFileFuncs; + + virtual HsaCode* getKernel(const std::string &name) const = 0; + virtual HsaCode* getKernel(int i) const = 0; + virtual HsaCode* getFunction(const std::string &name) const = 0; + virtual int numKernels() const = 0; + + const std::string& name() const { return filename; } + + uint8_t *readonlyData; + + + protected: + const std::string filename; +}; + +#endif // __HSA_OBJECT_HH__ diff --git a/src/gpu-compute/hsail_code.cc b/src/gpu-compute/hsail_code.cc new file mode 100644 index 000000000..b0ddf0161 --- /dev/null +++ b/src/gpu-compute/hsail_code.cc @@ -0,0 +1,453 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#include "gpu-compute/hsail_code.hh" + +#include "arch/gpu_types.hh" +#include "arch/hsail/Brig.h" +#include "arch/hsail/operand.hh" +#include "config/the_gpu_isa.hh" +#include "debug/BRIG.hh" +#include "debug/HSAILObject.hh" +#include "gpu-compute/brig_object.hh" +#include "gpu-compute/gpu_static_inst.hh" +#include "gpu-compute/kernel_cfg.hh" + +using namespace Brig; + +int getBrigDataTypeBytes(BrigType16_t t); + +HsailCode::HsailCode(const std::string &name_str) + : HsaCode(name_str), private_size(-1), readonly_size(-1) +{ +} + +void +HsailCode::init(const BrigDirectiveExecutable *code_dir, const BrigObject *obj, + StorageMap *objStorageMap) +{ + storageMap = objStorageMap; + + // set pointer so that decoding process can find this kernel context when + // needed + obj->currentCode = this; + + if (code_dir->base.kind != BRIG_KIND_DIRECTIVE_FUNCTION && + code_dir->base.kind != BRIG_KIND_DIRECTIVE_KERNEL) { + fatal("unexpected directive kind %d inside kernel/function init\n", + code_dir->base.kind); + } + + DPRINTF(HSAILObject, "Initializing code, first code block entry is: %d\n", + code_dir->firstCodeBlockEntry); + + // clear these static vars so we can properly track the max index + // for this kernel + SRegOperand::maxRegIdx = 0; + DRegOperand::maxRegIdx = 0; + CRegOperand::maxRegIdx = 0; + setPrivateSize(0); + + const BrigBase *entryPtr = brigNext((BrigBase*)code_dir); + const BrigBase *endPtr = + obj->getCodeSectionEntry(code_dir->nextModuleEntry); + + int inst_idx = 0; + std::vector<GPUStaticInst*> instructions; + int funcarg_size_scope = 0; + + // walk through instructions in code section and directives in + // directive section in parallel, processing directives that apply + // when we reach the relevant code point. 
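// [Editorial aside: illustrative sketch, not part of this patch.]
// A BRIG section is a packed sequence of variable-length entries, each
// starting with a BrigBase header whose byteCount field gives the entry's
// total size. brigNext() is therefore assumed to advance by roughly
//
//     next = (const BrigBase*)((const uint8_t*)entryPtr + entryPtr->byteCount);
//
// and the loop below simply repeats that step until entryPtr reaches
// endPtr, dispatching on entryPtr->kind at each stop.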
+ while (entryPtr < endPtr) { + switch (entryPtr->kind) { + case BRIG_KIND_DIRECTIVE_VARIABLE: + { + const BrigDirectiveVariable *sym = + (const BrigDirectiveVariable*)entryPtr; + + DPRINTF(HSAILObject,"Initializing code, directive is " + "kind_variable, symbol is: %s\n", + obj->getString(sym->name)); + + StorageElement *se = storageMap->addSymbol(sym, obj); + + if (sym->segment == BRIG_SEGMENT_PRIVATE) { + setPrivateSize(se->size); + } else { // spill + funcarg_size_scope += se->size; + } + } + break; + + case BRIG_KIND_DIRECTIVE_LABEL: + { + const BrigDirectiveLabel *lbl = + (const BrigDirectiveLabel*)entryPtr; + + DPRINTF(HSAILObject,"Initializing code, directive is " + "kind_label, label is: %s \n", + obj->getString(lbl->name)); + + labelMap.addLabel(lbl, inst_idx, obj); + } + break; + + case BRIG_KIND_DIRECTIVE_PRAGMA: + { + DPRINTF(HSAILObject, "Initializing code, directive " + "is kind_pragma\n"); + } + break; + + case BRIG_KIND_DIRECTIVE_COMMENT: + { + DPRINTF(HSAILObject, "Initializing code, directive is " + "kind_comment\n"); + } + break; + + case BRIG_KIND_DIRECTIVE_ARG_BLOCK_START: + { + DPRINTF(HSAILObject, "Initializing code, directive is " + "kind_arg_block_start\n"); + + storageMap->resetOffset(BRIG_SEGMENT_ARG); + funcarg_size_scope = 0; + } + break; + + case BRIG_KIND_DIRECTIVE_ARG_BLOCK_END: + { + DPRINTF(HSAILObject, "Initializing code, directive is " + "kind_arg_block_end\n"); + + funcarg_size = funcarg_size < funcarg_size_scope ? + funcarg_size_scope : funcarg_size; + } + break; + + case BRIG_KIND_DIRECTIVE_END: + DPRINTF(HSAILObject, "Initializing code, dircetive is " + "kind_end\n"); + + break; + + default: + if (entryPtr->kind >= BRIG_KIND_INST_BEGIN && + entryPtr->kind <= BRIG_KIND_INST_END) { + + BrigInstBase *instPtr = (BrigInstBase*)entryPtr; + TheGpuISA::MachInst machInst = { instPtr, obj }; + GPUStaticInst *iptr = decoder.decode(machInst); + + if (iptr) { + DPRINTF(HSAILObject, "Initializing code, processing inst " + "#%d idx %d: OPCODE=%d\n", + inst_idx, _insts.size(), instPtr->opcode); + + TheGpuISA::RawMachInst inst_num = decoder.saveInst(iptr); + iptr->instNum(inst_idx); + _insts.push_back(inst_num); + instructions.push_back(iptr); + } + ++inst_idx; + } else if (entryPtr->kind >= BRIG_KIND_OPERAND_BEGIN && + entryPtr->kind < BRIG_KIND_OPERAND_END) { + warn("unexpected operand entry in code segment\n"); + } else { + // there are surely some more cases we will need to handle, + // but we'll deal with them as we find them. 
+ fatal("unexpected directive kind %d inside kernel scope\n", + entryPtr->kind); + } + } + + entryPtr = brigNext(entryPtr); + } + + // compute Control Flow Graph for current kernel + ControlFlowInfo::assignImmediatePostDominators(instructions); + + max_sreg = SRegOperand::maxRegIdx; + max_dreg = DRegOperand::maxRegIdx; + max_creg = CRegOperand::maxRegIdx; + + obj->currentCode = nullptr; +} + +HsailCode::HsailCode(const std::string &name_str, + const BrigDirectiveExecutable *code_dir, + const BrigObject *obj, StorageMap *objStorageMap) + : HsaCode(name_str), private_size(-1), readonly_size(-1) +{ + init(code_dir, obj, objStorageMap); +} + +void +LabelMap::addLabel(const Brig::BrigDirectiveLabel *lblDir, int inst_index, + const BrigObject *obj) +{ + std::string lbl_name = obj->getString(lblDir->name); + Label &lbl = map[lbl_name]; + + if (lbl.defined()) { + fatal("Attempt to redefine existing label %s\n", lbl_name); + } + + lbl.define(lbl_name, inst_index); + DPRINTF(HSAILObject, "label %s = %d\n", lbl_name, inst_index); +} + +Label* +LabelMap::refLabel(const Brig::BrigDirectiveLabel *lblDir, + const BrigObject *obj) +{ + std::string name = obj->getString(lblDir->name); + Label &lbl = map[name]; + lbl.checkName(name); + + return &lbl; +} + +int +getBrigDataTypeBytes(BrigType16_t t) +{ + switch (t) { + case BRIG_TYPE_S8: + case BRIG_TYPE_U8: + case BRIG_TYPE_B8: + return 1; + + case BRIG_TYPE_S16: + case BRIG_TYPE_U16: + case BRIG_TYPE_B16: + case BRIG_TYPE_F16: + return 2; + + case BRIG_TYPE_S32: + case BRIG_TYPE_U32: + case BRIG_TYPE_B32: + case BRIG_TYPE_F32: + return 4; + + case BRIG_TYPE_S64: + case BRIG_TYPE_U64: + case BRIG_TYPE_B64: + case BRIG_TYPE_F64: + return 8; + + case BRIG_TYPE_B1: + + default: + fatal("unhandled symbol data type %d", t); + return 0; + } +} + +StorageElement* +StorageSpace::addSymbol(const BrigDirectiveVariable *sym, + const BrigObject *obj) +{ + const char *sym_name = obj->getString(sym->name); + uint64_t size = 0; + uint64_t offset = 0; + + if (sym->type & BRIG_TYPE_ARRAY) { + size = getBrigDataTypeBytes(sym->type & ~BRIG_TYPE_ARRAY); + size *= (((uint64_t)sym->dim.hi) << 32 | (uint64_t)sym->dim.lo); + + offset = roundUp(nextOffset, getBrigDataTypeBytes(sym->type & + ~BRIG_TYPE_ARRAY)); + } else { + size = getBrigDataTypeBytes(sym->type); + offset = roundUp(nextOffset, getBrigDataTypeBytes(sym->type)); + } + + nextOffset = offset + size; + + DPRINTF(HSAILObject, "Adding %s SYMBOL %s size %d offset 0x%x, init: %d\n", + segmentNames[segment], sym_name, size, offset, sym->init); + + StorageElement* se = new StorageElement(sym_name, offset, size, sym); + elements.push_back(se); + elements_by_addr.insert(AddrRange(offset, offset + size - 1), se); + elements_by_brigptr[sym] = se; + + return se; +} + +StorageElement* +StorageSpace::findSymbol(std::string name) +{ + for (auto it : elements) { + if (it->name == name) { + return it; + } + } + + return nullptr; +} + +StorageElement* +StorageSpace::findSymbol(uint64_t addr) +{ + assert(elements_by_addr.size() > 0); + + auto se = elements_by_addr.find(addr); + + if (se == elements_by_addr.end()) { + return nullptr; + } else { + return se->second; + } +} + +StorageElement* +StorageSpace::findSymbol(const BrigDirectiveVariable *brigptr) +{ + assert(elements_by_brigptr.size() > 0); + + auto se = elements_by_brigptr.find(brigptr); + + if (se == elements_by_brigptr.end()) { + return nullptr; + } else { + return se->second; + } +} + +StorageMap::StorageMap(StorageMap *outerScope) + : outerScopeMap(outerScope) +{ + for (int i = 
0; i < NumSegments; ++i) + space[i] = new StorageSpace((BrigSegment)i); +} + +StorageElement* +StorageMap::addSymbol(const BrigDirectiveVariable *sym, const BrigObject *obj) +{ + BrigSegment8_t segment = sym->segment; + + assert(segment >= Brig::BRIG_SEGMENT_FLAT); + assert(segment < NumSegments); + + return space[segment]->addSymbol(sym, obj); +} + +int +StorageMap::getSize(Brig::BrigSegment segment) +{ + assert(segment > Brig::BRIG_SEGMENT_GLOBAL); + assert(segment < NumSegments); + + if (segment != Brig::BRIG_SEGMENT_GROUP && + segment != Brig::BRIG_SEGMENT_READONLY) { + return space[segment]->getSize(); + } else { + int ret = space[segment]->getSize(); + + if (outerScopeMap) { + ret += outerScopeMap->getSize(segment); + } + + return ret; + } +} + +void +StorageMap::resetOffset(Brig::BrigSegment segment) +{ + space[segment]->resetOffset(); +} + +StorageElement* +StorageMap::findSymbol(BrigSegment segment, std::string name) +{ + StorageElement *se = space[segment]->findSymbol(name); + + if (se) + return se; + + if (outerScopeMap) + return outerScopeMap->findSymbol(segment, name); + + return nullptr; +} + +StorageElement* +StorageMap::findSymbol(Brig::BrigSegment segment, uint64_t addr) +{ + StorageSpace *sp = space[segment]; + + if (!sp) { + // there is no memory in segment? + return nullptr; + } + + StorageElement *se = sp->findSymbol(addr); + + if (se) + return se; + + if (outerScopeMap) + return outerScopeMap->findSymbol(segment, addr); + + return nullptr; + +} + +StorageElement* +StorageMap::findSymbol(Brig::BrigSegment segment, + const BrigDirectiveVariable *brigptr) +{ + StorageSpace *sp = space[segment]; + + if (!sp) { + // there is no memory in segment? + return nullptr; + } + + StorageElement *se = sp->findSymbol(brigptr); + + if (se) + return se; + + if (outerScopeMap) + return outerScopeMap->findSymbol(segment, brigptr); + + return nullptr; + +} diff --git a/src/gpu-compute/hsail_code.hh b/src/gpu-compute/hsail_code.hh new file mode 100644 index 000000000..d9fbcc577 --- /dev/null +++ b/src/gpu-compute/hsail_code.hh @@ -0,0 +1,447 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#ifndef __HSAIL_CODE_HH__ +#define __HSAIL_CODE_HH__ + +#include <cassert> +#include <list> +#include <map> +#include <string> +#include <vector> + +#include "arch/gpu_decoder.hh" +#include "arch/hsail/Brig.h" +#include "base/addr_range_map.hh" +#include "base/intmath.hh" +#include "config/the_gpu_isa.hh" +#include "gpu-compute/hsa_code.hh" +#include "gpu-compute/hsa_kernel_info.hh" +#include "gpu-compute/misc.hh" + +class BrigObject; +class GPUStaticInst; + +inline int +popcount(uint64_t src, int sz) +{ + int cnt = 0; + + for (int i = 0; i < sz; ++i) { + if (src & 1) + ++cnt; + src >>= 1; + } + + return cnt; +} + +inline int +firstbit(uint64_t src, int sz) +{ + int i; + + for (i = 0; i < sz; ++i) { + if (src & 1) + break; + src >>= 1; + } + + return i; +} + +inline int +lastbit(uint64_t src, int sz) +{ + int i0 = -1; + + for (int i = 0; i < sz; ++i) { + if (src & 1) + i0 = i; + src >>= 1; + } + + return i0; +} + +inline int +signbit(uint64_t src, int sz) +{ + int i0 = -1; + + if (src & (1 << (sz - 1))) { + for (int i = 0; i < sz - 1; ++i) { + if (!(src & 1)) + i0 = i; + src >>= 1; + } + } else { + for (int i = 0; i < sz - 1; ++i) { + if (src & 1) + i0 = i; + src >>= 1; + } + } + + return i0; +} + +inline uint64_t +bitrev(uint64_t src, int sz) +{ + uint64_t r = 0; + + for (int i = 0; i < sz; ++i) { + r <<= 1; + if (src & 1) + r |= 1; + src >>= 1; + } + + return r; +} + +inline uint64_t +mul_hi(uint32_t a, uint32_t b) +{ + return ((uint64_t)a * (uint64_t)b) >> 32; +} + +inline uint64_t +mul_hi(int32_t a, int32_t b) +{ + return ((int64_t)a * (int64_t)b) >> 32; +} + +inline uint64_t +mul_hi(uint64_t a, uint64_t b) +{ + return ((uint64_t)a * (uint64_t)b) >> 32; +} + +inline uint64_t +mul_hi(int64_t a, int64_t b) +{ + return ((int64_t)a * (int64_t)b) >> 32; +} + +inline uint64_t +mul_hi(double a, double b) +{ + return 0; +} + +class Label +{ + public: + std::string name; + int value; + + Label() : value(-1) + { + } + + bool defined() { return value != -1; } + + void + checkName(std::string &_name) + { + if (name.empty()) { + name = _name; + } else { + assert(name == _name); + } + } + + void + define(std::string &_name, int _value) + { + assert(!defined()); + assert(_value != -1); + value = _value; + checkName(_name); + } + + int + get() + { + assert(defined()); + return value; + } +}; + +class LabelMap +{ + std::map<std::string, Label> map; + + public: + LabelMap() { } + + void addLabel(const Brig::BrigDirectiveLabel *lbl, int inst_index, + const BrigObject *obj); + + Label *refLabel(const Brig::BrigDirectiveLabel *lbl, + const BrigObject *obj); +}; + +const int NumSegments = Brig::BRIG_SEGMENT_AMD_GCN; + +extern const char *segmentNames[]; + +class StorageElement +{ + public: + std::string name; + uint64_t offset; + + uint64_t size; + const Brig::BrigDirectiveVariable *brigSymbol; + StorageElement(const char *_name, uint64_t _offset, int _size, + const Brig::BrigDirectiveVariable *sym) + : name(_name), offset(_offset), 
size(_size), brigSymbol(sym) + { + } +}; + +class StorageSpace +{ + typedef std::map<const Brig::BrigDirectiveVariable*, StorageElement*> + DirVarToSE_map; + + std::list<StorageElement*> elements; + AddrRangeMap<StorageElement*> elements_by_addr; + DirVarToSE_map elements_by_brigptr; + + uint64_t nextOffset; + Brig::BrigSegment segment; + + public: + StorageSpace(Brig::BrigSegment _class) + : nextOffset(0), segment(_class) + { + } + + StorageElement *addSymbol(const Brig::BrigDirectiveVariable *sym, + const BrigObject *obj); + + StorageElement* findSymbol(std::string name); + StorageElement* findSymbol(uint64_t addr); + StorageElement* findSymbol(const Brig::BrigDirectiveVariable *brigptr); + + int getSize() { return nextOffset; } + void resetOffset() { nextOffset = 0; } +}; + +class StorageMap +{ + StorageMap *outerScopeMap; + StorageSpace *space[NumSegments]; + + public: + StorageMap(StorageMap *outerScope = nullptr); + + StorageElement *addSymbol(const Brig::BrigDirectiveVariable *sym, + const BrigObject *obj); + + StorageElement* findSymbol(Brig::BrigSegment segment, std::string name); + StorageElement* findSymbol(Brig::BrigSegment segment, uint64_t addr); + + StorageElement* findSymbol(Brig::BrigSegment segment, + const Brig::BrigDirectiveVariable *brigptr); + + // overloaded version to avoid casting + StorageElement* + findSymbol(Brig::BrigSegment8_t segment, std::string name) + { + return findSymbol((Brig::BrigSegment)segment, name); + } + + int getSize(Brig::BrigSegment segment); + void resetOffset(Brig::BrigSegment segment); +}; + +typedef enum +{ + BT_DEFAULT, + BT_B8, + BT_U8, + BT_U16, + BT_U32, + BT_U64, + BT_S8, + BT_S16, + BT_S32, + BT_S64, + BT_F16, + BT_F32, + BT_F64, + BT_NULL +} base_type_e; + +/* @class HsailCode + * the HsailCode class is used to store information + * about HSA kernels stored in the BRIG format. it holds + * all information about a kernel, function, or variable + * symbol and provides methods for accessing that + * information. + */ + +class HsailCode final : public HsaCode +{ + public: + TheGpuISA::Decoder decoder; + + StorageMap *storageMap; + LabelMap labelMap; + uint32_t kernarg_start; + uint32_t kernarg_end; + int32_t private_size; + + int32_t readonly_size; + + // We track the maximum register index used for each register + // class when we load the code so we can size the register files + // appropriately (i.e., one more than the max index). 
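+    // For example, if the highest s-register index seen while loading the code is 7, the s-register file is sized to 8 entries (max_sreg + 1).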
+ uint32_t max_creg; // maximum c-register index + uint32_t max_sreg; // maximum s-register index + uint32_t max_dreg; // maximum d-register index + + HsailCode(const std::string &name_str, + const Brig::BrigDirectiveExecutable *code_dir, + const BrigObject *obj, + StorageMap *objStorageMap); + + // this version is used to create a placeholder when + // we encounter a kernel-related directive before the + // kernel itself + HsailCode(const std::string &name_str); + + void init(const Brig::BrigDirectiveExecutable *code_dir, + const BrigObject *obj, StorageMap *objStorageMap); + + void + generateHsaKernelInfo(HsaKernelInfo *hsaKernelInfo) const + { + hsaKernelInfo->sRegCount = max_sreg + 1; + hsaKernelInfo->dRegCount = max_dreg + 1; + hsaKernelInfo->cRegCount = max_creg + 1; + + hsaKernelInfo->static_lds_size = getSize(Brig::BRIG_SEGMENT_GROUP); + + hsaKernelInfo->private_mem_size = + roundUp(getSize(Brig::BRIG_SEGMENT_PRIVATE), 8); + + hsaKernelInfo->spill_mem_size = + roundUp(getSize(Brig::BRIG_SEGMENT_SPILL), 8); + } + + int + getSize(MemorySegment segment) const + { + Brig::BrigSegment brigSeg; + + switch (segment) { + case MemorySegment::NONE: + brigSeg = Brig::BRIG_SEGMENT_NONE; + break; + case MemorySegment::FLAT: + brigSeg = Brig::BRIG_SEGMENT_FLAT; + break; + case MemorySegment::GLOBAL: + brigSeg = Brig::BRIG_SEGMENT_GLOBAL; + break; + case MemorySegment::READONLY: + brigSeg = Brig::BRIG_SEGMENT_READONLY; + break; + case MemorySegment::KERNARG: + brigSeg = Brig::BRIG_SEGMENT_KERNARG; + break; + case MemorySegment::GROUP: + brigSeg = Brig::BRIG_SEGMENT_GROUP; + break; + case MemorySegment::PRIVATE: + brigSeg = Brig::BRIG_SEGMENT_PRIVATE; + break; + case MemorySegment::SPILL: + brigSeg = Brig::BRIG_SEGMENT_SPILL; + break; + case MemorySegment::ARG: + brigSeg = Brig::BRIG_SEGMENT_ARG; + break; + case MemorySegment::EXTSPACE0: + brigSeg = Brig::BRIG_SEGMENT_AMD_GCN; + break; + default: + fatal("Unknown BrigSegment type.\n"); + } + + return getSize(brigSeg); + } + + private: + int + getSize(Brig::BrigSegment segment) const + { + if (segment == Brig::BRIG_SEGMENT_PRIVATE) { + // with the code generated by new HSA compiler the assertion + // does not hold anymore.. + //assert(private_size != -1); + return private_size; + } else { + return storageMap->getSize(segment); + } + } + + public: + StorageElement* + findSymbol(Brig::BrigSegment segment, uint64_t addr) + { + return storageMap->findSymbol(segment, addr); + } + + void + setPrivateSize(int32_t _private_size) + { + private_size = _private_size; + } + + Label* + refLabel(const Brig::BrigDirectiveLabel *lbl, const BrigObject *obj) + { + return labelMap.refLabel(lbl, obj); + } +}; + +#endif // __HSAIL_CODE_HH__ diff --git a/src/gpu-compute/kernel_cfg.cc b/src/gpu-compute/kernel_cfg.cc new file mode 100644 index 000000000..7e0e10912 --- /dev/null +++ b/src/gpu-compute/kernel_cfg.cc @@ -0,0 +1,296 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#include "gpu-compute/kernel_cfg.hh" + +#include <algorithm> +#include <cassert> +#include <cstdio> +#include <cstring> +#include <iostream> +#include <iterator> +#include <map> +#include <string> + +#include "gpu-compute/gpu_static_inst.hh" + +void +ControlFlowInfo::assignImmediatePostDominators( + const std::vector<GPUStaticInst*>& instructions) +{ + ControlFlowInfo cfg(instructions); + cfg.findImmediatePostDominators(); +} + + +ControlFlowInfo::ControlFlowInfo(const std::vector<GPUStaticInst*>& insts) : + instructions(insts) +{ + createBasicBlocks(); + connectBasicBlocks(); +} + +BasicBlock* +ControlFlowInfo::basicBlock(int inst_num) const { + for (auto& block: basicBlocks) { + int first_block_id = block->firstInstruction->instNum(); + if (inst_num >= first_block_id && + inst_num < first_block_id + block->size) { + return block.get(); + } + } + return nullptr; +} + + +GPUStaticInst* +ControlFlowInfo::lastInstruction(const BasicBlock* block) const +{ + if (block->isExit()) { + return nullptr; + } + + return instructions.at(block->firstInstruction->instNum() + + block->size - 1); +} + +BasicBlock* +ControlFlowInfo::postDominator(const BasicBlock* block) const +{ + if (block->isExit()) { + return nullptr; + } + return basicBlock(lastInstruction(block)->ipdInstNum()); +} + +void +ControlFlowInfo::createBasicBlocks() +{ + assert(!instructions.empty()); + std::set<int> leaders; + // first instruction is a leader + leaders.insert(0); + for (int i = 1; i < instructions.size(); i++) { + GPUStaticInst* instruction = instructions[i]; + if (instruction->o_type == Enums::OT_BRANCH) { + const int target_pc = instruction->getTargetPc(); + leaders.insert(target_pc); + leaders.insert(i + 1); + } + } + + size_t block_size = 0; + for (int i = 0; i < instructions.size(); i++) { + if (leaders.find(i) != leaders.end()) { + uint32_t id = basicBlocks.size(); + if (id > 0) { + basicBlocks.back()->size = block_size; + } + block_size = 0; + basicBlocks.emplace_back(new BasicBlock(id, instructions[i])); + } + block_size++; + } + basicBlocks.back()->size = block_size; + // exit basic block + basicBlocks.emplace_back(new BasicBlock(basicBlocks.size(), nullptr)); +} + +void +ControlFlowInfo::connectBasicBlocks() +{ + BasicBlock* exit_bb = basicBlocks.back().get(); + for (auto& bb 
: basicBlocks) { + if (bb->isExit()) { + break; + } + GPUStaticInst* last = lastInstruction(bb.get()); + if (last->o_type == Enums::OT_RET) { + bb->successorIds.insert(exit_bb->id); + break; + } + if (last->o_type == Enums::OT_BRANCH) { + const uint32_t target_pc = last->getTargetPc(); + BasicBlock* target_bb = basicBlock(target_pc); + bb->successorIds.insert(target_bb->id); + } + + // Unconditional jump instructions have a unique successor + if (!last->unconditionalJumpInstruction()) { + BasicBlock* next_bb = basicBlock(last->instNum() + 1); + bb->successorIds.insert(next_bb->id); + } + } +} + + +// In-place set intersection +static void +intersect(std::set<uint32_t>& a, const std::set<uint32_t>& b) +{ + std::set<uint32_t>::iterator it = a.begin(); + while (it != a.end()) { + it = b.find(*it) != b.end() ? ++it : a.erase(it); + } +} + + +void +ControlFlowInfo::findPostDominators() +{ + // the only postdominator of the exit block is itself + basicBlocks.back()->postDominatorIds.insert(basicBlocks.back()->id); + //copy all basic blocks to all postdominator lists except for exit block + for (auto& block : basicBlocks) { + if (!block->isExit()) { + for (uint32_t i = 0; i < basicBlocks.size(); i++) { + block->postDominatorIds.insert(i); + } + } + } + + bool change = true; + while (change) { + change = false; + for (int h = basicBlocks.size() - 2; h >= 0; --h) { + size_t num_postdominators = + basicBlocks[h]->postDominatorIds.size(); + for (int s : basicBlocks[h]->successorIds) { + intersect(basicBlocks[h]->postDominatorIds, + basicBlocks[s]->postDominatorIds); + } + basicBlocks[h]->postDominatorIds.insert(h); + change |= (num_postdominators + != basicBlocks[h]->postDominatorIds.size()); + } + } +} + + +// In-place set difference +static void +setDifference(std::set<uint32_t>&a, + const std::set<uint32_t>& b, uint32_t exception) +{ + for (uint32_t b_elem : b) { + if (b_elem != exception) { + a.erase(b_elem); + } + } +} + +void +ControlFlowInfo::findImmediatePostDominators() +{ + assert(basicBlocks.size() > 1); // Entry and exit blocks must be present + + findPostDominators(); + + for (auto& basicBlock : basicBlocks) { + if (basicBlock->isExit()) { + continue; + } + std::set<uint32_t> candidates = basicBlock->postDominatorIds; + candidates.erase(basicBlock->id); + for (uint32_t postDominatorId : basicBlock->postDominatorIds) { + if (postDominatorId != basicBlock->id) { + setDifference(candidates, + basicBlocks[postDominatorId]->postDominatorIds, + postDominatorId); + } + } + assert(candidates.size() == 1); + GPUStaticInst* last_instruction = lastInstruction(basicBlock.get()); + BasicBlock* ipd_block = basicBlocks[*(candidates.begin())].get(); + if (!ipd_block->isExit()) { + GPUStaticInst* ipd_first_inst = ipd_block->firstInstruction; + last_instruction->ipdInstNum(ipd_first_inst->instNum()); + } else { + last_instruction->ipdInstNum(last_instruction->instNum() + 1); + } + } +} + +void +ControlFlowInfo::printPostDominators() const +{ + for (auto& block : basicBlocks) { + std::cout << "PD(" << block->id << ") = {"; + std::copy(block->postDominatorIds.begin(), + block->postDominatorIds.end(), + std::ostream_iterator<uint32_t>(std::cout, ", ")); + std::cout << "}" << std::endl; + } +} + +void +ControlFlowInfo::printImmediatePostDominators() const +{ + for (const auto& block : basicBlocks) { + if (block->isExit()) { + continue; + } + std::cout << "IPD(" << block->id << ") = "; + std::cout << postDominator(block.get())->id << ", "; + } + std::cout << std::endl; +} +void 
+ControlFlowInfo::printBasicBlocks() const +{ + for (GPUStaticInst* inst : instructions) { + int inst_num = inst->instNum(); + std::cout << inst_num << " [" << basicBlock(inst_num)->id + << "]: " << inst->disassemble(); + if (inst->o_type == Enums::OT_BRANCH) { + std::cout << ", PC = " << inst->getTargetPc(); + } + std::cout << std::endl; + } +} + +void +ControlFlowInfo::printBasicBlockDot() const +{ + printf("digraph {\n"); + for (const auto& basic_block : basicBlocks) { + printf("\t"); + for (uint32_t successorId : basic_block->successorIds) { + printf("%d -> %d; ", basic_block->id, successorId); + } + printf("\n"); + } + printf("}\n"); +} diff --git a/src/gpu-compute/kernel_cfg.hh b/src/gpu-compute/kernel_cfg.hh new file mode 100644 index 000000000..74ea861d8 --- /dev/null +++ b/src/gpu-compute/kernel_cfg.hh @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#ifndef __KERNEL_CFG_HH__ +#define __KERNEL_CFG_HH__ + +#include <cstddef> +#include <cstdint> +#include <memory> +#include <set> +#include <vector> + + +class GPUStaticInst; +class HsailCode; + +struct BasicBlock +{ + BasicBlock(uint32_t num, GPUStaticInst* begin) : + id(num), size(0), firstInstruction(begin) + { + } + + bool + isEntry() const + { + return !id; + } + + bool + isExit() const + { + return !size; + } + + /** + * Unique identifier for the block within a given kernel. + */ + const uint32_t id; + + /** + * Number of instructions contained in the block + */ + size_t size; + + /** + * Pointer to first instruction of the block. + */ + GPUStaticInst* firstInstruction; + + /** + * Identifiers of the blocks that follow (are reachable from) this block. + */ + std::set<uint32_t> successorIds; + + /** + * Identifiers of the blocks that will be visited from this block. 
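+     * (More precisely, the post-dominators of this block: every path from this block to the exit block passes through each of them.)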
+ */ + std::set<uint32_t> postDominatorIds; +}; + +class ControlFlowInfo +{ +public: + + /** + * Compute immediate post-dominator instruction for kernel instructions. + */ + static void assignImmediatePostDominators( + const std::vector<GPUStaticInst*>& instructions); + +private: + ControlFlowInfo(const std::vector<GPUStaticInst*>& instructions); + + GPUStaticInst* lastInstruction(const BasicBlock* block) const; + + BasicBlock* basicBlock(int inst_num) const; + + BasicBlock* postDominator(const BasicBlock* block) const; + + void createBasicBlocks(); + + void connectBasicBlocks(); + + void findPostDominators(); + + void findImmediatePostDominators(); + + void printBasicBlocks() const; + + void printBasicBlockDot() const; + + void printPostDominators() const; + + void printImmediatePostDominators() const; + + std::vector<std::unique_ptr<BasicBlock>> basicBlocks; + std::vector<GPUStaticInst*> instructions; +}; + +#endif // __KERNEL_CFG_HH__ diff --git a/src/gpu-compute/lds_state.cc b/src/gpu-compute/lds_state.cc new file mode 100644 index 000000000..91ee8009a --- /dev/null +++ b/src/gpu-compute/lds_state.cc @@ -0,0 +1,341 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: John Kalamatianos, Joe Gross + */ + +#include "gpu-compute/lds_state.hh" + +#include <array> +#include <cstdio> +#include <cstdlib> + +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/shader.hh" + +/** + * the default constructor that works with SWIG + */ +LdsState::LdsState(const Params *params) : + MemObject(params), + tickEvent(this), + cuPort(name() + ".port", this), + maximumSize(params->size), + range(params->range), + bankConflictPenalty(params->bankConflictPenalty), + banks(params->banks) +{ + fatal_if(params->banks <= 0, + "Number of LDS banks should be a positive number"); + fatal_if((params->banks & (params->banks - 1)) != 0, + "Number of LDS banks should be a power of 2"); + fatal_if(params->size <= 0, + "cannot allocate an LDS with a size less than 1"); + fatal_if(params->size % 2, + "the LDS size should be an even number"); +} + +/** + * Needed by the SWIG compiler + */ +LdsState * +LdsStateParams::create() +{ + return new LdsState(this); +} + +/** + * set the parent and name based on the parent + */ +void +LdsState::setParent(ComputeUnit *x_parent) +{ + // check that this gets assigned to the same thing each time + fatal_if(!x_parent, "x_parent should not be nullptr"); + fatal_if(x_parent == parent, + "should not be setting the parent twice"); + + parent = x_parent; + _name = x_parent->name() + ".LdsState"; +} + +/** + * derive the GPU dynamic instruction from the packet and then count the bank conflicts + */ +unsigned +LdsState::countBankConflicts(PacketPtr packet, unsigned *bankAccesses) +{ + Packet::SenderState *baseSenderState = packet->senderState; + while (baseSenderState->predecessor) { + baseSenderState = baseSenderState->predecessor; + } + const ComputeUnit::LDSPort::SenderState *senderState = + dynamic_cast<ComputeUnit::LDSPort::SenderState *>(baseSenderState); + + fatal_if(!senderState, + "did not get the right sort of sender state"); + + GPUDynInstPtr gpuDynInst = senderState->getMemInst(); + + return countBankConflicts(gpuDynInst, bankAccesses); +} + +// Count the total number of bank conflicts for the local memory packet +unsigned +LdsState::countBankConflicts(GPUDynInstPtr gpuDynInst, + unsigned *numBankAccesses) +{ + int bank_conflicts = 0; + std::vector<int> bank; + // the number of LDS banks being touched by the memory instruction + int numBanks = std::min(parent->wfSize(), banks); + // if the wavefront size is larger than the number of LDS banks, we + // need to iterate over all work items to calculate the total + // number of bank conflicts + int groups = (parent->wfSize() > numBanks) ? 
+ (parent->wfSize() / numBanks) : 1; + for (int i = 0; i < groups; i++) { + // Address Array holding all the work item addresses of an instruction + std::vector<Addr> addr_array; + addr_array.resize(numBanks, 0); + bank.clear(); + bank.resize(banks, 0); + int max_bank = 0; + + // populate the address array for all active work items + for (int j = 0; j < numBanks; j++) { + if (gpuDynInst->exec_mask[(i*numBanks)+j]) { + addr_array[j] = gpuDynInst->addr[(i*numBanks)+j]; + } else { + addr_array[j] = std::numeric_limits<Addr>::max(); + } + } + + if (gpuDynInst->m_op == Enums::MO_LD || + gpuDynInst->m_op == Enums::MO_ST) { + // mask identical addresses + for (int j = 0; j < numBanks; ++j) { + for (int j0 = 0; j0 < j; j0++) { + if (addr_array[j] != std::numeric_limits<Addr>::max() + && addr_array[j] == addr_array[j0]) { + addr_array[j] = std::numeric_limits<Addr>::max(); + } + } + } + } + // calculate bank conflicts + for (int j = 0; j < numBanks; ++j) { + if (addr_array[j] != std::numeric_limits<Addr>::max()) { + int bankId = addr_array[j] % banks; + bank[bankId]++; + max_bank = std::max(max_bank, bank[bankId]); + // Count the number of LDS banks accessed. + // Since we have masked identical addresses all remaining + // accesses will need to be serialized if they access + // the same bank (bank conflict). + (*numBankAccesses)++; + } + } + bank_conflicts += max_bank; + } + panic_if(bank_conflicts > parent->wfSize(), + "Max bank conflicts should match num of work items per instr"); + return bank_conflicts; +} + +/** + * receive the packet from the CU + */ +bool +LdsState::CuSidePort::recvTimingReq(PacketPtr packet) +{ + return ownerLds->processPacket(packet); +} + +GPUDynInstPtr +LdsState::getDynInstr(PacketPtr packet) +{ + ComputeUnit::LDSPort::SenderState *ss = + dynamic_cast<ComputeUnit::LDSPort::SenderState *>( + packet->senderState); + return ss->getMemInst(); +} + +/** + * process an incoming packet, add it to the return queue + */ +bool +LdsState::processPacket(PacketPtr packet) +{ + unsigned bankAccesses = 0; + // the number of conflicts this packet will have when accessing the LDS + unsigned bankConflicts = countBankConflicts(packet, &bankAccesses); + // count the total number of physical LDS bank accessed + parent->ldsBankAccesses += bankAccesses; + // count the LDS bank conflicts. A number set to 1 indicates one + // access per bank maximum so there are no bank conflicts + parent->ldsBankConflictDist.sample(bankConflicts-1); + + GPUDynInstPtr dynInst = getDynInstr(packet); + // account for the LDS bank conflict overhead + int busLength = (dynInst->m_op == Enums::MO_LD) ? parent->loadBusLength() : + (dynInst->m_op == Enums::MO_ST) ? parent->storeBusLength() : + parent->loadBusLength(); + // delay for accessing the LDS + Tick processingTime = + parent->shader->ticks(bankConflicts * bankConflictPenalty) + + parent->shader->ticks(busLength); + // choose (delay + last packet in queue) or (now + delay) as the time to + // return this + Tick doneAt = earliestReturnTime() + processingTime; + // then store it for processing + return returnQueuePush(std::make_pair(doneAt, packet)); +} + +/** + * add this to the queue of packets to be returned + */ +bool +LdsState::returnQueuePush(std::pair<Tick, PacketPtr> thePair) +{ + // TODO add time limits (e.g. 
one packet per cycle) and queue size limits + // and implement flow control + returnQueue.push(thePair); + + // if there is no set wakeup time, look through the queue + if (!tickEvent.scheduled()) { + process(); + } + + return true; +} + +/** + * receive a packet in functional mode + */ +void +LdsState::CuSidePort::recvFunctional(PacketPtr pkt) +{ + fatal("not implemented"); +} + +/** + * receive a retry for a response + */ +void +LdsState::CuSidePort::recvRespRetry() +{ + // TODO verify that this is the right way to do this + assert(ownerLds->isRetryResp()); + ownerLds->setRetryResp(false); + ownerLds->process(); +} + +/** + * receive a retry + */ +void +LdsState::CuSidePort::recvRetry() +{ + fatal("not implemented"); +} + +/** + * look for packets to return at this time + */ +bool +LdsState::process() +{ + Tick now = clockEdge(); + + // send back completed packets + while (!returnQueue.empty() && returnQueue.front().first <= now) { + PacketPtr packet = returnQueue.front().second; + + ComputeUnit::LDSPort::SenderState *ss = + dynamic_cast<ComputeUnit::LDSPort::SenderState *>( + packet->senderState); + + GPUDynInstPtr gpuDynInst = ss->getMemInst(); + + gpuDynInst->initiateAcc(gpuDynInst); + + packet->makeTimingResponse(); + + returnQueue.pop(); + + bool success = cuPort.sendTimingResp(packet); + + if (!success) { + retryResp = true; + panic("have not handled timing responses being NACK'd when sent " + "back"); + } + } + + // determine the next wakeup time + if (!returnQueue.empty()) { + + Tick next = returnQueue.front().first; + + if (tickEvent.scheduled()) { + + if (next < tickEvent.when()) { + + tickEvent.deschedule(); + tickEvent.schedule(next); + } + } else { + tickEvent.schedule(next); + } + } + + return true; +} + +/** + * wake up at this time and perform specified actions + */ +void +LdsState::TickEvent::process() +{ + ldsState->process(); +} + +/** + * register statistics for the LDS (currently none) + */ +void +LdsState::regStats() +{ +} diff --git a/src/gpu-compute/lds_state.hh b/src/gpu-compute/lds_state.hh new file mode 100644 index 000000000..89f08a1d3 --- /dev/null +++ b/src/gpu-compute/lds_state.hh @@ -0,0 +1,512 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos, Joe Gross + */ + +#ifndef __LDS_STATE_HH__ +#define __LDS_STATE_HH__ + +#include <array> +#include <queue> +#include <string> +#include <unordered_map> +#include <utility> +#include <vector> + +#include "enums/MemOpType.hh" +#include "enums/MemType.hh" +#include "gpu-compute/misc.hh" +#include "mem/mem_object.hh" +#include "mem/port.hh" +#include "params/LdsState.hh" + +class ComputeUnit; + +/** + * this represents a slice of the overall LDS, intended to be associated with an + * individual workgroup + */ +class LdsChunk +{ + public: + LdsChunk(const uint32_t x_size): + chunk(x_size) + { + } + + LdsChunk() {} + + /** + * a read operation + */ + template<class T> + T + read(const uint32_t index) + { + fatal_if(!chunk.size(), "cannot read from an LDS chunk of size 0"); + fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS chunk"); + T *p0 = (T *) (&(chunk.at(index))); + return *p0; + } + + /** + * a write operation + */ + template<class T> + void + write(const uint32_t index, const T value) + { + fatal_if(!chunk.size(), "cannot write to an LDS chunk of size 0"); + fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS chunk"); + T *p0 = (T *) (&(chunk.at(index))); + *p0 = value; + } + + /** + * get the size of this chunk + */ + std::vector<uint8_t>::size_type + size() const + { + return chunk.size(); + } + + protected: + // the actual data store for this slice of the LDS + std::vector<uint8_t> chunk; +}; + +// Local Data Share (LDS) State per Wavefront (contents of the LDS region +// allocated to the WorkGroup of this Wavefront) +class LdsState: public MemObject +{ + protected: + + /** + * an event to allow event-driven execution + */ + class TickEvent: public Event + { + protected: + + LdsState *ldsState = nullptr; + + Tick nextTick = 0; + + public: + + TickEvent(LdsState *_ldsState) : + ldsState(_ldsState) + { + } + + virtual void + process(); + + void + schedule(Tick when) + { + mainEventQueue[0]->schedule(this, when); + } + + void + deschedule() + { + mainEventQueue[0]->deschedule(this); + } + }; + + /** + * CuSidePort is the LDS Port closer to the CU side + */ + class CuSidePort: public SlavePort + { + public: + CuSidePort(const std::string &_name, LdsState *_ownerLds) : + SlavePort(_name, _ownerLds), ownerLds(_ownerLds) + { + } + + protected: + LdsState *ownerLds; + + virtual bool + recvTimingReq(PacketPtr pkt); + + virtual Tick + recvAtomic(PacketPtr pkt) + { + return 0; + } + + virtual void + recvFunctional(PacketPtr pkt); + + virtual void + recvRangeChange() + { + } + + virtual void + recvRetry(); + + virtual void + recvRespRetry(); + + virtual AddrRangeList + getAddrRanges() const + { + AddrRangeList ranges; + ranges.push_back(ownerLds->getAddrRange()); + return ranges; + } + + template<typename T> + void + loadData(PacketPtr packet); + + template<typename T> + void + storeData(PacketPtr packet); + + template<typename T> + void + atomicOperation(PacketPtr packet); + }; + + protected: + + // the 
LDS reference counter. + // The key is the dispatch ID and workgroup ID + // The value is the number of wavefronts that reference this LDS chunk: + // as wavefronts are launched the counter goes up for that workgroup, and + // when they return it decreases; once it reaches 0 the chunk of LDS is + // returned to the available pool. However, it is deallocated on the 1->0 + // transition, not whenever the counter is 0, as it always starts at 0 when + // the workgroup asks for space + std::unordered_map<uint32_t, + std::unordered_map<uint32_t, int32_t>> refCounter; + + // the map that allows workgroups to access their own chunk of the LDS + std::unordered_map<uint32_t, + std::unordered_map<uint32_t, LdsChunk>> chunkMap; + + // an event to allow the LDS to wake up at a specified time + TickEvent tickEvent; + + // the queue of packets that are going back to the CU after a + // read/write/atomic op + // TODO need to make this have a maximum size to create flow control + std::queue<std::pair<Tick, PacketPtr>> returnQueue; + + // whether or not there are pending responses + bool retryResp = false; + + bool + process(); + + GPUDynInstPtr + getDynInstr(PacketPtr packet); + + bool + processPacket(PacketPtr packet); + + unsigned + countBankConflicts(PacketPtr packet, unsigned *bankAccesses); + + unsigned + countBankConflicts(GPUDynInstPtr gpuDynInst, + unsigned *numBankAccesses); + + public: + typedef LdsStateParams Params; + + LdsState(const Params *params); + + // prevent copy construction + LdsState(const LdsState&) = delete; + + ~LdsState() + { + parent = nullptr; + } + + const Params * + params() const + { + return dynamic_cast<const Params *>(_params); + } + + bool + isRetryResp() const + { + return retryResp; + } + + void + setRetryResp(const bool value) + { + retryResp = value; + } + + // prevent assignment + LdsState & + operator=(const LdsState &) = delete; + + /** + * use the dynamic wave id to create or just increase the reference count + */ + int + increaseRefCounter(const uint32_t dispatchId, const uint32_t wgId) + { + int refCount = getRefCounter(dispatchId, wgId); + fatal_if(refCount < 0, + "reference count should not be below zero"); + return ++refCounter[dispatchId][wgId]; + } + + /** + * decrease the reference count after making sure it is in the list + * give back this chunk if the ref counter has reached 0 + */ + int + decreaseRefCounter(const uint32_t dispatchId, const uint32_t wgId) + { + int refCount = getRefCounter(dispatchId, wgId); + + fatal_if(refCount <= 0, + "reference count should not be below zero or at zero to " + "decrement"); + + refCounter[dispatchId][wgId]--; + + if (refCounter[dispatchId][wgId] == 0) { + releaseSpace(dispatchId, wgId); + return 0; + } else { + return refCounter[dispatchId][wgId]; + } + } + + /** + * return the current reference count for this workgroup id + */ + int + getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const + { + auto dispatchIter = chunkMap.find(dispatchId); + fatal_if(dispatchIter == chunkMap.end(), + "could not locate this dispatch id [%d]", dispatchId); + + auto workgroup = dispatchIter->second.find(wgId); + fatal_if(workgroup == dispatchIter->second.end(), + "could not find this workgroup id within this dispatch id" + " did[%d] wgid[%d]", dispatchId, wgId); + + auto refCountIter = refCounter.find(dispatchId); + if (refCountIter == refCounter.end()) { + fatal("could not locate this dispatch id [%d]", dispatchId); + } else { + auto workgroup = refCountIter->second.find(wgId); + if (workgroup == 
refCountIter->second.end()) { + fatal("could not find this workgroup id within this dispatch id" + " did[%d] wgid[%d]", dispatchId, wgId); + } else { + return refCounter.at(dispatchId).at(wgId); + } + } + + fatal("should not reach this point"); + return 0; + } + + /** + * assign a parent and request this amount of space be set aside + * for this wgid + */ + LdsChunk * + reserveSpace(const uint32_t dispatchId, const uint32_t wgId, + const uint32_t size) + { + if (chunkMap.find(dispatchId) != chunkMap.end()) { + fatal_if( + chunkMap[dispatchId].find(wgId) != chunkMap[dispatchId].end(), + "duplicate workgroup ID asking for space in the LDS " + "did[%d] wgid[%d]", dispatchId, wgId); + } + + fatal_if(bytesAllocated + size > maximumSize, + "request would ask for more space than is available"); + + bytesAllocated += size; + + chunkMap[dispatchId].emplace(wgId, LdsChunk(size)); + // make an entry for this workgroup + refCounter[dispatchId][wgId] = 0; + + return &chunkMap[dispatchId][wgId]; + } + + bool + returnQueuePush(std::pair<Tick, PacketPtr> thePair); + + Tick + earliestReturnTime() const + { + // TODO set to max(lastCommand+1, curTick()) + return returnQueue.empty() ? curTick() : returnQueue.back().first; + } + + void + setParent(ComputeUnit *x_parent); + + void + regStats(); + + // accessors + ComputeUnit * + getParent() const + { + return parent; + } + + std::string + getName() + { + return _name; + } + + int + getBanks() const + { + return banks; + } + + ComputeUnit * + getComputeUnit() const + { + return parent; + } + + int + getBankConflictPenalty() const + { + return bankConflictPenalty; + } + + /** + * get the allocated size for this workgroup + */ + std::size_t + ldsSize(const uint32_t x_wgId) + { + return chunkMap[x_wgId].size(); + } + + AddrRange + getAddrRange() const + { + return range; + } + + virtual BaseSlavePort & + getSlavePort(const std::string& if_name, PortID idx) + { + if (if_name == "cuPort") { + // TODO need to set name dynamically at this point? + return cuPort; + } else { + fatal("cannot resolve the port name " + if_name); + } + } + + /** + * can this much space be reserved for a workgroup? 
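+     * (i.e., does bytesAllocated + x_size still fit within maximumSize?)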
+ */ + bool + canReserve(uint32_t x_size) const + { + return bytesAllocated + x_size <= maximumSize; + } + + private: + /** + * give back the space + */ + bool + releaseSpace(const uint32_t x_dispatchId, const uint32_t x_wgId) + { + auto dispatchIter = chunkMap.find(x_dispatchId); + + if (dispatchIter == chunkMap.end()) { + fatal("dispatch id not found [%d]", x_dispatchId); + } else { + auto workgroupIter = dispatchIter->second.find(x_wgId); + if (workgroupIter == dispatchIter->second.end()) { + fatal("workgroup id [%d] not found in dispatch id [%d]", + x_wgId, x_dispatchId); + } + } + + fatal_if(bytesAllocated < chunkMap[x_dispatchId][x_wgId].size(), + "releasing more space than was allocated"); + + bytesAllocated -= chunkMap[x_dispatchId][x_wgId].size(); + chunkMap[x_dispatchId].erase(chunkMap[x_dispatchId].find(x_wgId)); + return true; + } + + // the port that connects this LDS to its owner CU + CuSidePort cuPort; + + ComputeUnit* parent = nullptr; + + std::string _name; + + // the number of bytes currently reserved by all workgroups + int bytesAllocated = 0; + + // the size of the LDS, the most bytes available + int maximumSize; + + // Address range of this memory + AddrRange range; + + // the penalty, in cycles, for each LDS bank conflict + int bankConflictPenalty = 0; + + // the number of banks in the LDS underlying data store + int banks = 0; +}; + +#endif // __LDS_STATE_HH__ diff --git a/src/gpu-compute/local_memory_pipeline.cc b/src/gpu-compute/local_memory_pipeline.cc new file mode 100644 index 000000000..7f919c5f4 --- /dev/null +++ b/src/gpu-compute/local_memory_pipeline.cc @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Sooraj Puthoor + */ + +#include "gpu-compute/local_memory_pipeline.hh" + +#include "debug/GPUPort.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/vector_register_file.hh" +#include "gpu-compute/wavefront.hh" + +LocalMemPipeline::LocalMemPipeline(const ComputeUnitParams* p) : + computeUnit(nullptr), lmQueueSize(p->local_mem_queue_size) +{ +} + +void +LocalMemPipeline::init(ComputeUnit *cu) +{ + computeUnit = cu; + _name = computeUnit->name() + ".LocalMemPipeline"; +} + +void +LocalMemPipeline::exec() +{ + // apply any returned shared (LDS) memory operations + GPUDynInstPtr m = !lmReturnedRequests.empty() ? + lmReturnedRequests.front() : nullptr; + + bool accessVrf = true; + if ((m) && (m->m_op==Enums::MO_LD || MO_A(m->m_op))) { + Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId]; + + accessVrf = + w->computeUnit->vrf[m->simdId]-> + vrfOperandAccessReady(m->seqNum(), w, m, + VrfAccessType::WRITE); + } + + if (!lmReturnedRequests.empty() && m->latency.rdy() && accessVrf && + computeUnit->locMemToVrfBus.rdy() && (computeUnit->shader->coissue_return + || computeUnit->wfWait.at(m->pipeId).rdy())) { + if (m->v_type == VT_32 && m->m_type == Enums::M_U8) + doSmReturn<uint32_t, uint8_t>(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_U16) + doSmReturn<uint32_t, uint16_t>(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_U32) + doSmReturn<uint32_t, uint32_t>(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_S8) + doSmReturn<int32_t, int8_t>(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_S16) + doSmReturn<int32_t, int16_t>(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_S32) + doSmReturn<int32_t, int32_t>(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_F16) + doSmReturn<float, Float16>(m); + else if (m->v_type == VT_32 && m->m_type == Enums::M_F32) + doSmReturn<float, float>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_U8) + doSmReturn<uint64_t, uint8_t>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_U16) + doSmReturn<uint64_t, uint16_t>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_U32) + doSmReturn<uint64_t, uint32_t>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_U64) + doSmReturn<uint64_t, uint64_t>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_S8) + doSmReturn<int64_t, int8_t>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_S16) + doSmReturn<int64_t, int16_t>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_S32) + doSmReturn<int64_t, int32_t>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_S64) + doSmReturn<int64_t, int64_t>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_F16) + doSmReturn<double, Float16>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_F32) + doSmReturn<double, float>(m); + else if (m->v_type == VT_64 && m->m_type == Enums::M_F64) + doSmReturn<double, double>(m); + } + + // If pipeline has executed a local memory instruction + // execute local memory packet and issue the packets + // to LDS + if (!lmIssuedRequests.empty() && lmReturnedRequests.size() < lmQueueSize) { + + GPUDynInstPtr m = lmIssuedRequests.front(); + + bool returnVal = computeUnit->sendToLds(m); + if (!returnVal) { + DPRINTF(GPUPort, "packet was nack'd and put in retry queue"); + } + lmIssuedRequests.pop(); + } +} + +template<typename c0, typename c1> +void +LocalMemPipeline::doSmReturn(GPUDynInstPtr m) +{ + 
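+    // Retire the oldest returned LDS response; for loads and atomics the returned data is written back to the wavefront's physical VGPRs below.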
lmReturnedRequests.pop(); + Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId]; + + // Return data to registers + if (m->m_op == Enums::MO_LD || MO_A(m->m_op)) { + std::vector<uint32_t> regVec; + for (int k = 0; k < m->n_reg; ++k) { + int dst = m->dst_reg+k; + + if (m->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST) + dst = m->dst_reg_vec[k]; + // virtual->physical VGPR mapping + int physVgpr = w->remap(dst,sizeof(c0),1); + // save the physical VGPR index + regVec.push_back(physVgpr); + c1 *p1 = &((c1*)m->d_data)[k * VSZ]; + + for (int i = 0; i < VSZ; ++i) { + if (m->exec_mask[i]) { + // write the value into the physical VGPR. This is a purely + // functional operation. No timing is modeled. + w->computeUnit->vrf[w->simdId]->write<c0>(physVgpr, + *p1, i); + } + ++p1; + } + } + + // Schedule the write operation of the load data on the VRF. This simply + // models the timing aspect of the VRF write operation. It does not + // modify the physical VGPR. + loadVrfBankConflictCycles += + w->computeUnit->vrf[w->simdId]->exec(m->seqNum(), w, + regVec, sizeof(c0), m->time); + } + + // Decrement outstanding request count + computeUnit->shader->ScheduleAdd(&w->outstanding_reqs, m->time, -1); + + if (m->m_op == Enums::MO_ST || MO_A(m->m_op) || MO_ANR(m->m_op) + || MO_H(m->m_op)) { + computeUnit->shader->ScheduleAdd(&w->outstanding_reqs_wr_lm, + m->time, -1); + } + + if (m->m_op == Enums::MO_LD || MO_A(m->m_op) || MO_ANR(m->m_op)) { + computeUnit->shader->ScheduleAdd(&w->outstanding_reqs_rd_lm, + m->time, -1); + } + + // Mark write bus busy for appropriate amount of time + computeUnit->locMemToVrfBus.set(m->time); + if (computeUnit->shader->coissue_return == 0) + w->computeUnit->wfWait.at(m->pipeId).set(m->time); +} + +void +LocalMemPipeline::regStats() +{ + loadVrfBankConflictCycles + .name(name() + ".load_vrf_bank_conflict_cycles") + .desc("total number of cycles LDS data are delayed before updating " + "the VRF") + ; +} diff --git a/src/gpu-compute/local_memory_pipeline.hh b/src/gpu-compute/local_memory_pipeline.hh new file mode 100644 index 000000000..a63d867d0 --- /dev/null +++ b/src/gpu-compute/local_memory_pipeline.hh @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor + */ + +#ifndef __LOCAL_MEMORY_PIPELINE_HH__ +#define __LOCAL_MEMORY_PIPELINE_HH__ + +#include <queue> +#include <string> + +#include "gpu-compute/misc.hh" +#include "params/ComputeUnit.hh" +#include "sim/stats.hh" + +/* + * @file local_memory_pipeline.hh + * + * The local memory pipeline issues newly created local memory packets + * from pipeline to the LDS. This stage also retires previously issued + * loads and stores that have returned from the LDS. + */ + +class ComputeUnit; +class Wavefront; + +class LocalMemPipeline +{ + public: + LocalMemPipeline(const ComputeUnitParams *params); + void init(ComputeUnit *cu); + void exec(); + + template<typename c0, typename c1> void doSmReturn(GPUDynInstPtr m); + + std::queue<GPUDynInstPtr> &getLMReqFIFO() { return lmIssuedRequests; } + std::queue<GPUDynInstPtr> &getLMRespFIFO() { return lmReturnedRequests; } + + bool + isLMRespFIFOWrRdy() const + { + return lmReturnedRequests.size() < lmQueueSize; + } + + bool + isLMReqFIFOWrRdy(uint32_t pendReqs=0) const + { + return (lmIssuedRequests.size() + pendReqs) < lmQueueSize; + } + + const std::string& name() const { return _name; } + void regStats(); + + private: + ComputeUnit *computeUnit; + std::string _name; + int lmQueueSize; + Stats::Scalar loadVrfBankConflictCycles; + // Local Memory Request Fifo: all shared memory requests + // are issued to this FIFO from the memory pipelines + std::queue<GPUDynInstPtr> lmIssuedRequests; + + // Local Memory Response Fifo: all responses of shared memory + // requests are sent to this FIFO from LDS + std::queue<GPUDynInstPtr> lmReturnedRequests; +}; + +#endif // __LOCAL_MEMORY_PIPELINE_HH__ diff --git a/src/gpu-compute/misc.hh b/src/gpu-compute/misc.hh new file mode 100644 index 000000000..4f8032832 --- /dev/null +++ b/src/gpu-compute/misc.hh @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#ifndef __MISC_HH__ +#define __MISC_HH__ + +#include <bitset> +#include <memory> + +#include "base/misc.hh" + +class GPUDynInst; + +// wavefront size of the machine +static const int VSZ = 64; + +/* + This check is necessary because std::bitset only provides conversion to + unsigned long or unsigned long long via to_ulong() or to_ullong(). there are + a few places in the code where to_ullong() is used, however if VSZ is larger + than a value the host can support then bitset will throw a runtime exception. + + we should remove all use of to_long() or to_ullong() so we can have VSZ + greater than 64b, however until that is done this assert is required. + */ +static_assert(VSZ <= sizeof(unsigned long long) * 8, + "VSZ is larger than the host can support"); + +typedef std::bitset<VSZ> VectorMask; +typedef std::shared_ptr<GPUDynInst> GPUDynInstPtr; + +class WaitClass +{ + public: + WaitClass() : nxtAvail(0), lookAheadAvail(0), tcnt(0) { } + void init(uint64_t *_tcnt, uint32_t _numStages=0) + { + tcnt = _tcnt; + numStages = _numStages; + } + + void set(uint32_t i) + { + fatal_if(nxtAvail > *tcnt, + "Can't allocate resource because it is busy!!!"); + nxtAvail = *tcnt + i; + } + void preset(uint32_t delay) + { + lookAheadAvail = std::max(lookAheadAvail, delay + (*tcnt) - numStages); + } + bool rdy() const { return *tcnt >= nxtAvail; } + bool prerdy() const { return *tcnt >= lookAheadAvail; } + + private: + // timestamp indicating when resource will be available + uint64_t nxtAvail; + // timestamp indicating when resource will be available including + // pending uses of the resource (when there is a cycle gap between + // rdy() and set() + uint64_t lookAheadAvail; + // current timestamp + uint64_t *tcnt; + // number of stages between checking if a resource is ready and + // setting the resource's utilization + uint32_t numStages; +}; + +class Float16 +{ + public: + uint16_t val; + + Float16() { val = 0; } + + Float16(const Float16 &x) : val(x.val) { } + + Float16(float x) + { + uint32_t ai = *(uint32_t *)&x; + + uint32_t s = (ai >> 31) & 0x1; + uint32_t exp = (ai >> 23) & 0xff; + uint32_t mant = (ai >> 0) & 0x7fffff; + + if (exp == 0 || exp <= 0x70) { + exp = 0; + mant = 0; + } else if (exp == 0xff) { + exp = 0x1f; + } else if (exp >= 0x8f) { + exp = 0x1f; + mant = 0; + } else { + exp = exp - 0x7f + 0x0f; + } + + mant = mant >> 13; + + val = 0; + val |= (s << 15); + val |= (exp << 10); + val |= (mant << 0); + } + + operator float() const + { + uint32_t s = (val >> 15) & 0x1; + uint32_t exp = (val >> 10) & 0x1f; + uint32_t mant = (val >> 0) & 0x3ff; + + if (!exp) { + exp = 0; + mant = 0; + } else if (exp == 0x1f) { + exp = 0xff; + } else { + exp = exp - 0x0f + 0x7f; + } + + uint32_t val1 = 0; + val1 |= (s << 31); + val1 |= (exp << 23); + val1 |= (mant << 13); + + return *(float*)&val1; + } +}; + +#endif // __MISC_HH__ diff --git a/src/gpu-compute/ndrange.hh b/src/gpu-compute/ndrange.hh new file mode 100644 index 
000000000..d1ad35d4b --- /dev/null +++ b/src/gpu-compute/ndrange.hh @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#ifndef __NDRANGE_HH__ +#define __NDRANGE_HH__ + +#include "base/types.hh" +#include "gpu-compute/qstruct.hh" + +struct NDRange +{ + // copy of the queue entry provided at dispatch + HsaQueueEntry q; + + // The current workgroup id (3 dimensions) + int wgId[3]; + // The number of workgroups in each dimension + int numWg[3]; + // The total number of workgroups + int numWgTotal; + + // The number of completed work groups + int numWgCompleted; + // The global workgroup ID + uint32_t globalWgId; + + // flag indicating whether all work groups have been launched + bool wg_disp_rem; + // kernel complete + bool execDone; + bool userDoorBellSet; + volatile bool *addrToNotify; + volatile uint32_t *numDispLeft; + int dispatchId; + int curTid; // Current thread id +}; + +#endif // __NDRANGE_HH__ diff --git a/src/gpu-compute/of_scheduling_policy.cc b/src/gpu-compute/of_scheduling_policy.cc new file mode 100644 index 000000000..7f114706a --- /dev/null +++ b/src/gpu-compute/of_scheduling_policy.cc @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor + */ + +#include "gpu-compute/of_scheduling_policy.hh" + +#include "gpu-compute/wavefront.hh" + +Wavefront* +OFSchedulingPolicy::chooseWave() +{ + // Set when the policy chooses a wave to schedule + bool waveChosen = false; + Wavefront *selectedWave = nullptr; + int selectedWaveID = -1; + uint32_t selectedPosition = 0; + + for (int position = 0; position < scheduleList->size(); ++position) { + Wavefront *curWave = scheduleList->at(position); + uint32_t curWaveID = curWave->wfDynId; + + // Choose the wave with the lowest wave ID + if (selectedWaveID == -1 || curWaveID < selectedWaveID) { + waveChosen = true; + selectedWaveID = curWaveID; + selectedWave = curWave; + selectedPosition = position; + } + } + + // Check to make sure the ready list has at least one schedulable wave + if (waveChosen) { + scheduleList->erase(scheduleList->begin() + selectedPosition); + } else { + panic("Empty ready list"); + } + + return selectedWave; +} + +void +OFSchedulingPolicy::bindList(std::vector<Wavefront*> *list) +{ + scheduleList = list; +} diff --git a/src/gpu-compute/of_scheduling_policy.hh b/src/gpu-compute/of_scheduling_policy.hh new file mode 100644 index 000000000..684e51a3a --- /dev/null +++ b/src/gpu-compute/of_scheduling_policy.hh @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
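/*
 * Standalone sketch with hypothetical types (not gem5 code): the
 * oldest-first selection in OFSchedulingPolicy::chooseWave() above boils
 * down to picking the entry with the smallest dynamic wave id and
 * removing it from the bound list.
 */
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

struct ToyWave { uint64_t wfDynId; };

static ToyWave*
chooseOldest(std::vector<ToyWave*> &list)
{
    assert(!list.empty());              // the real policy panics here
    std::size_t best = 0;
    for (std::size_t i = 1; i < list.size(); ++i) {
        if (list[i]->wfDynId < list[best]->wfDynId)
            best = i;
    }
    ToyWave *sel = list[best];
    list.erase(list.begin() + best);    // the chosen wave leaves the ready list
    return sel;
}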
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor + */ + +#ifndef __OF_SCHEDULING_POLICY_HH__ +#define __OF_SCHEDULING_POLICY_HH__ + +#include <cstddef> +#include <vector> + +#include "base/misc.hh" + +class Wavefront; + +// Oldest First where age is marked by the wave id +class OFSchedulingPolicy +{ + public: + OFSchedulingPolicy() : scheduleList(nullptr) { } + + Wavefront* chooseWave(); + void bindList(std::vector<Wavefront*> *list); + + private: + // List of waves which are participating in scheduling. + // This scheduler selects the oldest wave from this list + std::vector<Wavefront*> *scheduleList; +}; + +#endif // __OF_SCHEDULING_POLICY_HH__ diff --git a/src/gpu-compute/pool_manager.cc b/src/gpu-compute/pool_manager.cc new file mode 100644 index 000000000..b1bc6b1f3 --- /dev/null +++ b/src/gpu-compute/pool_manager.cc @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos + */ + +#include "gpu-compute/pool_manager.hh" + +PoolManager::PoolManager(uint32_t minAlloc, uint32_t poolSize) + : _minAllocation(minAlloc), _poolSize(poolSize) +{ + assert(poolSize > 0); +} diff --git a/src/gpu-compute/pool_manager.hh b/src/gpu-compute/pool_manager.hh new file mode 100644 index 000000000..2cb53ce72 --- /dev/null +++ b/src/gpu-compute/pool_manager.hh @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos + */ + +#ifndef __POOL_MANAGER_HH__ +#define __POOL_MANAGER_HH__ + +#include <cassert> +#include <cstdint> +#include <string> + +// Pool Manager Logic +class PoolManager +{ + public: + PoolManager(uint32_t minAlloc, uint32_t poolSize); + uint32_t minAllocation() { return _minAllocation; } + virtual std::string printRegion() = 0; + virtual uint32_t regionSize(std::pair<uint32_t,uint32_t> ®ion) = 0; + virtual bool canAllocate(uint32_t numRegions, uint32_t size) = 0; + + virtual uint32_t allocateRegion(const uint32_t size, + uint32_t *reserved) = 0; + + virtual void freeRegion(uint32_t firstIdx, uint32_t lastIdx) = 0; + uint32_t poolSize() { return _poolSize; } + + private: + // minimum size that can be reserved per allocation + uint32_t _minAllocation; + // pool size in number of elements + uint32_t _poolSize; +}; + +#endif // __POOL_MANAGER_HH__ diff --git a/src/gpu-compute/qstruct.hh b/src/gpu-compute/qstruct.hh new file mode 100644 index 000000000..092303c00 --- /dev/null +++ b/src/gpu-compute/qstruct.hh @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Brad Beckmann, Marc Orr + */ + +#ifndef __Q_STRUCT_HH__ +#define __Q_STRUCT_HH__ + +#include <bitset> +#include <cstdint> + +// Maximum number of arguments +static const int KER_NUM_ARGS = 32; +// Kernel argument buffer size +static const int KER_ARGS_LENGTH = 512; + +class LdsChunk; +struct NDRange; + +// Be very careful of alignment in this structure. The structure +// must compile to the same layout in both 32-bit and 64-bit mode. +struct HsaQueueEntry +{ + // Base pointer for array of instruction pointers + uint64_t code_ptr; + // Grid Size (3 dimensions) + uint32_t gdSize[3]; + // Workgroup Size (3 dimensions) + uint32_t wgSize[3]; + uint16_t sRegCount; + uint16_t dRegCount; + uint16_t cRegCount; + uint64_t privMemStart; + uint32_t privMemPerItem; + uint32_t privMemTotal; + uint64_t spillMemStart; + uint32_t spillMemPerItem; + uint32_t spillMemTotal; + uint64_t roMemStart; + uint32_t roMemTotal; + // Size (in bytes) of LDS + uint32_t ldsSize; + // Virtual Memory Id (unused right now) + uint32_t vmId; + + // Pointer to dependency chain (unused now) + uint64_t depends; + + // pointer to bool + uint64_t addrToNotify; + // pointer to uint32_t + uint64_t numDispLeft; + + // variables to pass arguments when running in standalone mode, + // will be removed when run.py and sh.cpp have been updated to + // use args and offset arrays + uint64_t arg1; + uint64_t arg2; + uint64_t arg3; + uint64_t arg4; + + // variables to pass arguments when running in cpu+gpu mode + uint8_t args[KER_ARGS_LENGTH]; + uint16_t offsets[KER_NUM_ARGS]; + uint16_t num_args; +}; + +// State used to start (or restart) a WF +struct WFContext +{ + // 32 bit values + // barrier state + int bar_cnt[VSZ]; + + // id (which WF in the WG) + int cnt; + + // more barrier state + int max_bar_cnt; + int old_barrier_cnt; + int barrier_cnt; + + // More Program Counter Stuff + uint32_t pc; + + // Program counter of the immediate post-dominator instruction + uint32_t rpc; + + // WG wide state (I don't see how to avoid redundancy here) + int cu_id; + uint32_t wg_id; + uint32_t barrier_id; + + // 64 bit values (these values depend on the wavefront size) + // masks + uint64_t init_mask; + uint64_t exec_mask; + + // private memory; + Addr privBase; + Addr spillBase; + + LdsChunk *ldsChunk; + + /* + * Kernel wide state + * This is a hack. This state should be moved through simulated memory + * during a yield. Though not much is being used here, so it's probably + * probably not a big deal. + * + * Just to add to this comment... The ndr is derived from simulated + * memory when the cl-runtime allocates an HsaQueueEntry and populates it + * for a kernel launch. So in theory the runtime should be able to keep + * that state around. 
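/*
 * Hypothetical sketch (the packing convention is an assumption, not
 * taken from the patch): one way a host-side runtime could fill the
 * args/offsets arrays of an HsaQueueEntry when running in cpu+gpu mode,
 * recording each argument's byte offset into the flat args buffer.
 */
#include <cstring>

#include "gpu-compute/qstruct.hh"

static bool
packKernelArg(HsaQueueEntry &q, uint16_t offset, const void *arg,
              std::size_t size)
{
    if (q.num_args >= KER_NUM_ARGS || offset + size > KER_ARGS_LENGTH)
        return false;                  // argument table or buffer is full
    std::memcpy(&q.args[offset], arg, size);
    q.offsets[q.num_args++] = offset;  // remember where this argument lives
    return true;
}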
Then a WF can reference it upon restart to derive + * kernel wide state. The runtime can deallocate the state when the + * kernel completes. + */ + NDRange *ndr; +}; + +// State that needs to be passed between the simulation and simulated app, a +// pointer to this struct can be passed through the depends field in the +// HsaQueueEntry struct +struct HostState +{ + // cl_event* has original HsaQueueEntry for init + uint64_t event; +}; + +// Total number of HSA queues +static const int HSAQ_NQUEUES = 8; + +// These values will eventually live in memory mapped registers +// and be settable by the kernel mode driver. + +// Number of entries in each HSA queue +static const int HSAQ_SIZE = 64; +// Address of first HSA queue index +static const int HSAQ_INDX_BASE = 0x10000ll; +// Address of first HSA queue +static const int HSAQ_BASE = 0x11000ll; +// Suggested start of HSA code +static const int HSA_CODE_BASE = 0x18000ll; + +// These are shortcuts for deriving the address of a specific +// HSA queue or queue index +#define HSAQ(n) (HSAQ_BASE + HSAQ_SIZE * sizeof(struct fsaQueue) * n) +#define HSAQE(n,i) (HSAQ_BASE + (HSAQ_SIZE * n + i) * sizeof(struct fsaQueue)) +#define HSAQ_RI(n) (HSAQ_INDX_BASE + sizeof(int) * (n * 3 + 0)) +#define HSAQ_WI(n) (HSAQ_INDX_BASE + sizeof(int) * (n * 3 + 1)) +#define HSAQ_CI(n) (HSAQ_INDX_BASE + sizeof(int) * (n * 3 + 2)) + +/* + * Example code for writing to a queue + * + * void + * ToQueue(int n,struct fsaQueue *val) + * { + * int wi = *(int*)HSAQ_WI(n); + * int ri = *(int*)HSAQ_RI(n); + * int ci = *(int*)HSAQ_CI(n); + * + * if (ci - ri < HSAQ_SIZE) { + * (*(int*)HSAQ_CI(n))++; + * *(HsaQueueEntry*)(HSAQE(n, (wi % HSAQ_SIZE))) = *val; + * (*(int*)HSAQ_WI(n))++; + * } + * } + */ + +#endif // __Q_STRUCT_HH__ diff --git a/src/gpu-compute/rr_scheduling_policy.cc b/src/gpu-compute/rr_scheduling_policy.cc new file mode 100644 index 000000000..5d3591901 --- /dev/null +++ b/src/gpu-compute/rr_scheduling_policy.cc @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
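/*
 * Illustrative counterpart (hypothetical, not part of the patch) to the
 * ToQueue example in qstruct.hh above: draining one entry from an HSA
 * queue using the same read/write/commit index layout. The consumer-side
 * convention assumed here is that an entry is safe to read once the
 * write index has advanced past the read index.
 *
 * bool
 * FromQueue(int n, HsaQueueEntry *val)
 * {
 *     int wi = *(int*)HSAQ_WI(n);
 *     int ri = *(int*)HSAQ_RI(n);
 *
 *     if (ri < wi) {
 *         *val = *(HsaQueueEntry*)(HSAQE(n, (ri % HSAQ_SIZE)));
 *         (*(int*)HSAQ_RI(n))++;
 *         return true;
 *     }
 *
 *     return false;
 * }
 */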
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor + */ + +#include "gpu-compute/rr_scheduling_policy.hh" + +#include "gpu-compute/wavefront.hh" + +Wavefront* +RRSchedulingPolicy::chooseWave() +{ + Wavefront *selectedWave = nullptr; + + // Check to make sure the ready list has at least one schedulable wave + if (scheduleList->size()) { + // For RR policy, select the wave which is at the + // front of the list. The selected wave is popped + // out from the schedule list immediately after selection + // to avoid starvation. It is the responsibility of the + // module invoking the RR scheduler to make sure scheduling + // eligible waves are added to the back of the schedule + // list + selectedWave = scheduleList->front(); + scheduleList->erase(scheduleList->begin()); + } else { + panic("Empty ready list"); + } + + return selectedWave; +} + +void +RRSchedulingPolicy::bindList(std::vector<Wavefront*> *list) +{ + scheduleList = list; +} diff --git a/src/gpu-compute/rr_scheduling_policy.hh b/src/gpu-compute/rr_scheduling_policy.hh new file mode 100644 index 000000000..780f294aa --- /dev/null +++ b/src/gpu-compute/rr_scheduling_policy.hh @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE.
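/*
 * Standalone sketch with a hypothetical id list (not gem5 code): the
 * round-robin rotation above relies on the caller rebuilding the bound
 * list each cycle and appending still-eligible waves at the back, so a
 * wave popped from the front this cycle lines up behind the others the
 * next time it becomes ready.
 */
#include <vector>

static int
chooseRoundRobin(std::vector<int> &readyIds)
{
    // pop the head of the list; the real policy panics if the list is empty
    int sel = readyIds.front();
    readyIds.erase(readyIds.begin());
    return sel;
}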
+ * + * Author: Sooraj Puthoor + */ + +#ifndef __RR_SCHEDULING_POLICY_HH__ +#define __RR_SCHEDULING_POLICY_HH__ + +#include <inttypes.h> + +#include <cstddef> +#include <utility> +#include <vector> + +#include "base/misc.hh" + +class Wavefront; + +// Round-Robin pick among the list of ready waves +class RRSchedulingPolicy +{ + public: + RRSchedulingPolicy() : scheduleList(nullptr) { } + + Wavefront* chooseWave(); + void bindList(std::vector<Wavefront*> *list); + + private: + // List of waves which are participating in scheduling. + // This scheduler selects one wave from this list based on + // round robin policy + std::vector<Wavefront*> *scheduleList; +}; + +#endif // __RR_SCHEDULING_POLICY_HH__ diff --git a/src/gpu-compute/schedule_stage.cc b/src/gpu-compute/schedule_stage.cc new file mode 100644 index 000000000..068136026 --- /dev/null +++ b/src/gpu-compute/schedule_stage.cc @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Sooraj Puthoor + */ + +#include "gpu-compute/schedule_stage.hh" + +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_static_inst.hh" +#include "gpu-compute/vector_register_file.hh" +#include "gpu-compute/wavefront.hh" + +ScheduleStage::ScheduleStage(const ComputeUnitParams *p) + : numSIMDs(p->num_SIMDs), + numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes) +{ + for (int j = 0; j < numSIMDs + numMemUnits; ++j) { + Scheduler newScheduler(p); + scheduler.push_back(newScheduler); + } +} + +ScheduleStage::~ScheduleStage() +{ + scheduler.clear(); + waveStatusList.clear(); +} + +void +ScheduleStage::init(ComputeUnit *cu) +{ + computeUnit = cu; + _name = computeUnit->name() + ".ScheduleStage"; + + for (int j = 0; j < numSIMDs + numMemUnits; ++j) { + scheduler[j].bindList(&computeUnit->readyList[j]); + } + + for (int j = 0; j < numSIMDs; ++j) { + waveStatusList.push_back(&computeUnit->waveStatusList[j]); + } + + dispatchList = &computeUnit->dispatchList; +} + +void +ScheduleStage::arbitrate() +{ + // iterate over all Memory pipelines + for (int j = numSIMDs; j < numSIMDs + numMemUnits; ++j) { + if (dispatchList->at(j).first) { + Wavefront *waveToMemPipe = dispatchList->at(j).first; + // iterate over all execution pipelines + for (int i = 0; i < numSIMDs + numMemUnits; ++i) { + if ((i != j) && (dispatchList->at(i).first)) { + Wavefront *waveToExePipe = dispatchList->at(i).first; + // if the two selected wavefronts are mapped to the same + // SIMD unit then they share the VRF + if (waveToMemPipe->simdId == waveToExePipe->simdId) { + int simdId = waveToMemPipe->simdId; + // Read VRF port arbitration: + // If there are read VRF port conflicts between the + // a memory and another instruction we drop the other + // instruction. We don't need to check for write VRF + // port conflicts because the memory instruction either + // does not need to write to the VRF (store) or will + // write to the VRF when the data comes back (load) in + // which case the arbiter of the memory pipes will + // resolve any conflicts + if (computeUnit->vrf[simdId]-> + isReadConflict(waveToMemPipe->wfSlotId, + waveToExePipe->wfSlotId)) { + // FIXME: The "second" member variable is never + // used in the model. 
I am setting it to READY + // simply to follow the protocol of setting it + // when the WF has an instruction ready to issue + waveStatusList[simdId]->at(waveToExePipe->wfSlotId) + .second = READY; + + dispatchList->at(i).first = nullptr; + dispatchList->at(i).second = EMPTY; + break; + } + } + } + } + } + } +} + +void +ScheduleStage::exec() +{ + for (int j = 0; j < numSIMDs + numMemUnits; ++j) { + uint32_t readyListSize = computeUnit->readyList[j].size(); + + // If no wave is ready to be scheduled on the execution resource + // then skip scheduling for this execution resource + if (!readyListSize) { + continue; + } + + Wavefront *waveToBeDispatched = scheduler[j].chooseWave(); + dispatchList->at(j).first = waveToBeDispatched; + waveToBeDispatched->updateResources(); + dispatchList->at(j).second = FILLED; + + waveStatusList[waveToBeDispatched->simdId]->at( + waveToBeDispatched->wfSlotId).second = BLOCKED; + + assert(computeUnit->readyList[j].size() == readyListSize - 1); + } + // arbitrate over all shared resources among instructions being issued + // simultaneously + arbitrate(); +} + +void +ScheduleStage::regStats() +{ +} diff --git a/src/gpu-compute/schedule_stage.hh b/src/gpu-compute/schedule_stage.hh new file mode 100644 index 000000000..26eb9a25b --- /dev/null +++ b/src/gpu-compute/schedule_stage.hh @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor + */ + +#ifndef __SCHEDULE_STAGE_HH__ +#define __SCHEDULE_STAGE_HH__ + +#include <utility> +#include <vector> + +#include "gpu-compute/exec_stage.hh" +#include "gpu-compute/scheduler.hh" +#include "gpu-compute/scoreboard_check_stage.hh" + +// Schedule or execution arbitration stage. +// From the pool of ready waves in the ready list, +// one wave is selected for each execution resource. 
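/*
 * Standalone sketch with hypothetical types (not gem5 code): the VRF
 * read-port arbitration in ScheduleStage::arbitrate() above, reduced to
 * its essentials. Among the waves selected this cycle, a wave headed to
 * a memory pipe wins any read-port conflict with another selected wave
 * on the same SIMD; the losing wave is dropped and rescheduled later.
 */
#include <cstddef>
#include <vector>

struct Pick
{
    int simdId;
    bool isMemPipe;
    bool dropped = false;
};

static void
arbitrateVrfReads(std::vector<Pick> &picks,
                  bool (*readConflict)(const Pick &, const Pick &))
{
    for (std::size_t m = 0; m < picks.size(); ++m) {
        if (!picks[m].isMemPipe || picks[m].dropped)
            continue;
        for (std::size_t o = 0; o < picks.size(); ++o) {
            if (o == m || picks[o].dropped)
                continue;
            if (picks[o].simdId == picks[m].simdId &&
                readConflict(picks[m], picks[o])) {
                // the other selected instruction loses the VRF read ports
                picks[o].dropped = true;
            }
        }
    }
}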
+// The selection is made based on a scheduling policy + +class ComputeUnit; +class Wavefront; + +struct ComputeUnitParams; + +class ScheduleStage +{ + public: + ScheduleStage(const ComputeUnitParams *params); + ~ScheduleStage(); + void init(ComputeUnit *cu); + void exec(); + void arbitrate(); + // Stats related variables and methods + std::string name() { return _name; } + void regStats(); + + private: + ComputeUnit *computeUnit; + uint32_t numSIMDs; + uint32_t numMemUnits; + + // Each execution resource will have its own + // scheduler and a dispatch list + std::vector<Scheduler> scheduler; + + // Stores the status of waves. A READY implies the + // wave is ready to be scheduled this cycle and + // is already present in the readyList + std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>*> + waveStatusList; + + // List of waves which will be dispatched to + // each execution resource. A FILLED implies + // dispatch list is non-empty and + // execution unit has something to execute + // this cycle. Currently, the dispatch list of + // an execution resource can hold only one wave because + // an execution resource can execute only one wave in a cycle. + std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> *dispatchList; + + std::string _name; +}; + +#endif // __SCHEDULE_STAGE_HH__ diff --git a/src/gpu-compute/scheduler.cc b/src/gpu-compute/scheduler.cc new file mode 100644 index 000000000..1cd0bfe55 --- /dev/null +++ b/src/gpu-compute/scheduler.cc @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Sooraj Puthoor + */ + +#include "gpu-compute/scheduler.hh" + +Scheduler::Scheduler(const ComputeUnitParams *p) +{ + if (p->execPolicy == "OLDEST-FIRST") { + schedPolicy = SCHED_POLICY::OF_POLICY; + } else if (p->execPolicy == "ROUND-ROBIN") { + schedPolicy = SCHED_POLICY::RR_POLICY; + } else { + fatal("Unimplemented scheduling policy"); + } +} + +Wavefront* +Scheduler::chooseWave() +{ + if (schedPolicy == SCHED_POLICY::OF_POLICY) { + return OFSchedPolicy.chooseWave(); + } else if (schedPolicy == SCHED_POLICY::RR_POLICY) { + return RRSchedPolicy.chooseWave(); + } else { + fatal("Unimplemented scheduling policy"); + } +} + +void +Scheduler::bindList(std::vector<Wavefront*> *list) +{ + if (schedPolicy == SCHED_POLICY::OF_POLICY) { + OFSchedPolicy.bindList(list); + } else if (schedPolicy == SCHED_POLICY::RR_POLICY) { + RRSchedPolicy.bindList(list); + } else { + fatal("Unimplemented scheduling policy"); + } +} diff --git a/src/gpu-compute/scheduler.hh b/src/gpu-compute/scheduler.hh new file mode 100644 index 000000000..148ec9425 --- /dev/null +++ b/src/gpu-compute/scheduler.hh @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
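/*
 * Standalone sketch with toy types (not gem5 code): Scheduler::chooseWave()
 * above picks between two policy members at run time, and each member is a
 * SchedulingPolicy<Impl> wrapper that forwards to its concrete policy at
 * compile time, with no virtual dispatch. A toy instantiation:
 */
#include <vector>

struct ToyItem { int id; };

struct ToyFifoPolicy
{
    void bindList(std::vector<ToyItem*> *l) { list = l; }

    ToyItem*
    choose()
    {
        // toy policy: take the front entry (no empty check for brevity)
        ToyItem *front = list->front();
        list->erase(list->begin());
        return front;
    }

    std::vector<ToyItem*> *list = nullptr;
};

template<typename Impl>
class ToyPolicyWrapper
{
  public:
    ToyItem* choose() { return impl.choose(); }
    void bindList(std::vector<ToyItem*> *l) { impl.bindList(l); }

  private:
    Impl impl;
};

// ToyPolicyWrapper<ToyFifoPolicy> exposes the same outward interface for
// every policy, mirroring how Scheduler holds SchedulingPolicy<RRSchedulingPolicy>
// and SchedulingPolicy<OFSchedulingPolicy> members side by side.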
+ * + * Author: Sooraj Puthoor + */ + +#ifndef __SCHEDULER_HH__ +#define __SCHEDULER_HH__ + +#include "gpu-compute/of_scheduling_policy.hh" +#include "gpu-compute/rr_scheduling_policy.hh" +#include "gpu-compute/scheduling_policy.hh" +#include "params/ComputeUnit.hh" + +enum SCHED_POLICY +{ + OF_POLICY = 0, + RR_POLICY +}; + +class Scheduler +{ + public: + Scheduler(const ComputeUnitParams *params); + Wavefront *chooseWave(); + void bindList(std::vector<Wavefront*> *list); + + private: + SCHED_POLICY schedPolicy; + SchedulingPolicy<RRSchedulingPolicy> RRSchedPolicy; + SchedulingPolicy<OFSchedulingPolicy> OFSchedPolicy; +}; + +#endif // __SCHEDULER_HH__ diff --git a/src/gpu-compute/scheduling_policy.hh b/src/gpu-compute/scheduling_policy.hh new file mode 100644 index 000000000..b5e923c62 --- /dev/null +++ b/src/gpu-compute/scheduling_policy.hh @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor + */ + +#ifndef __SCHEDULING_POLICY_HH__ +#define __SCHEDULING_POLICY_HH__ + +#include <vector> + +template<typename Impl> +class SchedulingPolicy +{ + public: + Wavefront* chooseWave() { return policyImpl.chooseWave(); } + + void + bindList(std::vector<Wavefront*> *list) + { + return policyImpl.bindList(list); + } + + private: + Impl policyImpl; +}; + +#endif // __SCHEDULING_POLICY_HH__ diff --git a/src/gpu-compute/scoreboard_check_stage.cc b/src/gpu-compute/scoreboard_check_stage.cc new file mode 100644 index 000000000..0d856a9b0 --- /dev/null +++ b/src/gpu-compute/scoreboard_check_stage.cc @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor + */ + +#include "gpu-compute/scoreboard_check_stage.hh" + +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_static_inst.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/wavefront.hh" +#include "params/ComputeUnit.hh" + +ScoreboardCheckStage::ScoreboardCheckStage(const ComputeUnitParams *p) + : numSIMDs(p->num_SIMDs), + numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes), + numGlbMemPipes(p->num_global_mem_pipes), + numShrMemPipes(p->num_shared_mem_pipes), + vectorAluInstAvail(nullptr), + lastGlbMemSimd(-1), + lastShrMemSimd(-1), glbMemInstAvail(nullptr), + shrMemInstAvail(nullptr) +{ +} + +ScoreboardCheckStage::~ScoreboardCheckStage() +{ + readyList.clear(); + waveStatusList.clear(); + shrMemInstAvail = nullptr; + glbMemInstAvail = nullptr; +} + +void +ScoreboardCheckStage::init(ComputeUnit *cu) +{ + computeUnit = cu; + _name = computeUnit->name() + ".ScoreboardCheckStage"; + + for (int unitId = 0; unitId < numSIMDs + numMemUnits; ++unitId) { + readyList.push_back(&computeUnit->readyList[unitId]); + } + + for (int unitId = 0; unitId < numSIMDs; ++unitId) { + waveStatusList.push_back(&computeUnit->waveStatusList[unitId]); + } + + vectorAluInstAvail = &computeUnit->vectorAluInstAvail; + glbMemInstAvail= &computeUnit->glbMemInstAvail; + shrMemInstAvail= &computeUnit->shrMemInstAvail; +} + +void +ScoreboardCheckStage::initStatistics() +{ + lastGlbMemSimd = -1; + lastShrMemSimd = -1; + *glbMemInstAvail = 0; + *shrMemInstAvail = 0; + + for (int unitId = 0; unitId < numSIMDs; ++unitId) + vectorAluInstAvail->at(unitId) = false; +} + +void +ScoreboardCheckStage::collectStatistics(Wavefront *curWave, int unitId) +{ + if (curWave->instructionBuffer.empty()) + return; + + // track which vector SIMD unit has at least one WV with a vector + // ALU as the oldest instruction in its Instruction buffer + vectorAluInstAvail->at(unitId) = vectorAluInstAvail->at(unitId) || + curWave->isOldestInstALU(); + + // track how many vector SIMD units have at least one WV with a + // vector Global memory instruction as the oldest instruction + // in its Instruction buffer + if ((curWave->isOldestInstGMem() || 
curWave->isOldestInstPrivMem() || + curWave->isOldestInstFlatMem()) && lastGlbMemSimd != unitId && + *glbMemInstAvail <= 1) { + (*glbMemInstAvail)++; + lastGlbMemSimd = unitId; + } + + // track how many vector SIMD units have at least one WV with a + // vector shared memory (LDS) instruction as the oldest instruction + // in its Instruction buffer + // TODO: parametrize the limit of the LDS units + if (curWave->isOldestInstLMem() && (*shrMemInstAvail <= numShrMemPipes) && + lastShrMemSimd != unitId) { + (*shrMemInstAvail)++; + lastShrMemSimd = unitId; + } +} + +void +ScoreboardCheckStage::exec() +{ + initStatistics(); + + // reset the ready list for all execution units; it will be + // constructed every cycle since resource availability may change + for (int unitId = 0; unitId < numSIMDs + numMemUnits; ++unitId) { + readyList[unitId]->clear(); + } + + // iterate over the Wavefronts of all SIMD units + for (int unitId = 0; unitId < numSIMDs; ++unitId) { + for (int wvId = 0; wvId < computeUnit->shader->n_wf; ++wvId) { + // reset the ready status of each wavefront + waveStatusList[unitId]->at(wvId).second = BLOCKED; + Wavefront *curWave = waveStatusList[unitId]->at(wvId).first; + collectStatistics(curWave, unitId); + + if (curWave->ready(Wavefront::I_ALU)) { + readyList[unitId]->push_back(curWave); + waveStatusList[unitId]->at(wvId).second = READY; + } else if (curWave->ready(Wavefront::I_GLOBAL)) { + if (computeUnit->cedeSIMD(unitId, wvId)) { + continue; + } + + readyList[computeUnit->GlbMemUnitId()]->push_back(curWave); + waveStatusList[unitId]->at(wvId).second = READY; + } else if (curWave->ready(Wavefront::I_SHARED)) { + readyList[computeUnit->ShrMemUnitId()]->push_back(curWave); + waveStatusList[unitId]->at(wvId).second = READY; + } else if (curWave->ready(Wavefront::I_FLAT)) { + readyList[computeUnit->GlbMemUnitId()]->push_back(curWave); + waveStatusList[unitId]->at(wvId).second = READY; + } else if (curWave->ready(Wavefront::I_PRIVATE)) { + readyList[computeUnit->GlbMemUnitId()]->push_back(curWave); + waveStatusList[unitId]->at(wvId).second = READY; + } + } + } +} + +void +ScoreboardCheckStage::regStats() +{ +} diff --git a/src/gpu-compute/scoreboard_check_stage.hh b/src/gpu-compute/scoreboard_check_stage.hh new file mode 100644 index 000000000..099597afb --- /dev/null +++ b/src/gpu-compute/scoreboard_check_stage.hh @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
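/*
 * Standalone sketch (hypothetical enum, not gem5 code): the routing
 * decision inside ScoreboardCheckStage::exec() above for a single wave.
 * The class of the oldest ready instruction picks which execution
 * resource's ready list the wave joins: its own SIMD for vector ALU
 * work, the global memory unit for global/flat/private accesses, or the
 * shared memory (LDS) unit.
 */
enum class OldestInst { Alu, GlobalMem, SharedMem, FlatMem, PrivateMem, NotReady };

static int
pickReadyList(OldestInst c, int simdId, int glbMemUnitId, int shrMemUnitId)
{
    switch (c) {
      case OldestInst::Alu:        return simdId;
      case OldestInst::GlobalMem:
      case OldestInst::FlatMem:
      case OldestInst::PrivateMem: return glbMemUnitId;
      case OldestInst::SharedMem:  return shrMemUnitId;
      default:                     return -1;  // wave stays BLOCKED this cycle
    }
}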
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor + */ + +#ifndef __SCOREBOARD_CHECK_STAGE_HH__ +#define __SCOREBOARD_CHECK_STAGE_HH__ + +#include <cstdint> +#include <string> +#include <utility> +#include <vector> + +class ComputeUnit; +class Wavefront; + +struct ComputeUnitParams; + +enum WAVE_STATUS +{ + BLOCKED = 0, + READY +}; + +/* + * Scoreboard check stage. + * All wavefronts are analyzed to see if they are ready + * to be executed this cycle. Both structural and data + * hazards are considered while marking a wave "ready" + * for execution. After analysis, the ready waves are + * added to readyList. + */ +class ScoreboardCheckStage +{ + public: + ScoreboardCheckStage(const ComputeUnitParams* params); + ~ScoreboardCheckStage(); + void init(ComputeUnit *cu); + void exec(); + + // Stats related variables and methods + const std::string& name() const { return _name; } + void regStats(); + + private: + void collectStatistics(Wavefront *curWave, int unitId); + void initStatistics(); + ComputeUnit *computeUnit; + uint32_t numSIMDs; + uint32_t numMemUnits; + uint32_t numGlbMemPipes; + uint32_t numShrMemPipes; + + // flag per vector SIMD unit that is set when there is at least one + // WF that has a vector ALU instruction as the oldest in its + // Instruction Buffer + std::vector<bool> *vectorAluInstAvail; + int lastGlbMemSimd; + int lastShrMemSimd; + + int *glbMemInstAvail; + int *shrMemInstAvail; + // List of waves which are ready to be scheduled. + // Each execution resource has a ready list + std::vector<std::vector<Wavefront*>*> readyList; + + // Stores the status of waves. A READY implies the + // wave is ready to be scheduled this cycle and + // is already present in the readyList + std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>*> + waveStatusList; + + std::string _name; +}; + +#endif // __SCOREBOARD_CHECK_STAGE_HH__ diff --git a/src/gpu-compute/shader.cc b/src/gpu-compute/shader.cc new file mode 100644 index 000000000..e8d7946ff --- /dev/null +++ b/src/gpu-compute/shader.cc @@ -0,0 +1,412 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#include "gpu-compute/shader.hh" + +#include <limits> + +#include "arch/x86/linux/linux.hh" +#include "base/chunk_generator.hh" +#include "debug/GPUDisp.hh" +#include "debug/GPUMem.hh" +#include "debug/HSAIL.hh" +#include "gpu-compute/dispatcher.hh" +#include "gpu-compute/gpu_static_inst.hh" +#include "gpu-compute/qstruct.hh" +#include "gpu-compute/wavefront.hh" +#include "mem/packet.hh" +#include "mem/ruby/system/RubySystem.hh" +#include "sim/sim_exit.hh" + +Shader::Shader(const Params *p) : SimObject(p), + clock(p->clk_domain->clockPeriod()), cpuThread(nullptr), gpuTc(nullptr), + cpuPointer(p->cpu_pointer), tickEvent(this), timingSim(p->timing), + hsail_mode(SIMT), impl_kern_boundary_sync(p->impl_kern_boundary_sync), + separate_acquire_release(p->separate_acquire_release), coissue_return(1), + trace_vgpr_all(1), n_cu((p->CUs).size()), n_wf(p->n_wf), + globalMemSize(p->globalmem), nextSchedCu(0), sa_n(0), tick_cnt(0), + box_tick_cnt(0), start_tick_cnt(0) +{ + + cuList.resize(n_cu); + + for (int i = 0; i < n_cu; ++i) { + cuList[i] = p->CUs[i]; + assert(i == cuList[i]->cu_id); + cuList[i]->shader = this; + } +} + +Addr +Shader::mmap(int length) +{ + + Addr start; + + // round up length to the next page + length = roundUp(length, TheISA::PageBytes); + + if (X86Linux64::mmapGrowsDown()) { + DPRINTF(HSAIL, "GROWS DOWN"); + start = gpuTc->getProcessPtr()->mmap_end -length; + gpuTc->getProcessPtr()->mmap_end = start; + } else { + DPRINTF(HSAIL, "GROWS UP"); + start = gpuTc->getProcessPtr()->mmap_end; + gpuTc->getProcessPtr()->mmap_end += length; + + // assertion to make sure we don't overwrite the stack (it grows down) + assert(gpuTc->getProcessPtr()->mmap_end < + gpuTc->getProcessPtr()->stack_base - + gpuTc->getProcessPtr()->max_stack_size); + + } + + DPRINTF(HSAIL,"Shader::mmap start= %#x, %#x\n", start, length); + + gpuTc->getProcessPtr()->allocateMem(start,length); + + return start; +} + +void +Shader::init() +{ + // grab the threadContext of the thread running on the CPU + assert(cpuPointer); + gpuTc = cpuPointer->getContext(0); + assert(gpuTc); +} + +Shader::~Shader() +{ + for (int j = 0; j < n_cu; ++j) + delete cuList[j]; +} + +void +Shader::updateThreadContext(int tid) { + // thread context of the thread which dispatched work + assert(cpuPointer); + gpuTc = cpuPointer->getContext(tid); + assert(gpuTc); +} + +void +Shader::hostWakeUp(BaseCPU *cpu) { + if (cpuPointer == cpu) { + if (gpuTc->status() == ThreadContext::Suspended) + cpu->activateContext(gpuTc->threadId()); + } else { + //Make sure both dispatcher and shader are trying to + //wakeup same host. 
Hack here to enable kernel launch + //from multiple CPUs + panic("Dispatcher wants to wakeup a different host"); + } +} + +Shader* +ShaderParams::create() +{ + return new Shader(this); +} + +void +Shader::exec() +{ + tick_cnt = curTick(); + box_tick_cnt = curTick() - start_tick_cnt; + + // apply any scheduled adds + for (int i = 0; i < sa_n; ++i) { + if (sa_when[i] <= tick_cnt) { + *sa_val[i] += sa_x[i]; + sa_val.erase(sa_val.begin() + i); + sa_x.erase(sa_x.begin() + i); + sa_when.erase(sa_when.begin() + i); + --sa_n; + --i; + } + } + + // clock all of the cu's + for (int i = 0; i < n_cu; ++i) + cuList[i]->exec(); +} + +bool +Shader::dispatch_workgroups(NDRange *ndr) +{ + bool scheduledSomething = false; + int cuCount = 0; + int curCu = nextSchedCu; + + while (cuCount < n_cu) { + //Every time we try a CU, update nextSchedCu + nextSchedCu = (nextSchedCu + 1) % n_cu; + + // dispatch workgroup iff the following two conditions are met: + // (a) wg_rem is true - there are unassigned workgroups in the grid + // (b) there are enough free slots in cu cuList[i] for this wg + if (ndr->wg_disp_rem && cuList[curCu]->ReadyWorkgroup(ndr)) { + scheduledSomething = true; + DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d\n", curCu); + + // ticks() member function translates cycles to simulation ticks. + if (!tickEvent.scheduled()) { + schedule(tickEvent, curTick() + this->ticks(1)); + } + + cuList[curCu]->StartWorkgroup(ndr); + ndr->wgId[0]++; + ndr->globalWgId++; + if (ndr->wgId[0] * ndr->q.wgSize[0] >= ndr->q.gdSize[0]) { + ndr->wgId[0] = 0; + ndr->wgId[1]++; + + if (ndr->wgId[1] * ndr->q.wgSize[1] >= ndr->q.gdSize[1]) { + ndr->wgId[1] = 0; + ndr->wgId[2]++; + + if (ndr->wgId[2] * ndr->q.wgSize[2] >= ndr->q.gdSize[2]) { + ndr->wg_disp_rem = false; + break; + } + } + } + } + + ++cuCount; + curCu = nextSchedCu; + } + + return scheduledSomething; +} + +void +Shader::handshake(GpuDispatcher *_dispatcher) +{ + dispatcher = _dispatcher; +} + +void +Shader::doFunctionalAccess(RequestPtr req, MemCmd cmd, void *data, + bool suppress_func_errors, int cu_id) +{ + unsigned block_size = RubySystem::getBlockSizeBytes(); + unsigned size = req->getSize(); + + Addr tmp_addr; + BaseTLB::Mode trans_mode; + + if (cmd == MemCmd::ReadReq) { + trans_mode = BaseTLB::Read; + } else if (cmd == MemCmd::WriteReq) { + trans_mode = BaseTLB::Write; + } else { + fatal("unexcepted MemCmd\n"); + } + + tmp_addr = req->getVaddr(); + Addr split_addr = roundDown(tmp_addr + size - 1, block_size); + + assert(split_addr <= tmp_addr || split_addr - tmp_addr < block_size); + + // Misaligned access + if (split_addr > tmp_addr) { + RequestPtr req1, req2; + req->splitOnVaddr(split_addr, req1, req2); + + + PacketPtr pkt1 = new Packet(req2, cmd); + PacketPtr pkt2 = new Packet(req1, cmd); + + functionalTLBAccess(pkt1, cu_id, trans_mode); + functionalTLBAccess(pkt2, cu_id, trans_mode); + + PacketPtr new_pkt1 = new Packet(pkt1->req, cmd); + PacketPtr new_pkt2 = new Packet(pkt2->req, cmd); + + new_pkt1->dataStatic(data); + new_pkt2->dataStatic((uint8_t*)data + req1->getSize()); + + if (suppress_func_errors) { + new_pkt1->setSuppressFuncError(); + new_pkt2->setSuppressFuncError(); + } + + // fixme: this should be cuList[cu_id] if cu_id != n_cu + // The latter requires a memPort in the dispatcher + cuList[0]->memPort[0]->sendFunctional(new_pkt1); + cuList[0]->memPort[0]->sendFunctional(new_pkt2); + + delete new_pkt1; + delete new_pkt2; + delete pkt1; + delete pkt2; + } else { + PacketPtr pkt = new Packet(req, cmd); + functionalTLBAccess(pkt, cu_id, 
trans_mode); + PacketPtr new_pkt = new Packet(pkt->req, cmd); + new_pkt->dataStatic(data); + + if (suppress_func_errors) { + new_pkt->setSuppressFuncError(); + }; + + // fixme: this should be cuList[cu_id] if cu_id != n_cu + // The latter requires a memPort in the dispatcher + cuList[0]->memPort[0]->sendFunctional(new_pkt); + + delete new_pkt; + delete pkt; + } +} + +bool +Shader::busy() +{ + for (int i_cu = 0; i_cu < n_cu; ++i_cu) { + if (!cuList[i_cu]->isDone()) { + return true; + } + } + + return false; +} + +void +Shader::ScheduleAdd(uint32_t *val,Tick when,int x) +{ + sa_val.push_back(val); + sa_when.push_back(tick_cnt + when); + sa_x.push_back(x); + ++sa_n; +} + +Shader::TickEvent::TickEvent(Shader *_shader) + : Event(CPU_Tick_Pri), shader(_shader) +{ +} + + +void +Shader::TickEvent::process() +{ + if (shader->busy()) { + shader->exec(); + shader->schedule(this, curTick() + shader->ticks(1)); + } +} + +const char* +Shader::TickEvent::description() const +{ + return "Shader tick"; +} + +void +Shader::AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id, + MemCmd cmd, bool suppress_func_errors) +{ + uint8_t *data_buf = (uint8_t*)ptr; + + for (ChunkGenerator gen(address, size, RubySystem::getBlockSizeBytes()); + !gen.done(); gen.next()) { + Request *req = new Request(0, gen.addr(), gen.size(), 0, + cuList[0]->masterId(), 0, 0, 0); + + doFunctionalAccess(req, cmd, data_buf, suppress_func_errors, cu_id); + data_buf += gen.size(); + delete req; + } +} + +void +Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id) +{ + AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, false); +} + +void +Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id, + bool suppress_func_errors) +{ + AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, suppress_func_errors); +} + +void +Shader::WriteMem(uint64_t address, void *ptr,uint32_t size, int cu_id) +{ + AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq, false); +} + +void +Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id, + bool suppress_func_errors) +{ + AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq, + suppress_func_errors); +} + +/* + * Send a packet through the appropriate TLB functional port. + * If cu_id=n_cu, then this is the dispatcher's TLB. + * Otherwise it's the TLB of the cu_id compute unit. + */ +void +Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode) +{ + // update senderState. Need to know the gpuTc and the TLB mode + pkt->senderState = + new TheISA::GpuTLB::TranslationState(mode, gpuTc, false); + + if (cu_id == n_cu) { + dispatcher->tlbPort->sendFunctional(pkt); + } else { + // even when the perLaneTLB flag is turned on + // it's ok tp send all accesses through lane 0 + // since the lane # is not known here, + // This isn't important since these are functional accesses. + cuList[cu_id]->tlbPort[0]->sendFunctional(pkt); + } + + /* safe_cast the senderState */ + TheISA::GpuTLB::TranslationState *sender_state = + safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState); + + delete sender_state->tlbEntry; + delete pkt->senderState; +} diff --git a/src/gpu-compute/shader.hh b/src/gpu-compute/shader.hh new file mode 100644 index 000000000..91ea8aae0 --- /dev/null +++ b/src/gpu-compute/shader.hh @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. 
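/*
 * Standalone sketch (hypothetical helper, not from the patch): the
 * workgroup-id advance in Shader::dispatch_workgroups() above behaves
 * like a three-digit odometer over the grid; this version returns false
 * once every workgroup in the NDRange has been dispatched.
 */
#include <cstdint>

static bool
advanceWgId(int wgId[3], const uint32_t wgSize[3], const uint32_t gdSize[3])
{
    for (int d = 0; d < 3; ++d) {
        ++wgId[d];
        if (uint32_t(wgId[d]) * wgSize[d] < gdSize[d])
            return true;        // no carry out of this dimension
        wgId[d] = 0;            // wrap and carry into the next dimension
    }
    return false;               // carried out of the last dimension: grid exhausted
}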
+ * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Steve Reinhardt + */ + +#ifndef __SHADER_HH__ +#define __SHADER_HH__ + +#include <functional> +#include <string> + +#include "arch/isa.hh" +#include "arch/isa_traits.hh" +#include "base/types.hh" +#include "cpu/simple/atomic.hh" +#include "cpu/simple/timing.hh" +#include "cpu/simple_thread.hh" +#include "cpu/thread_context.hh" +#include "cpu/thread_state.hh" +#include "enums/MemOpType.hh" +#include "enums/MemType.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_tlb.hh" +#include "gpu-compute/lds_state.hh" +#include "gpu-compute/qstruct.hh" +#include "mem/page_table.hh" +#include "mem/port.hh" +#include "mem/request.hh" +#include "params/Shader.hh" +#include "sim/faults.hh" +#include "sim/process.hh" +#include "sim/sim_object.hh" + +class BaseTLB; +class GpuDispatcher; + +namespace TheISA +{ + class GpuTLB; +} + +static const int LDS_SIZE = 65536; + +// Class Shader: This describes a single shader instance. Most +// configurations will only have a single shader. + +class Shader : public SimObject +{ + protected: + // Shader's clock period in terms of number of ticks of curTime, + // aka global simulation clock + Tick clock; + + public: + typedef ShaderParams Params; + enum hsail_mode_e {SIMT,VECTOR_SCALAR}; + + // clock related functions ; maps to-and-from + // Simulation ticks and shader clocks. 
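+    // Illustrative example (hypothetical values): with a shader clock
+    // period of 1000 ticks, ticks(4) == 4000, curCycle() advances once
+    // every 1000 ticks, and tickToCycles(ticks(n)) == n.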
+ Tick frequency() const { return SimClock::Frequency / clock; } + + Tick ticks(int numCycles) const { return (Tick)clock * numCycles; } + + Tick getClock() const { return clock; } + Tick curCycle() const { return curTick() / clock; } + Tick tickToCycles(Tick val) const { return val / clock;} + + + SimpleThread *cpuThread; + ThreadContext *gpuTc; + BaseCPU *cpuPointer; + + class TickEvent : public Event + { + private: + Shader *shader; + + public: + TickEvent(Shader*); + void process(); + const char* description() const; + }; + + TickEvent tickEvent; + + // is this simulation going to be timing mode in the memory? + bool timingSim; + hsail_mode_e hsail_mode; + + // If set, issue acq packet @ kernel launch + int impl_kern_boundary_sync; + // If set, generate a separate packet for acquire/release on + // ld_acquire/st_release/atomic operations + int separate_acquire_release; + // If set, fetch returns may be coissued with instructions + int coissue_return; + // If set, always dump all 64 gprs to trace + int trace_vgpr_all; + // Number of cu units in the shader + int n_cu; + // Number of wavefront slots per cu + int n_wf; + // The size of global memory + int globalMemSize; + + /* + * Bytes/work-item for call instruction + * The number of arguments for an hsail function will + * vary. We simply determine the maximum # of arguments + * required by any hsail function up front before the + * simulation (during parsing of the Brig) and record + * that number here. + */ + int funcargs_size; + + // Tracks CU that rr dispatcher should attempt scheduling + int nextSchedCu; + + // Size of scheduled add queue + uint32_t sa_n; + + // Pointer to value to be increments + std::vector<uint32_t*> sa_val; + // When to do the increment + std::vector<uint64_t> sa_when; + // Amount to increment by + std::vector<int32_t> sa_x; + + // List of Compute Units (CU's) + std::vector<ComputeUnit*> cuList; + + uint64_t tick_cnt; + uint64_t box_tick_cnt; + uint64_t start_tick_cnt; + + GpuDispatcher *dispatcher; + + Shader(const Params *p); + ~Shader(); + virtual void init(); + + // Run shader + void exec(); + + // Check to see if shader is busy + bool busy(); + + // Schedule a 32-bit value to be incremented some time in the future + void ScheduleAdd(uint32_t *val, Tick when, int x); + bool processTimingPacket(PacketPtr pkt); + + void AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id, + MemCmd cmd, bool suppress_func_errors); + + void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id); + + void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id, + bool suppress_func_errors); + + void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id); + + void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id, + bool suppress_func_errors); + + void doFunctionalAccess(RequestPtr req, MemCmd cmd, void *data, + bool suppress_func_errors, int cu_id); + + void + registerCU(int cu_id, ComputeUnit *compute_unit) + { + cuList[cu_id] = compute_unit; + } + + void handshake(GpuDispatcher *dispatcher); + bool dispatch_workgroups(NDRange *ndr); + Addr mmap(int length); + void functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode); + void updateThreadContext(int tid); + void hostWakeUp(BaseCPU *cpu); +}; + +#endif // __SHADER_HH__ diff --git a/src/gpu-compute/simple_pool_manager.cc b/src/gpu-compute/simple_pool_manager.cc new file mode 100644 index 000000000..0e35ab9cc --- /dev/null +++ b/src/gpu-compute/simple_pool_manager.cc @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2015 Advanced Micro 
Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos + */ + +#include "gpu-compute/simple_pool_manager.hh" + +#include "base/misc.hh" + +// return the min number of elements that the manager can reserve given +// a request for "size" elements +uint32_t +SimplePoolManager::minAllocatedElements(uint32_t size) +{ + fatal_if(size <= 0 || size > poolSize(), "Illegal VGPR region size=%d\n", + size); + + return size % minAllocation() > 0 ? 
+ (minAllocation() - (size % minAllocation())) + size : size; +} + +std::string +SimplePoolManager::printRegion() +{ + std::string _cout; + if (_reservedGroups == 0) + _cout = "VRF is empty\n"; + else if (_reservedGroups > 0) { + uint32_t reservedEntries = _reservedGroups * _regionSize; + _cout = "VRF reserves " + std::to_string(reservedEntries) + " VGPRs\n"; + } + + return _cout; +} + +bool +SimplePoolManager::canAllocate(uint32_t numRegions, uint32_t size) +{ + assert(numRegions * minAllocatedElements(size) <= poolSize()); + + return _reservedGroups == 0; +} + +void +SimplePoolManager::freeRegion(uint32_t firstIdx, uint32_t lastIdx) +{ + assert(_reservedGroups > 0); + --_reservedGroups; + + if (!_reservedGroups) + _nxtFreeIdx = 0; +} + +uint32_t +SimplePoolManager::allocateRegion(const uint32_t size, + uint32_t *reservedPoolSize) +{ + uint32_t actualSize = minAllocatedElements(size); + uint32_t startIdx = _nxtFreeIdx; + _nxtFreeIdx += actualSize; + _regionSize = actualSize; + assert(_nxtFreeIdx < poolSize()); + *reservedPoolSize = actualSize; + ++_reservedGroups; + + return startIdx; +} + +uint32_t +SimplePoolManager::regionSize(std::pair<uint32_t, uint32_t> ®ion) +{ + bool wrapAround = (region.first > region.second); + if (!wrapAround) { + return region.second - region.first + 1; + } else { + return region.second + poolSize() - region.first + 1; + } +} diff --git a/src/gpu-compute/simple_pool_manager.hh b/src/gpu-compute/simple_pool_manager.hh new file mode 100644 index 000000000..1d4174da8 --- /dev/null +++ b/src/gpu-compute/simple_pool_manager.hh @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos + */ + +#ifndef __SIMPLE_POOL_MANAGER_HH__ +#define __SIMPLE_POOL_MANAGER_HH__ + +#include <cassert> +#include <cstdint> + +#include "gpu-compute/pool_manager.hh" + +// Simple Pool Manager: allows one region per pool. No region merging is +// supported. 
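+//
+// Illustrative example (assuming minAlloc == 4 and poolSize == 2048, with
+// reservedSize a caller-provided uint32_t): minAllocatedElements(10) rounds
+// the request up to a multiple of minAllocation(), i.e.
+// (4 - (10 % 4)) + 10 == 12, so a subsequent
+// allocateRegion(10, &reservedSize) returns the current _nxtFreeIdx,
+// advances it by 12, and sets reservedSize to 12.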
+class SimplePoolManager : public PoolManager +{ + public: + SimplePoolManager(uint32_t minAlloc, uint32_t poolSize) + : PoolManager(minAlloc, poolSize), _regionSize(0), _nxtFreeIdx(0), + _reservedGroups(0) + { + } + + uint32_t minAllocatedElements(uint32_t size); + std::string printRegion(); + bool canAllocate(uint32_t numRegions, uint32_t size); + uint32_t allocateRegion(const uint32_t size, uint32_t *reservedPoolSize); + void freeRegion(uint32_t firstIdx, uint32_t lastIdx); + uint32_t regionSize(std::pair<uint32_t,uint32_t> ®ion); + + private: + // actual size of a region (normalized to the minimum size that can + // be reserved) + uint32_t _regionSize; + // next index to allocate a region + uint8_t _nxtFreeIdx; + // number of groups that reserve a region + uint32_t _reservedGroups; +}; + +#endif // __SIMPLE_POOL_MANAGER_HH__ diff --git a/src/gpu-compute/tlb_coalescer.cc b/src/gpu-compute/tlb_coalescer.cc new file mode 100644 index 000000000..835d7b740 --- /dev/null +++ b/src/gpu-compute/tlb_coalescer.cc @@ -0,0 +1,583 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Lisa Hsu + */ + +#include "gpu-compute/tlb_coalescer.hh" + +#include <cstring> + +#include "debug/GPUTLB.hh" + +TLBCoalescer::TLBCoalescer(const Params *p) : MemObject(p), + clock(p->clk_domain->clockPeriod()), TLBProbesPerCycle(p->probesPerCycle), + coalescingWindow(p->coalescingWindow), + disableCoalescing(p->disableCoalescing), probeTLBEvent(this), + cleanupEvent(this) +{ + // create the slave ports based on the number of connected ports + for (size_t i = 0; i < p->port_slave_connection_count; ++i) { + cpuSidePort.push_back(new CpuSidePort(csprintf("%s-port%d", name(), i), + this, i)); + } + + // create the master ports based on the number of connected ports + for (size_t i = 0; i < p->port_master_connection_count; ++i) { + memSidePort.push_back(new MemSidePort(csprintf("%s-port%d", name(), i), + this, i)); + } +} + +BaseSlavePort& +TLBCoalescer::getSlavePort(const std::string &if_name, PortID idx) +{ + if (if_name == "slave") { + if (idx >= static_cast<PortID>(cpuSidePort.size())) { + panic("TLBCoalescer::getSlavePort: unknown index %d\n", idx); + } + + return *cpuSidePort[idx]; + } else { + panic("TLBCoalescer::getSlavePort: unknown port %s\n", if_name); + } +} + +BaseMasterPort& +TLBCoalescer::getMasterPort(const std::string &if_name, PortID idx) +{ + if (if_name == "master") { + if (idx >= static_cast<PortID>(memSidePort.size())) { + panic("TLBCoalescer::getMasterPort: unknown index %d\n", idx); + } + + return *memSidePort[idx]; + } else { + panic("TLBCoalescer::getMasterPort: unknown port %s\n", if_name); + } +} + +/* + * This method returns true if the <incoming_pkt> + * can be coalesced with <coalesced_pkt> and false otherwise. + * A given set of rules is checked. + * The rules can potentially be modified based on the TLB level. + */ +bool +TLBCoalescer::canCoalesce(PacketPtr incoming_pkt, PacketPtr coalesced_pkt) +{ + if (disableCoalescing) + return false; + + TheISA::GpuTLB::TranslationState *incoming_state = + safe_cast<TheISA::GpuTLB::TranslationState*>(incoming_pkt->senderState); + + TheISA::GpuTLB::TranslationState *coalesced_state = + safe_cast<TheISA::GpuTLB::TranslationState*>(coalesced_pkt->senderState); + + // Rule 1: Coalesce requests only if they + // fall within the same virtual page + Addr incoming_virt_page_addr = roundDown(incoming_pkt->req->getVaddr(), + TheISA::PageBytes); + + Addr coalesced_virt_page_addr = roundDown(coalesced_pkt->req->getVaddr(), + TheISA::PageBytes); + + if (incoming_virt_page_addr != coalesced_virt_page_addr) + return false; + + //* Rule 2: Coalesce requests only if they + // share a TLB Mode, i.e. they are both read + // or write requests. + BaseTLB::Mode incoming_mode = incoming_state->tlbMode; + BaseTLB::Mode coalesced_mode = coalesced_state->tlbMode; + + if (incoming_mode != coalesced_mode) + return false; + + // when we can coalesce a packet update the reqCnt + // that is the number of packets represented by + // this coalesced packet + if (!incoming_state->prefetch) + coalesced_state->reqCnt.back() += incoming_state->reqCnt.back(); + + return true; +} + +/* + * We need to update the physical addresses of all the translation requests + * that were coalesced into the one that just returned. + */ +void +TLBCoalescer::updatePhysAddresses(PacketPtr pkt) +{ + Addr virt_page_addr = roundDown(pkt->req->getVaddr(), TheISA::PageBytes); + + DPRINTF(GPUTLB, "Update phys. addr. 
for %d coalesced reqs for page %#x\n", + issuedTranslationsTable[virt_page_addr].size(), virt_page_addr); + + TheISA::GpuTLB::TranslationState *sender_state = + safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState); + + TheISA::GpuTlbEntry *tlb_entry = sender_state->tlbEntry; + assert(tlb_entry); + Addr first_entry_vaddr = tlb_entry->vaddr; + Addr first_entry_paddr = tlb_entry->paddr; + int page_size = tlb_entry->size(); + bool uncacheable = tlb_entry->uncacheable; + int first_hit_level = sender_state->hitLevel; + bool valid = tlb_entry->valid; + + // Get the physical page address of the translated request + // Using the page_size specified in the TLBEntry allows us + // to support different page sizes. + Addr phys_page_paddr = pkt->req->getPaddr(); + phys_page_paddr &= ~(page_size - 1); + + for (int i = 0; i < issuedTranslationsTable[virt_page_addr].size(); ++i) { + PacketPtr local_pkt = issuedTranslationsTable[virt_page_addr][i]; + TheISA::GpuTLB::TranslationState *sender_state = + safe_cast<TheISA::GpuTLB::TranslationState*>( + local_pkt->senderState); + + // we are sending the packet back, so pop the reqCnt associated + // with this level in the TLB hiearchy + if (!sender_state->prefetch) + sender_state->reqCnt.pop_back(); + + /* + * Only the first packet from this coalesced request has been + * translated. Grab the translated phys. page addr and update the + * physical addresses of the remaining packets with the appropriate + * page offsets. + */ + if (i) { + Addr paddr = phys_page_paddr; + paddr |= (local_pkt->req->getVaddr() & (page_size - 1)); + local_pkt->req->setPaddr(paddr); + + if (uncacheable) + local_pkt->req->setFlags(Request::UNCACHEABLE); + + // update senderState->tlbEntry, so we can insert + // the correct TLBEentry in the TLBs above. + sender_state->tlbEntry = + new TheISA::GpuTlbEntry(0, first_entry_vaddr, first_entry_paddr, + valid); + + // update the hitLevel for all uncoalesced reqs + // so that each packet knows where it hit + // (used for statistics in the CUs) + sender_state->hitLevel = first_hit_level; + } + + SlavePort *return_port = sender_state->ports.back(); + sender_state->ports.pop_back(); + + // Translation is done - Convert to a response pkt if necessary and + // send the translation back + if (local_pkt->isRequest()) { + local_pkt->makeTimingResponse(); + } + + return_port->sendTimingResp(local_pkt); + } + + // schedule clean up for end of this cycle + // This is a maximum priority event and must be on + // the same cycle as GPUTLB cleanup event to prevent + // race conditions with an IssueProbeEvent caused by + // MemSidePort::recvReqRetry + cleanupQueue.push(virt_page_addr); + + if (!cleanupEvent.scheduled()) + schedule(cleanupEvent, curTick()); +} + +// Receive translation requests, create a coalesced request, +// and send them to the TLB (TLBProbesPerCycle) +bool +TLBCoalescer::CpuSidePort::recvTimingReq(PacketPtr pkt) +{ + // first packet of a coalesced request + PacketPtr first_packet = nullptr; + // true if we are able to do coalescing + bool didCoalesce = false; + // number of coalesced reqs for a given window + int coalescedReq_cnt = 0; + + TheISA::GpuTLB::TranslationState *sender_state = + safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState); + + // push back the port to remember the path back + sender_state->ports.push_back(this); + + bool update_stats = !sender_state->prefetch; + + if (update_stats) { + // if reqCnt is empty then this packet does not represent + // multiple uncoalesced reqs(pkts) but just a single pkt. 
+ // If it does though then the reqCnt for each level in the + // hierarchy accumulates the total number of reqs this packet + // represents + int req_cnt = 1; + + if (!sender_state->reqCnt.empty()) + req_cnt = sender_state->reqCnt.back(); + + sender_state->reqCnt.push_back(req_cnt); + + // update statistics + coalescer->uncoalescedAccesses++; + req_cnt = sender_state->reqCnt.back(); + DPRINTF(GPUTLB, "receiving pkt w/ req_cnt %d\n", req_cnt); + coalescer->queuingCycles -= (curTick() * req_cnt); + coalescer->localqueuingCycles -= curTick(); + } + + // FIXME if you want to coalesce not based on the issueTime + // of the packets (i.e., from the compute unit's perspective) + // but based on when they reached this coalescer then + // remove the following if statement and use curTick() or + // coalescingWindow for the tick_index. + if (!sender_state->issueTime) + sender_state->issueTime = curTick(); + + // The tick index is used as a key to the coalescerFIFO hashmap. + // It is shared by all candidates that fall within the + // given coalescingWindow. + int64_t tick_index = sender_state->issueTime / coalescer->coalescingWindow; + + if (coalescer->coalescerFIFO.count(tick_index)) { + coalescedReq_cnt = coalescer->coalescerFIFO[tick_index].size(); + } + + // see if we can coalesce the incoming pkt with another + // coalesced request with the same tick_index + for (int i = 0; i < coalescedReq_cnt; ++i) { + first_packet = coalescer->coalescerFIFO[tick_index][i][0]; + + if (coalescer->canCoalesce(pkt, first_packet)) { + coalescer->coalescerFIFO[tick_index][i].push_back(pkt); + + DPRINTF(GPUTLB, "Coalesced req %i w/ tick_index %d has %d reqs\n", + i, tick_index, + coalescer->coalescerFIFO[tick_index][i].size()); + + didCoalesce = true; + break; + } + } + + // if this is the first request for this tick_index + // or we did not manage to coalesce, update stats + // and make necessary allocations. + if (!coalescedReq_cnt || !didCoalesce) { + if (update_stats) + coalescer->coalescedAccesses++; + + std::vector<PacketPtr> new_array; + new_array.push_back(pkt); + coalescer->coalescerFIFO[tick_index].push_back(new_array); + + DPRINTF(GPUTLB, "coalescerFIFO[%d] now has %d coalesced reqs after " + "push\n", tick_index, + coalescer->coalescerFIFO[tick_index].size()); + } + + //schedule probeTLBEvent next cycle to send the + //coalesced requests to the TLB + if (!coalescer->probeTLBEvent.scheduled()) { + coalescer->schedule(coalescer->probeTLBEvent, + curTick() + coalescer->ticks(1)); + } + + return true; +} + +void +TLBCoalescer::CpuSidePort::recvReqRetry() +{ + assert(false); +} + +void +TLBCoalescer::CpuSidePort::recvFunctional(PacketPtr pkt) +{ + + TheISA::GpuTLB::TranslationState *sender_state = + safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState); + + bool update_stats = !sender_state->prefetch; + + if (update_stats) + coalescer->uncoalescedAccesses++; + + // If there is a pending timing request for this virtual address + // print a warning message. This is a temporary caveat of + // the current simulator where atomic and timing requests can + // coexist. FIXME remove this check/warning in the future. + Addr virt_page_addr = roundDown(pkt->req->getVaddr(), TheISA::PageBytes); + int map_count = coalescer->issuedTranslationsTable.count(virt_page_addr); + + if (map_count) { + DPRINTF(GPUTLB, "Warning! Functional access to addr %#x sees timing " + "req. 
pending\n", virt_page_addr); + } + + coalescer->memSidePort[0]->sendFunctional(pkt); +} + +AddrRangeList +TLBCoalescer::CpuSidePort::getAddrRanges() const +{ + // currently not checked by the master + AddrRangeList ranges; + + return ranges; +} + +bool +TLBCoalescer::MemSidePort::recvTimingResp(PacketPtr pkt) +{ + // a translation completed and returned + coalescer->updatePhysAddresses(pkt); + + return true; +} + +void +TLBCoalescer::MemSidePort::recvReqRetry() +{ + //we've receeived a retry. Schedule a probeTLBEvent + if (!coalescer->probeTLBEvent.scheduled()) + coalescer->schedule(coalescer->probeTLBEvent, + curTick() + coalescer->ticks(1)); +} + +void +TLBCoalescer::MemSidePort::recvFunctional(PacketPtr pkt) +{ + fatal("Memory side recvFunctional() not implemented in TLB coalescer.\n"); +} + +TLBCoalescer::IssueProbeEvent::IssueProbeEvent(TLBCoalescer * _coalescer) + : Event(CPU_Tick_Pri), coalescer(_coalescer) +{ +} + +const char* +TLBCoalescer::IssueProbeEvent::description() const +{ + return "Probe the TLB below"; +} + +/* + * Here we scan the coalescer FIFO and issue the max + * number of permitted probes to the TLB below. We + * permit bypassing of coalesced requests for the same + * tick_index. + * + * We do not access the next tick_index unless we've + * drained the previous one. The coalesced requests + * that are successfully sent are moved to the + * issuedTranslationsTable table (the table which keeps + * track of the outstanding reqs) + */ +void +TLBCoalescer::IssueProbeEvent::process() +{ + // number of TLB probes sent so far + int sent_probes = 0; + // rejected denotes a blocking event + bool rejected = false; + + // It is set to true either when the recvTiming of the TLB below + // returns false or when there is another outstanding request for the + // same virt. page. + + DPRINTF(GPUTLB, "triggered TLBCoalescer IssueProbeEvent\n"); + + for (auto iter = coalescer->coalescerFIFO.begin(); + iter != coalescer->coalescerFIFO.end() && !rejected; ) { + int coalescedReq_cnt = iter->second.size(); + int i = 0; + int vector_index = 0; + + DPRINTF(GPUTLB, "coalescedReq_cnt is %d for tick_index %d\n", + coalescedReq_cnt, iter->first); + + while (i < coalescedReq_cnt) { + ++i; + PacketPtr first_packet = iter->second[vector_index][0]; + + // compute virtual page address for this request + Addr virt_page_addr = roundDown(first_packet->req->getVaddr(), + TheISA::PageBytes); + + // is there another outstanding request for the same page addr? + int pending_reqs = + coalescer->issuedTranslationsTable.count(virt_page_addr); + + if (pending_reqs) { + DPRINTF(GPUTLB, "Cannot issue - There are pending reqs for " + "page %#x\n", virt_page_addr); + + ++vector_index; + rejected = true; + + continue; + } + + // send the coalesced request for virt_page_addr + if (!coalescer->memSidePort[0]->sendTimingReq(first_packet)) { + DPRINTF(GPUTLB, "Failed to send TLB request for page %#x", + virt_page_addr); + + // No need for a retries queue since we are already buffering + // the coalesced request in coalescerFIFO. 
+ rejected = true; + ++vector_index; + } else { + TheISA::GpuTLB::TranslationState *tmp_sender_state = + safe_cast<TheISA::GpuTLB::TranslationState*> + (first_packet->senderState); + + bool update_stats = !tmp_sender_state->prefetch; + + if (update_stats) { + // req_cnt is total number of packets represented + // by the one we just sent counting all the way from + // the top of TLB hiearchy (i.e., from the CU) + int req_cnt = tmp_sender_state->reqCnt.back(); + coalescer->queuingCycles += (curTick() * req_cnt); + + DPRINTF(GPUTLB, "%s sending pkt w/ req_cnt %d\n", + coalescer->name(), req_cnt); + + // pkt_cnt is number of packets we coalesced into the one + // we just sent but only at this coalescer level + int pkt_cnt = iter->second[vector_index].size(); + coalescer->localqueuingCycles += (curTick() * pkt_cnt); + } + + DPRINTF(GPUTLB, "Successfully sent TLB request for page %#x", + virt_page_addr); + + //copy coalescedReq to issuedTranslationsTable + coalescer->issuedTranslationsTable[virt_page_addr] + = iter->second[vector_index]; + + //erase the entry of this coalesced req + iter->second.erase(iter->second.begin() + vector_index); + + if (iter->second.empty()) + assert(i == coalescedReq_cnt); + + sent_probes++; + if (sent_probes == coalescer->TLBProbesPerCycle) + return; + } + } + + //if there are no more coalesced reqs for this tick_index + //erase the hash_map with the first iterator + if (iter->second.empty()) { + coalescer->coalescerFIFO.erase(iter++); + } else { + ++iter; + } + } +} + +TLBCoalescer::CleanupEvent::CleanupEvent(TLBCoalescer* _coalescer) + : Event(Maximum_Pri), coalescer(_coalescer) +{ +} + +const char* +TLBCoalescer::CleanupEvent::description() const +{ + return "Cleanup issuedTranslationsTable hashmap"; +} + +void +TLBCoalescer::CleanupEvent::process() +{ + while (!coalescer->cleanupQueue.empty()) { + Addr cleanup_addr = coalescer->cleanupQueue.front(); + coalescer->cleanupQueue.pop(); + coalescer->issuedTranslationsTable.erase(cleanup_addr); + + DPRINTF(GPUTLB, "Cleanup - Delete coalescer entry with key %#x\n", + cleanup_addr); + } +} + +void +TLBCoalescer::regStats() +{ + uncoalescedAccesses + .name(name() + ".uncoalesced_accesses") + .desc("Number of uncoalesced TLB accesses") + ; + + coalescedAccesses + .name(name() + ".coalesced_accesses") + .desc("Number of coalesced TLB accesses") + ; + + queuingCycles + .name(name() + ".queuing_cycles") + .desc("Number of cycles spent in queue") + ; + + localqueuingCycles + .name(name() + ".local_queuing_cycles") + .desc("Number of cycles spent in queue for all incoming reqs") + ; + + localLatency + .name(name() + ".local_latency") + .desc("Avg. latency over all incoming pkts") + ; + + localLatency = localqueuingCycles / uncoalescedAccesses; +} + + +TLBCoalescer* +TLBCoalescerParams::create() +{ + return new TLBCoalescer(this); +} + diff --git a/src/gpu-compute/tlb_coalescer.hh b/src/gpu-compute/tlb_coalescer.hh new file mode 100644 index 000000000..09210148b --- /dev/null +++ b/src/gpu-compute/tlb_coalescer.hh @@ -0,0 +1,252 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Lisa Hsu + */ + +#ifndef __TLB_COALESCER_HH__ +#define __TLB_COALESCER_HH__ + +#include <list> +#include <queue> +#include <string> +#include <vector> + +#include "arch/generic/tlb.hh" +#include "arch/isa.hh" +#include "arch/isa_traits.hh" +#include "arch/x86/pagetable.hh" +#include "arch/x86/regs/segment.hh" +#include "base/misc.hh" +#include "base/statistics.hh" +#include "gpu-compute/gpu_tlb.hh" +#include "mem/mem_object.hh" +#include "mem/port.hh" +#include "mem/request.hh" +#include "params/TLBCoalescer.hh" + +class BaseTLB; +class Packet; +class ThreadContext; + +/** + * The TLBCoalescer is a MemObject sitting on the front side (CPUSide) of + * each TLB. It receives packets and issues coalesced requests to the + * TLB below it. It controls how requests are coalesced (the rules) + * and the permitted number of TLB probes per cycle (i.e., how many + * coalesced requests it feeds the TLB per cycle). + */ +class TLBCoalescer : public MemObject +{ + protected: + // TLB clock: will inherit clock from shader's clock period in terms + // of nuber of ticks of curTime (aka global simulation clock) + // The assignment of TLB clock from shader clock is done in the + // python config files. + int clock; + + public: + typedef TLBCoalescerParams Params; + TLBCoalescer(const Params *p); + ~TLBCoalescer() { } + + // Number of TLB probes per cycle. Parameterizable - default 2. + int TLBProbesPerCycle; + + // Consider coalescing across that many ticks. + // Paraemterizable - default 1. + int coalescingWindow; + + // Each coalesced request consists of multiple packets + // that all fall within the same virtual page + typedef std::vector<PacketPtr> coalescedReq; + + // disables coalescing when true + bool disableCoalescing; + + /* + * This is a hash map with <tick_index> as a key. + * It contains a vector of coalescedReqs per <tick_index>. + * Requests are buffered here until they can be issued to + * the TLB, at which point they are copied to the + * issuedTranslationsTable hash map. + * + * In terms of coalescing, we coalesce requests in a given + * window of x cycles by using tick_index = issueTime/x as a + * key, where x = coalescingWindow. 
issueTime is the issueTime + * of the pkt from the ComputeUnit's perspective, but another + * option is to change it to curTick(), so we coalesce based + * on the receive time. + */ + typedef std::unordered_map<int64_t, std::vector<coalescedReq>> CoalescingFIFO; + + CoalescingFIFO coalescerFIFO; + + /* + * issuedTranslationsTabler: a hash_map indexed by virtual page + * address. Each hash_map entry has a vector of PacketPtr associated + * with it denoting the different packets that share an outstanding + * coalesced translation request for the same virtual page. + * + * The rules that determine which requests we can coalesce are + * specified in the canCoalesce() method. + */ + typedef std::unordered_map<Addr, coalescedReq> CoalescingTable; + + CoalescingTable issuedTranslationsTable; + + // number of packets the coalescer receives + Stats::Scalar uncoalescedAccesses; + // number packets the coalescer send to the TLB + Stats::Scalar coalescedAccesses; + + // Number of cycles the coalesced requests spend waiting in + // coalescerFIFO. For each packet the coalescer receives we take into + // account the number of all uncoalesced requests this pkt "represents" + Stats::Scalar queuingCycles; + + // On average how much time a request from the + // uncoalescedAccesses that reaches the TLB + // spends waiting? + Stats::Scalar localqueuingCycles; + // localqueuingCycles/uncoalescedAccesses + Stats::Formula localLatency; + + bool canCoalesce(PacketPtr pkt1, PacketPtr pkt2); + void updatePhysAddresses(PacketPtr pkt); + void regStats(); + + // Clock related functions. Maps to-and-from + // Simulation ticks and object clocks. + Tick frequency() const { return SimClock::Frequency / clock; } + Tick ticks(int numCycles) const { return (Tick)clock * numCycles; } + Tick curCycle() const { return curTick() / clock; } + Tick tickToCycles(Tick val) const { return val / clock;} + + class CpuSidePort : public SlavePort + { + public: + CpuSidePort(const std::string &_name, TLBCoalescer *tlb_coalescer, + PortID _index) + : SlavePort(_name, tlb_coalescer), coalescer(tlb_coalescer), + index(_index) { } + + protected: + TLBCoalescer *coalescer; + int index; + + virtual bool recvTimingReq(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt) { return 0; } + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRangeChange() { } + virtual void recvReqRetry(); + + virtual void + recvRespRetry() + { + fatal("recvRespRetry() is not implemented in the TLB coalescer.\n"); + } + + virtual AddrRangeList getAddrRanges() const; + }; + + class MemSidePort : public MasterPort + { + public: + MemSidePort(const std::string &_name, TLBCoalescer *tlb_coalescer, + PortID _index) + : MasterPort(_name, tlb_coalescer), coalescer(tlb_coalescer), + index(_index) { } + + std::deque<PacketPtr> retries; + + protected: + TLBCoalescer *coalescer; + int index; + + virtual bool recvTimingResp(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt) { return 0; } + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRangeChange() { } + virtual void recvReqRetry(); + + virtual void + recvRespRetry() + { + fatal("recvRespRetry() not implemented in TLB coalescer"); + } + }; + + // Coalescer slave ports on the cpu Side + std::vector<CpuSidePort*> cpuSidePort; + // Coalescer master ports on the memory side + std::vector<MemSidePort*> memSidePort; + + BaseMasterPort& getMasterPort(const std::string &if_name, PortID idx); + BaseSlavePort& getSlavePort(const std::string &if_name, PortID idx); + + class IssueProbeEvent : public 
Event + { + private: + TLBCoalescer *coalescer; + + public: + IssueProbeEvent(TLBCoalescer *_coalescer); + void process(); + const char *description() const; + }; + + // this event issues the TLB probes + IssueProbeEvent probeTLBEvent; + + // the cleanupEvent is scheduled after a TLBEvent triggers + // in order to free memory and do the required clean-up + class CleanupEvent : public Event + { + private: + TLBCoalescer *coalescer; + + public: + CleanupEvent(TLBCoalescer *_coalescer); + void process(); + const char* description() const; + }; + + // schedule cleanup + CleanupEvent cleanupEvent; + + // this FIFO queue keeps track of the virt. page + // addresses that are pending cleanup + std::queue<Addr> cleanupQueue; +}; + +#endif // __TLB_COALESCER_HH__ diff --git a/src/gpu-compute/vector_register_file.cc b/src/gpu-compute/vector_register_file.cc new file mode 100644 index 000000000..8b7dc0691 --- /dev/null +++ b/src/gpu-compute/vector_register_file.cc @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: John Kalamatianos + */ + +#include "gpu-compute/vector_register_file.hh" + +#include <string> + +#include "base/misc.hh" +#include "gpu-compute/code_enums.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/simple_pool_manager.hh" +#include "gpu-compute/wavefront.hh" +#include "params/VectorRegisterFile.hh" + +VectorRegisterFile::VectorRegisterFile(const VectorRegisterFileParams *p) + : SimObject(p), + manager(new SimplePoolManager(p->min_alloc, p->num_regs_per_simd)), + simdId(p->simd_id), numRegsPerSimd(p->num_regs_per_simd), + vgprState(new VecRegisterState()) +{ + fatal_if(numRegsPerSimd % 2, "VRF size is illegal\n"); + fatal_if(simdId < 0, "Illegal SIMD id for VRF"); + + fatal_if(numRegsPerSimd % p->min_alloc, "Min VGPR region allocation is not " + "multiple of VRF size\n"); + + busy.clear(); + busy.resize(numRegsPerSimd, 0); + nxtBusy.clear(); + nxtBusy.resize(numRegsPerSimd, 0); + + vgprState->init(numRegsPerSimd); +} + +void +VectorRegisterFile::setParent(ComputeUnit *_computeUnit) +{ + computeUnit = _computeUnit; + vgprState->setParent(computeUnit); +} + +uint8_t +VectorRegisterFile::regNxtBusy(int idx, uint32_t operandSize) const +{ + uint8_t status = nxtBusy.at(idx); + + if (operandSize > 4) { + status = status | (nxtBusy.at((idx + 1) % numRegs())); + } + + return status; +} + +uint8_t +VectorRegisterFile::regBusy(int idx, uint32_t operandSize) const +{ + uint8_t status = busy.at(idx); + + if (operandSize > 4) { + status = status | (busy.at((idx + 1) % numRegs())); + } + + return status; +} + +void +VectorRegisterFile::preMarkReg(int regIdx, uint32_t operandSize, uint8_t value) +{ + nxtBusy.at(regIdx) = value; + + if (operandSize > 4) { + nxtBusy.at((regIdx + 1) % numRegs()) = value; + } +} + +void +VectorRegisterFile::markReg(int regIdx, uint32_t operandSize, uint8_t value) +{ + busy.at(regIdx) = value; + + if (operandSize > 4) { + busy.at((regIdx + 1) % numRegs()) = value; + } +} + +bool +VectorRegisterFile::operandsReady(Wavefront *w, GPUDynInstPtr ii) const +{ + for (int i = 0; i < ii->getNumOperands(); ++i) { + if (ii->isVectorRegister(i)) { + uint32_t vgprIdx = ii->getRegisterIndex(i); + uint32_t pVgpr = w->remap(vgprIdx, ii->getOperandSize(i), 1); + + if (regBusy(pVgpr, ii->getOperandSize(i)) == 1) { + if (ii->isDstOperand(i)) { + w->numTimesBlockedDueWAXDependencies++; + } else if (ii->isSrcOperand(i)) { + w->numTimesBlockedDueRAWDependencies++; + } + + return false; + } + + if (regNxtBusy(pVgpr, ii->getOperandSize(i)) == 1) { + if (ii->isDstOperand(i)) { + w->numTimesBlockedDueWAXDependencies++; + } else if (ii->isSrcOperand(i)) { + w->numTimesBlockedDueRAWDependencies++; + } + + return false; + } + } + } + + return true; +} + +void +VectorRegisterFile::exec(GPUDynInstPtr ii, Wavefront *w) +{ + bool loadInstr = IS_OT_READ(ii->opType()); + bool atomicInstr = IS_OT_ATOMIC(ii->opType()); + + bool loadNoArgInstr = loadInstr && !ii->isArgLoad(); + + // iterate over all register destination operands + for (int i = 0; i < ii->getNumOperands(); ++i) { + if (ii->isVectorRegister(i) && ii->isDstOperand(i)) { + uint32_t physReg = w->remap(ii->getRegisterIndex(i), + ii->getOperandSize(i), 1); + + // mark the destination vector register as busy + markReg(physReg, ii->getOperandSize(i), 1); + // clear the in-flight status of the destination vector register + preMarkReg(physReg, ii->getOperandSize(i), 0); + + // FIXME: if we ever model correct timing behavior + // for load argument 
instructions then we should not + // set the destination register as busy now but when + // the data returns. Loads and Atomics should free + // their destination registers when the data returns, + // not now + if (!atomicInstr && !loadNoArgInstr) { + uint32_t pipeLen = ii->getOperandSize(i) <= 4 ? + computeUnit->spBypassLength() : + computeUnit->dpBypassLength(); + + // schedule an event for marking the register as ready + computeUnit->registerEvent(w->simdId, physReg, + ii->getOperandSize(i), + computeUnit->shader->tick_cnt + + computeUnit->shader->ticks(pipeLen), + 0); + } + } + } +} + +int +VectorRegisterFile::exec(uint64_t dynamic_id, Wavefront *w, + std::vector<uint32_t> ®Vec, uint32_t operandSize, + uint64_t timestamp) +{ + int delay = 0; + + panic_if(regVec.size() <= 0, "Illegal VGPR vector size=%d\n", + regVec.size()); + + for (int i = 0; i < regVec.size(); ++i) { + // mark the destination VGPR as free when the timestamp expires + computeUnit->registerEvent(w->simdId, regVec[i], operandSize, + computeUnit->shader->tick_cnt + timestamp + + computeUnit->shader->ticks(delay), 0); + } + + return delay; +} + +void +VectorRegisterFile::updateResources(Wavefront *w, GPUDynInstPtr ii) +{ + // iterate over all register destination operands + for (int i = 0; i < ii->getNumOperands(); ++i) { + if (ii->isVectorRegister(i) && ii->isDstOperand(i)) { + uint32_t physReg = w->remap(ii->getRegisterIndex(i), + ii->getOperandSize(i), 1); + // set the in-flight status of the destination vector register + preMarkReg(physReg, ii->getOperandSize(i), 1); + } + } +} + +bool +VectorRegisterFile::vrfOperandAccessReady(uint64_t dynamic_id, Wavefront *w, + GPUDynInstPtr ii, + VrfAccessType accessType) +{ + bool ready = true; + + return ready; +} + +bool +VectorRegisterFile::vrfOperandAccessReady(Wavefront *w, GPUDynInstPtr ii, + VrfAccessType accessType) +{ + bool ready = true; + + return ready; +} + +VectorRegisterFile* +VectorRegisterFileParams::create() +{ + return new VectorRegisterFile(this); +} diff --git a/src/gpu-compute/vector_register_file.hh b/src/gpu-compute/vector_register_file.hh new file mode 100644 index 000000000..1cb011a1e --- /dev/null +++ b/src/gpu-compute/vector_register_file.hh @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos + */ + +#ifndef __VECTOR_REGISTER_FILE_HH__ +#define __VECTOR_REGISTER_FILE_HH__ + +#include <list> + +#include "base/statistics.hh" +#include "base/types.hh" +#include "gpu-compute/vector_register_state.hh" +#include "sim/sim_object.hh" + +class ComputeUnit; +class Shader; +class SimplePoolManager; +class Wavefront; + +struct VectorRegisterFileParams; + +enum class VrfAccessType : uint8_t +{ + READ = 0x01, + WRITE = 0x02, + RD_WR = READ | WRITE +}; + +// Vector Register File +class VectorRegisterFile : public SimObject +{ + public: + VectorRegisterFile(const VectorRegisterFileParams *p); + + void setParent(ComputeUnit *_computeUnit); + + // Read a register + template<typename T> + T + read(int regIdx, int threadId=0) + { + T p0 = vgprState->read<T>(regIdx, threadId); + + return p0; + } + + // Write a register + template<typename T> + void + write(int regIdx, T value, int threadId=0) + { + vgprState->write<T>(regIdx, value, threadId); + } + + uint8_t regBusy(int idx, uint32_t operandSize) const; + uint8_t regNxtBusy(int idx, uint32_t operandSize) const; + + int numRegs() const { return numRegsPerSimd; } + + void markReg(int regIdx, uint32_t operandSize, uint8_t value); + void preMarkReg(int regIdx, uint32_t operandSize, uint8_t value); + + virtual void exec(GPUDynInstPtr ii, Wavefront *w); + + virtual int exec(uint64_t dynamic_id, Wavefront *w, + std::vector<uint32_t> ®Vec, uint32_t operandSize, + uint64_t timestamp); + + bool operandsReady(Wavefront *w, GPUDynInstPtr ii) const; + virtual void updateEvents() { } + virtual void updateResources(Wavefront *w, GPUDynInstPtr ii); + + virtual bool + isReadConflict(int memWfId, int exeWfId) const + { + return false; + } + + virtual bool + isWriteConflict(int memWfId, int exeWfId) const + { + return false; + } + + virtual bool vrfOperandAccessReady(uint64_t dynamic_id, Wavefront *w, + GPUDynInstPtr ii, + VrfAccessType accessType); + + virtual bool vrfOperandAccessReady(Wavefront *w, GPUDynInstPtr ii, + VrfAccessType accessType); + + SimplePoolManager *manager; + + protected: + ComputeUnit* computeUnit; + int simdId; + + // flag indicating if a register is busy + std::vector<uint8_t> busy; + // flag indicating if a register will be busy (by instructions + // in the SIMD pipeline) + std::vector<uint8_t> nxtBusy; + + // numer of registers (bank size) per simd unit (bank) + int numRegsPerSimd; + + // vector register state + VecRegisterState *vgprState; +}; + +#endif // __VECTOR_REGISTER_FILE_HH__ diff --git a/src/gpu-compute/vector_register_state.cc b/src/gpu-compute/vector_register_state.cc new file mode 100644 index 000000000..f231b0579 --- /dev/null +++ b/src/gpu-compute/vector_register_state.cc @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos + */ + +#include "gpu-compute/vector_register_state.hh" + +#include "gpu-compute/compute_unit.hh" + +VecRegisterState::VecRegisterState() : computeUnit(nullptr) +{ + s_reg.clear(); + d_reg.clear(); +} + +void +VecRegisterState::setParent(ComputeUnit *_computeUnit) +{ + computeUnit = _computeUnit; + _name = computeUnit->name() + ".VecRegState"; +} + +void +VecRegisterState::init(uint32_t _size) +{ + s_reg.resize(_size); + d_reg.resize(_size); +} diff --git a/src/gpu-compute/vector_register_state.hh b/src/gpu-compute/vector_register_state.hh new file mode 100644 index 000000000..a233b9acc --- /dev/null +++ b/src/gpu-compute/vector_register_state.hh @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos + */ + +#ifndef __VECTOR_REGISTER_STATE_HH__ +#define __VECTOR_REGISTER_STATE_HH__ + +#include <array> +#include <cassert> +#include <string> +#include <vector> + +#include "gpu-compute/misc.hh" + +class ComputeUnit; + +// Vector Register State per SIMD unit (contents of the vector +// registers in the VRF of the SIMD) +class VecRegisterState +{ + public: + VecRegisterState(); + void init(uint32_t _size); + + const std::string& name() const { return _name; } + void setParent(ComputeUnit *_computeUnit); + void regStats() { } + + // Access methods + template<typename T> + T + read(int regIdx, int threadId=0) { + T *p0; + assert(sizeof(T) == 4 || sizeof(T) == 8); + if (sizeof(T) == 4) { + p0 = (T*)(&s_reg[regIdx][threadId]); + } else { + p0 = (T*)(&d_reg[regIdx][threadId]); + } + + return *p0; + } + + template<typename T> + void + write(unsigned int regIdx, T value, int threadId=0) { + T *p0; + assert(sizeof(T) == 4 || sizeof(T) == 8); + if (sizeof(T) == 4) { + p0 = (T*)(&s_reg[regIdx][threadId]); + } else { + p0 = (T*)(&d_reg[regIdx][threadId]); + } + + *p0 = value; + } + + // (Single Precision) Vector Register File size. + int regSize() { return s_reg.size(); } + + private: + ComputeUnit *computeUnit; + std::string _name; + // 32-bit Single Precision Vector Register State + std::vector<std::array<uint32_t, VSZ>> s_reg; + // 64-bit Double Precision Vector Register State + std::vector<std::array<uint64_t, VSZ>> d_reg; +}; + +#endif // __VECTOR_REGISTER_STATE_HH__ diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc new file mode 100644 index 000000000..0aa033db1 --- /dev/null +++ b/src/gpu-compute/wavefront.cc @@ -0,0 +1,925 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Lisa Hsu + */ + +#include "gpu-compute/wavefront.hh" + +#include "debug/GPUExec.hh" +#include "debug/WavefrontStack.hh" +#include "gpu-compute/code_enums.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/vector_register_file.hh" + +Wavefront* +WavefrontParams::create() +{ + return new Wavefront(this); +} + +Wavefront::Wavefront(const Params *p) + : SimObject(p), callArgMem(nullptr) +{ + last_trace = 0; + simdId = p->simdId; + wfSlotId = p->wf_slot_id; + + status = S_STOPPED; + reservedVectorRegs = 0; + startVgprIndex = 0; + outstanding_reqs = 0; + mem_reqs_in_pipe = 0; + outstanding_reqs_wr_gm = 0; + outstanding_reqs_wr_lm = 0; + outstanding_reqs_rd_gm = 0; + outstanding_reqs_rd_lm = 0; + rd_lm_reqs_in_pipe = 0; + rd_gm_reqs_in_pipe = 0; + wr_lm_reqs_in_pipe = 0; + wr_gm_reqs_in_pipe = 0; + + barrier_cnt = 0; + old_barrier_cnt = 0; + stalledAtBarrier = false; + + mem_trace_busy = 0; + old_vgpr_tcnt = 0xffffffffffffffffll; + old_dgpr_tcnt = 0xffffffffffffffffll; + + pendingFetch = false; + dropFetch = false; + condRegState = new ConditionRegisterState(); + maxSpVgprs = 0; + maxDpVgprs = 0; +} + +void +Wavefront::regStats() +{ + srcRegOpDist + .init(0, 4, 2) + .name(name() + ".src_reg_operand_dist") + .desc("number of executed instructions with N source register operands") + ; + + dstRegOpDist + .init(0, 3, 2) + .name(name() + ".dst_reg_operand_dist") + .desc("number of executed instructions with N destination register " + "operands") + ; + + // FIXME: the name of the WF needs to be unique + numTimesBlockedDueWAXDependencies + .name(name() + ".timesBlockedDueWAXDependencies") + .desc("number of times the wf's instructions are blocked due to WAW " + "or WAR dependencies") + ; + + // FIXME: the name of the WF needs to be unique + numTimesBlockedDueRAWDependencies + .name(name() + ".timesBlockedDueRAWDependencies") + .desc("number of times the wf's instructions are blocked due to RAW " + "dependencies") + ; + + // FIXME: the name of the WF needs to be unique + numTimesBlockedDueVrfPortAvail + .name(name() + ".timesBlockedDueVrfPortAvail") + .desc("number of times instructions are blocked due to VRF port " + "availability") + ; +} + +void +Wavefront::init() +{ + reservedVectorRegs = 0; + startVgprIndex = 0; +} + +void +Wavefront::resizeRegFiles(int num_cregs, int num_sregs, int num_dregs) +{ + condRegState->init(num_cregs); + maxSpVgprs = num_sregs; + maxDpVgprs = num_dregs; +} + +Wavefront::~Wavefront() +{ + if (callArgMem) + delete callArgMem; +} + +void +Wavefront::start(uint64_t _wfDynId,uint64_t _base_ptr) +{ + wfDynId = _wfDynId; + base_ptr = _base_ptr; + status = S_RUNNING; +} + +bool +Wavefront::isGmInstruction(GPUDynInstPtr ii) +{ + if (IS_OT_READ_PM(ii->opType()) || IS_OT_WRITE_PM(ii->opType()) || + IS_OT_ATOMIC_PM(ii->opType())) { + return true; + } + + if (IS_OT_READ_GM(ii->opType()) || IS_OT_WRITE_GM(ii->opType()) || + 
IS_OT_ATOMIC_GM(ii->opType())) { + + return true; + } + + if (IS_OT_FLAT(ii->opType())) { + return true; + } + + return false; +} + +bool +Wavefront::isLmInstruction(GPUDynInstPtr ii) +{ + if (IS_OT_READ_LM(ii->opType()) || IS_OT_WRITE_LM(ii->opType()) || + IS_OT_ATOMIC_LM(ii->opType())) { + return true; + } + + return false; +} + +bool +Wavefront::isOldestInstALU() +{ + assert(!instructionBuffer.empty()); + GPUDynInstPtr ii = instructionBuffer.front(); + + if (status != S_STOPPED && (ii->opType() == Enums::OT_NOP || + ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH || + ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) || + ii->opType() == Enums::OT_KERN_READ)) { + return true; + } + + return false; +} + +bool +Wavefront::isOldestInstBarrier() +{ + assert(!instructionBuffer.empty()); + GPUDynInstPtr ii = instructionBuffer.front(); + + if (status != S_STOPPED && ii->opType() == Enums::OT_BARRIER) { + return true; + } + + return false; +} + +bool +Wavefront::isOldestInstGMem() +{ + assert(!instructionBuffer.empty()); + GPUDynInstPtr ii = instructionBuffer.front(); + + if (status != S_STOPPED && (IS_OT_READ_GM(ii->opType()) || + IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()))) { + + return true; + } + + return false; +} + +bool +Wavefront::isOldestInstLMem() +{ + assert(!instructionBuffer.empty()); + GPUDynInstPtr ii = instructionBuffer.front(); + + if (status != S_STOPPED && (IS_OT_READ_LM(ii->opType()) || + IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()))) { + + return true; + } + + return false; +} + +bool +Wavefront::isOldestInstPrivMem() +{ + assert(!instructionBuffer.empty()); + GPUDynInstPtr ii = instructionBuffer.front(); + + if (status != S_STOPPED && (IS_OT_READ_PM(ii->opType()) || + IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()))) { + + return true; + } + + return false; +} + +bool +Wavefront::isOldestInstFlatMem() +{ + assert(!instructionBuffer.empty()); + GPUDynInstPtr ii = instructionBuffer.front(); + + if (status != S_STOPPED && IS_OT_FLAT(ii->opType())) { + + return true; + } + + return false; +} + +// Return true if the Wavefront's instruction +// buffer has branch instruction. +bool +Wavefront::instructionBufferHasBranch() +{ + for (auto it : instructionBuffer) { + GPUDynInstPtr ii = it; + + if (ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH) { + return true; + } + } + + return false; +} + +// Remap HSAIL register to physical VGPR. +// HSAIL register = virtual register assigned to an operand by HLC compiler +uint32_t +Wavefront::remap(uint32_t vgprIndex, uint32_t size, uint8_t mode) +{ + assert((vgprIndex < reservedVectorRegs) && (reservedVectorRegs > 0)); + // add the offset from where the VGPRs of the wavefront have been assigned + uint32_t physicalVgprIndex = startVgprIndex + vgprIndex; + // HSAIL double precision (DP) register: calculate the physical VGPR index + // assuming that DP registers are placed after SP ones in the VRF. 
The DP + // and SP VGPR name spaces in HSAIL mode are separate so we need to adjust + // the DP VGPR index before mapping it to the physical VRF address space + if (mode == 1 && size > 4) { + physicalVgprIndex = startVgprIndex + maxSpVgprs + (2 * vgprIndex); + } + + assert((startVgprIndex <= physicalVgprIndex) && + (startVgprIndex + reservedVectorRegs - 1) >= physicalVgprIndex); + + // calculate absolute physical VGPR index + return physicalVgprIndex % computeUnit->vrf[simdId]->numRegs(); +} + +// Return true if this wavefront is ready +// to execute an instruction of the specified type. +int +Wavefront::ready(itype_e type) +{ + // Check to make sure wave is running + if (status == S_STOPPED || status == S_RETURNING || + instructionBuffer.empty()) { + return 0; + } + + // Is the wave waiting at a barrier + if (stalledAtBarrier) { + if (!computeUnit->AllAtBarrier(barrier_id,barrier_cnt, + computeUnit->getRefCounter(dispatchid, wg_id))) { + // Are all threads at barrier? + return 0; + } + old_barrier_cnt = barrier_cnt; + stalledAtBarrier = false; + } + + // Read instruction + GPUDynInstPtr ii = instructionBuffer.front(); + + bool ready_inst M5_VAR_USED = false; + bool glbMemBusRdy = false; + bool glbMemIssueRdy = false; + if (type == I_GLOBAL || type == I_FLAT || type == I_PRIVATE) { + for (int j=0; j < computeUnit->numGlbMemUnits; ++j) { + if (computeUnit->vrfToGlobalMemPipeBus[j].prerdy()) + glbMemBusRdy = true; + if (computeUnit->wfWait[j].prerdy()) + glbMemIssueRdy = true; + } + } + bool locMemBusRdy = false; + bool locMemIssueRdy = false; + if (type == I_SHARED) { + for (int j=0; j < computeUnit->numLocMemUnits; ++j) { + if (computeUnit->vrfToLocalMemPipeBus[j].prerdy()) + locMemBusRdy = true; + if (computeUnit->wfWait[j].prerdy()) + locMemIssueRdy = true; + } + } + + // The following code is very error prone and the entire process for + // checking readiness will be fixed eventually. In the meantime, let's + // make sure that we do not silently let an instruction type slip + // through this logic and always return not ready. + if (!(ii->opType() == Enums::OT_BARRIER || ii->opType() == Enums::OT_NOP || + ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH || + ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) || + ii->opType() == Enums::OT_KERN_READ || + ii->opType() == Enums::OT_ARG || + IS_OT_READ_GM(ii->opType()) || IS_OT_WRITE_GM(ii->opType()) || + IS_OT_ATOMIC_GM(ii->opType()) || IS_OT_READ_LM(ii->opType()) || + IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()) || + IS_OT_READ_PM(ii->opType()) || IS_OT_WRITE_PM(ii->opType()) || + IS_OT_ATOMIC_PM(ii->opType()) || IS_OT_FLAT(ii->opType()))) { + panic("next instruction: %s is of unknown type\n", ii->disassemble()); + } + + DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking Read for Inst : %s\n", + computeUnit->cu_id, simdId, wfSlotId, ii->disassemble()); + + if (type == I_ALU && ii->opType() == Enums::OT_BARRIER) { + // Here for ALU instruction (barrier) + if (!computeUnit->wfWait[simdId].prerdy()) { + // Is wave slot free? + return 0; + } + + // Are there in pipe or outstanding memory requests? + if ((outstanding_reqs + mem_reqs_in_pipe) > 0) { + return 0; + } + + ready_inst = true; + } else if (type == I_ALU && ii->opType() == Enums::OT_NOP) { + // Here for ALU instruction (nop) + if (!computeUnit->wfWait[simdId].prerdy()) { + // Is wave slot free? 
+ return 0; + } + + ready_inst = true; + } else if (type == I_ALU && ii->opType() == Enums::OT_RET) { + // Here for ALU instruction (return) + if (!computeUnit->wfWait[simdId].prerdy()) { + // Is wave slot free? + return 0; + } + + // Are there in pipe or outstanding memory requests? + if ((outstanding_reqs + mem_reqs_in_pipe) > 0) { + return 0; + } + + ready_inst = true; + } else if (type == I_ALU && (ii->opType() == Enums::OT_BRANCH || + ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) || + ii->opType() == Enums::OT_KERN_READ || + ii->opType() == Enums::OT_ARG)) { + // Here for ALU instruction (all others) + if (!computeUnit->wfWait[simdId].prerdy()) { + // Is alu slot free? + return 0; + } + if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, + VrfAccessType::RD_WR)) { + return 0; + } + + if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { + return 0; + } + ready_inst = true; + } else if (type == I_GLOBAL && (IS_OT_READ_GM(ii->opType()) || + IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()))) { + // Here Global memory instruction + if (IS_OT_READ_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType())) { + // Are there in pipe or outstanding global memory write requests? + if ((outstanding_reqs_wr_gm + wr_gm_reqs_in_pipe) > 0) { + return 0; + } + } + + if (IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()) || + IS_OT_HIST_GM(ii->opType())) { + // Are there in pipe or outstanding global memory read requests? + if ((outstanding_reqs_rd_gm + rd_gm_reqs_in_pipe) > 0) + return 0; + } + + if (!glbMemIssueRdy) { + // Is WV issue slot free? + return 0; + } + + if (!glbMemBusRdy) { + // Is there an available VRF->Global memory read bus? + return 0; + } + + if (!computeUnit->globalMemoryPipe. + isGMReqFIFOWrRdy(rd_gm_reqs_in_pipe + wr_gm_reqs_in_pipe)) { + // Can we insert a new request to the Global Mem Request FIFO? + return 0; + } + // can we schedule source & destination operands on the VRF? + if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, + VrfAccessType::RD_WR)) { + return 0; + } + if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { + return 0; + } + ready_inst = true; + } else if (type == I_SHARED && (IS_OT_READ_LM(ii->opType()) || + IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()))) { + // Here for Shared memory instruction + if (IS_OT_READ_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType())) { + if ((outstanding_reqs_wr_lm + wr_lm_reqs_in_pipe) > 0) { + return 0; + } + } + + if (IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()) || + IS_OT_HIST_LM(ii->opType())) { + if ((outstanding_reqs_rd_lm + rd_lm_reqs_in_pipe) > 0) { + return 0; + } + } + + if (!locMemBusRdy) { + // Is there an available VRF->LDS read bus? + return 0; + } + if (!locMemIssueRdy) { + // Is wave slot free? + return 0; + } + + if (!computeUnit->localMemoryPipe. + isLMReqFIFOWrRdy(rd_lm_reqs_in_pipe + wr_lm_reqs_in_pipe)) { + // Can we insert a new request to the LDS Request FIFO? + return 0; + } + // can we schedule source & destination operands on the VRF? 
+ if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, + VrfAccessType::RD_WR)) { + return 0; + } + if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { + return 0; + } + ready_inst = true; + } else if (type == I_PRIVATE && (IS_OT_READ_PM(ii->opType()) || + IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()))) { + // Here for Private memory instruction ------------------------ // + if (IS_OT_READ_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType())) { + if ((outstanding_reqs_wr_gm + wr_gm_reqs_in_pipe) > 0) { + return 0; + } + } + + if (IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()) || + IS_OT_HIST_PM(ii->opType())) { + if ((outstanding_reqs_rd_gm + rd_gm_reqs_in_pipe) > 0) { + return 0; + } + } + + if (!glbMemBusRdy) { + // Is there an available VRF->Global memory read bus? + return 0; + } + + if (!glbMemIssueRdy) { + // Is wave slot free? + return 0; + } + + if (!computeUnit->globalMemoryPipe. + isGMReqFIFOWrRdy(rd_gm_reqs_in_pipe + wr_gm_reqs_in_pipe)) { + // Can we insert a new request to the Global Mem Request FIFO? + return 0; + } + // can we schedule source & destination operands on the VRF? + if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, + VrfAccessType::RD_WR)) { + return 0; + } + if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { + return 0; + } + ready_inst = true; + } else if (type == I_FLAT && IS_OT_FLAT(ii->opType())) { + if (!glbMemBusRdy) { + // Is there an available VRF->Global memory read bus? + return 0; + } + + if (!locMemBusRdy) { + // Is there an available VRF->LDS read bus? + return 0; + } + + if (!glbMemIssueRdy) { + // Is wave slot free? + return 0; + } + + if (!locMemIssueRdy) { + return 0; + } + if (!computeUnit->globalMemoryPipe. + isGMReqFIFOWrRdy(rd_gm_reqs_in_pipe + wr_gm_reqs_in_pipe)) { + // Can we insert a new request to the Global Mem Request FIFO? + return 0; + } + + if (!computeUnit->localMemoryPipe. + isLMReqFIFOWrRdy(rd_lm_reqs_in_pipe + wr_lm_reqs_in_pipe)) { + // Can we insert a new request to the LDS Request FIFO? + return 0; + } + // can we schedule source & destination operands on the VRF? + if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, + VrfAccessType::RD_WR)) { + return 0; + } + // are all the operands ready? (RAW, WAW and WAR depedencies met?) + if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { + return 0; + } + ready_inst = true; + } else { + return 0; + } + + assert(ready_inst); + + DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit->cu_id, + simdId, wfSlotId, ii->disassemble()); + + return 1; +} + +void +Wavefront::updateResources() +{ + // Get current instruction + GPUDynInstPtr ii = instructionBuffer.front(); + assert(ii); + computeUnit->vrf[simdId]->updateResources(this, ii); + // Single precision ALU or Branch or Return or Special instruction + if (ii->opType() == Enums::OT_ALU || ii->opType() == Enums::OT_SPECIAL || + ii->opType() == Enums::OT_BRANCH || IS_OT_LDAS(ii->opType()) || + // FIXME: Kernel argument loads are currently treated as ALU operations + // since we don't send memory packets at execution. 
If we fix that then + // we should map them to one of the memory pipelines + ii->opType()==Enums::OT_KERN_READ || + ii->opType()==Enums::OT_ARG || + ii->opType()==Enums::OT_RET) { + computeUnit->aluPipe[simdId].preset(computeUnit->shader-> + ticks(computeUnit->spBypassLength())); + // this is to enforce a fixed number of cycles per issue slot per SIMD + computeUnit->wfWait[simdId].preset(computeUnit->shader-> + ticks(computeUnit->issuePeriod)); + } else if (ii->opType() == Enums::OT_BARRIER) { + computeUnit->wfWait[simdId].preset(computeUnit->shader-> + ticks(computeUnit->issuePeriod)); + } else if (ii->opType() == Enums::OT_FLAT_READ) { + assert(Enums::SC_NONE != ii->executedAs()); + mem_reqs_in_pipe++; + rd_gm_reqs_in_pipe++; + if ( Enums::SC_SHARED == ii->executedAs() ) { + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + preset(computeUnit->shader->ticks(4)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else { + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + preset(computeUnit->shader->ticks(4)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } + } else if (ii->opType() == Enums::OT_FLAT_WRITE) { + assert(Enums::SC_NONE != ii->executedAs()); + mem_reqs_in_pipe++; + wr_gm_reqs_in_pipe++; + if (Enums::SC_SHARED == ii->executedAs()) { + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + preset(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else { + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + preset(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } + } else if (IS_OT_READ_GM(ii->opType())) { + mem_reqs_in_pipe++; + rd_gm_reqs_in_pipe++; + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + preset(computeUnit->shader->ticks(4)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_WRITE_GM(ii->opType())) { + mem_reqs_in_pipe++; + wr_gm_reqs_in_pipe++; + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + preset(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_ATOMIC_GM(ii->opType())) { + mem_reqs_in_pipe++; + wr_gm_reqs_in_pipe++; + rd_gm_reqs_in_pipe++; + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + preset(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_READ_LM(ii->opType())) { + mem_reqs_in_pipe++; + rd_lm_reqs_in_pipe++; + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + preset(computeUnit->shader->ticks(4)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_WRITE_LM(ii->opType())) { + mem_reqs_in_pipe++; + wr_lm_reqs_in_pipe++; + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + preset(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. 
+ preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_ATOMIC_LM(ii->opType())) { + mem_reqs_in_pipe++; + wr_lm_reqs_in_pipe++; + rd_lm_reqs_in_pipe++; + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + preset(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_READ_PM(ii->opType())) { + mem_reqs_in_pipe++; + rd_gm_reqs_in_pipe++; + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + preset(computeUnit->shader->ticks(4)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_WRITE_PM(ii->opType())) { + mem_reqs_in_pipe++; + wr_gm_reqs_in_pipe++; + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + preset(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_ATOMIC_PM(ii->opType())) { + mem_reqs_in_pipe++; + wr_gm_reqs_in_pipe++; + rd_gm_reqs_in_pipe++; + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + preset(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } +} + +void +Wavefront::exec() +{ + // ---- Exit if wavefront is inactive ----------------------------- // + + if (status == S_STOPPED || status == S_RETURNING || + instructionBuffer.empty()) { + return; + } + + // Get current instruction + + GPUDynInstPtr ii = instructionBuffer.front(); + + const uint32_t old_pc = pc(); + DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s " + "(pc: %i)\n", computeUnit->cu_id, simdId, wfSlotId, wfDynId, + ii->disassemble(), old_pc); + ii->execute(); + // access the VRF + computeUnit->vrf[simdId]->exec(ii, this); + srcRegOpDist.sample(ii->numSrcRegOperands()); + dstRegOpDist.sample(ii->numDstRegOperands()); + computeUnit->numInstrExecuted++; + computeUnit->execRateDist.sample(computeUnit->totalCycles.value() - + computeUnit->lastExecCycle[simdId]); + computeUnit->lastExecCycle[simdId] = computeUnit->totalCycles.value(); + if (pc() == old_pc) { + uint32_t new_pc = old_pc + 1; + // PC not modified by instruction, proceed to next or pop frame + pc(new_pc); + if (new_pc == rpc()) { + popFromReconvergenceStack(); + discardFetch(); + } else { + instructionBuffer.pop_front(); + } + } + + if (computeUnit->shader->hsail_mode==Shader::SIMT) { + const int num_active_lanes = execMask().count(); + computeUnit->controlFlowDivergenceDist.sample(num_active_lanes); + computeUnit->numVecOpsExecuted += num_active_lanes; + if (isGmInstruction(ii)) { + computeUnit->activeLanesPerGMemInstrDist.sample(num_active_lanes); + } else if (isLmInstruction(ii)) { + computeUnit->activeLanesPerLMemInstrDist.sample(num_active_lanes); + } + } + + // ---- Update Vector ALU pipeline and other resources ------------------ // + // Single precision ALU or Branch or Return or Special instruction + if (ii->opType() == Enums::OT_ALU || ii->opType() == Enums::OT_SPECIAL || + ii->opType() == Enums::OT_BRANCH || IS_OT_LDAS(ii->opType()) || + // FIXME: Kernel argument loads are currently treated as ALU operations + // since we don't send memory packets at execution. 
If we fix that then + // we should map them to one of the memory pipelines + ii->opType() == Enums::OT_KERN_READ || + ii->opType() == Enums::OT_ARG || + ii->opType() == Enums::OT_RET) { + computeUnit->aluPipe[simdId].set(computeUnit->shader-> + ticks(computeUnit->spBypassLength())); + + // this is to enforce a fixed number of cycles per issue slot per SIMD + computeUnit->wfWait[simdId].set(computeUnit->shader-> + ticks(computeUnit->issuePeriod)); + } else if (ii->opType() == Enums::OT_BARRIER) { + computeUnit->wfWait[simdId].set(computeUnit->shader-> + ticks(computeUnit->issuePeriod)); + } else if (ii->opType() == Enums::OT_FLAT_READ) { + assert(Enums::SC_NONE != ii->executedAs()); + + if (Enums::SC_SHARED == ii->executedAs()) { + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + set(computeUnit->shader->ticks(4)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. + set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else { + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + set(computeUnit->shader->ticks(4)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } + } else if (ii->opType() == Enums::OT_FLAT_WRITE) { + assert(Enums::SC_NONE != ii->executedAs()); + if (Enums::SC_SHARED == ii->executedAs()) { + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + set(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. + set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else { + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + set(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } + } else if (IS_OT_READ_GM(ii->opType())) { + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + set(computeUnit->shader->ticks(4)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_WRITE_GM(ii->opType())) { + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + set(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_ATOMIC_GM(ii->opType())) { + computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. + set(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->GlbMemUnitId()]. + set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_READ_LM(ii->opType())) { + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + set(computeUnit->shader->ticks(4)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. + set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_WRITE_LM(ii->opType())) { + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + set(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. + set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if (IS_OT_ATOMIC_LM(ii->opType())) { + computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. + set(computeUnit->shader->ticks(8)); + computeUnit->wfWait[computeUnit->ShrMemUnitId()]. 
+ set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } +} + +bool +Wavefront::waitingAtBarrier(int lane) +{ + return bar_cnt[lane] < max_bar_cnt; +} + +void +Wavefront::pushToReconvergenceStack(uint32_t pc, uint32_t rpc, + const VectorMask& mask) +{ + assert(mask.count()); + reconvergenceStack.emplace(new ReconvergenceStackEntry(pc, rpc, mask)); +} + +void +Wavefront::popFromReconvergenceStack() +{ + assert(!reconvergenceStack.empty()); + + DPRINTF(WavefrontStack, "[%2d, %2d, %2d, %2d] %s %3i => ", + computeUnit->cu_id, simdId, wfSlotId, wfDynId, + execMask().to_string<char, std::string::traits_type, + std::string::allocator_type>().c_str(), pc()); + + reconvergenceStack.pop(); + + DPRINTF(WavefrontStack, "%3i %s\n", pc(), + execMask().to_string<char, std::string::traits_type, + std::string::allocator_type>().c_str()); + +} + +void +Wavefront::discardFetch() +{ + instructionBuffer.clear(); + dropFetch |=pendingFetch; +} + +uint32_t +Wavefront::pc() const +{ + return reconvergenceStack.top()->pc; +} + +uint32_t +Wavefront::rpc() const +{ + return reconvergenceStack.top()->rpc; +} + +VectorMask +Wavefront::execMask() const +{ + return reconvergenceStack.top()->execMask; +} + +bool +Wavefront::execMask(int lane) const +{ + return reconvergenceStack.top()->execMask[lane]; +} + + +void +Wavefront::pc(uint32_t new_pc) +{ + reconvergenceStack.top()->pc = new_pc; +} diff --git a/src/gpu-compute/wavefront.hh b/src/gpu-compute/wavefront.hh new file mode 100644 index 000000000..0abab8e83 --- /dev/null +++ b/src/gpu-compute/wavefront.hh @@ -0,0 +1,368 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
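The wavefront code above tracks control-flow divergence with a reconvergence stack: a divergent branch pushes an entry carrying the taken-path PC, the reconvergence PC (the immediate post-dominator), and the active-lane mask, and Wavefront::exec() pops the top entry once the running PC reaches that reconvergence PC, restoring the wider mask underneath. The toy below sketches that flow under two assumptions not shown in this hunk: the branch instruction implementations are what call pushToReconvergenceStack(), and the fall-through entry's PC is assumed to be pointed at the reconvergence PC when the divergence is pushed.

    #include <bitset>
    #include <cstdint>
    #include <iostream>
    #include <stack>

    static const int NUM_LANES = 8; // placeholder for VSZ
    using Mask = std::bitset<NUM_LANES>;

    // Each entry holds the next PC for this path, the reconvergence PC
    // (immediate post-dominator), and the execution mask for the path.
    struct StackEntry
    {
        uint32_t pc;
        uint32_t rpc;
        Mask execMask;
    };

    class ToyWavefront
    {
      public:
        explicit ToyWavefront(Mask initMask)
        {
            // Bottom-of-stack entry: whole-kernel mask, "never" reconverges.
            rstack.push({0, UINT32_MAX, initMask});
        }

        uint32_t pc() const { return rstack.top().pc; }
        uint32_t rpc() const { return rstack.top().rpc; }
        Mask execMask() const { return rstack.top().execMask; }

        // Divergent branch: run takenPc under takenMask, reconverge at reconvPc.
        void pushDivergence(uint32_t takenPc, uint32_t reconvPc, Mask takenMask)
        {
            rstack.top().pc = reconvPc; // resume here under the old mask later
            rstack.push({takenPc, reconvPc, takenMask});
        }

        // Mirrors the end of exec(): advance the PC, pop at the reconvergence point.
        void advance()
        {
            uint32_t next = pc() + 1;
            rstack.top().pc = next;
            if (next == rpc())
                rstack.pop();
        }

      private:
        std::stack<StackEntry> rstack;
    };

    int main()
    {
        ToyWavefront wf(Mask(0xFF));            // all 8 lanes active
        wf.pushDivergence(10, 20, Mask(0x0F));  // lanes 0-3 take the branch
        while (wf.pc() != 20)                   // run the divergent path
            wf.advance();
        std::cout << "reconverged at pc " << wf.pc()
                  << " with mask " << wf.execMask() << "\n";
        return 0;
    }

Because reconvergence points are immediate post-dominators, nested divergence unwinds in strict LIFO order, which is why a stack is all the bookkeeping the model needs.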
+ * + * Author: Lisa Hsu + */ + +#ifndef __WAVEFRONT_HH__ +#define __WAVEFRONT_HH__ + +#include <cassert> +#include <deque> +#include <memory> +#include <stack> +#include <vector> + +#include "base/misc.hh" +#include "base/types.hh" +#include "gpu-compute/condition_register_state.hh" +#include "gpu-compute/lds_state.hh" +#include "gpu-compute/misc.hh" +#include "params/Wavefront.hh" +#include "sim/sim_object.hh" + +static const int MAX_NUM_INSTS_PER_WF = 12; + +/* + * Arguments for the hsail opcode call, are user defined and variable length. + * The hardware/finalizer can support arguments in hardware or use memory to + * pass arguments. For now, let's assume that an unlimited number of arguments + * are supported in hardware (the compiler inlines functions whenver it can + * anyways, so unless someone is interested in the implications of linking/ + * library functions, I think this is a reasonable assumption given the typical + * size of an OpenCL kernel). + * + * Note that call args are different than kernel arguments: + * * All work-items in a kernel refer the same set of kernel arguments + * * Each work-item has it's on set of call args. So a call argument at + * address 0x4 is different for work-item 0 and work-item 1. + * + * Ok, the table below shows an example of how we organize the call arguments in + * the CallArgMem class. + * + * int foo(int arg1, double arg2) + * ___________________________________________________ + * | 0: return.0 | 4: return.1 | ... | 252: return.63 | + * |---------------------------------------------------| + * | 256: arg1.0 | 260: arg1.1 | ... | 508: arg1.63 | + * |---------------------------------------------------| + * | 512: arg2.0 | 520: arg2.1 | ... | 1016: arg2.63 | + * ___________________________________________________ + */ +class CallArgMem +{ + public: + // pointer to buffer for storing function arguments + uint8_t *mem; + // size of function args + int funcArgsSizePerItem; + + template<typename CType> + int + getLaneOffset(int lane, int addr) + { + return addr * VSZ + sizeof(CType) * lane; + } + + CallArgMem(int func_args_size_per_item) + : funcArgsSizePerItem(func_args_size_per_item) + { + mem = (uint8_t*)malloc(funcArgsSizePerItem * VSZ); + } + + ~CallArgMem() + { + free(mem); + } + + template<typename CType> + uint8_t* + getLaneAddr(int lane, int addr) + { + return mem + getLaneOffset<CType>(lane, addr); + } + + template<typename CType> + void + setLaneAddr(int lane, int addr, CType val) + { + *((CType*)(mem + getLaneOffset<CType>(lane, addr))) = val; + } +}; + +/** + * A reconvergence stack entry conveys the necessary state to implement + * control flow divergence. + */ +class ReconvergenceStackEntry { + + public: + ReconvergenceStackEntry(uint32_t new_pc, uint32_t new_rpc, + VectorMask new_mask) : pc(new_pc), rpc(new_rpc), + execMask(new_mask) { + } + + /** + * PC of current instruction. + */ + uint32_t pc; + /** + * PC of the immediate post-dominator instruction, i.e., the value of + * @a pc for the first instruction that will be executed by the wavefront + * when a reconvergence point is reached. + */ + uint32_t rpc; + /** + * Execution mask. 
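The CallArgMem layout table above can be sanity-checked with its own getLaneOffset() arithmetic (addr * VSZ + sizeof(CType) * lane). The short check below assumes VSZ is 64 and that, per the table, the return value, arg1, and arg2 of int foo(int, double) sit at per-item addresses 0, 4, and 8.

    #include <cassert>
    #include <iostream>

    static const int VSZ = 64; // wavefront size assumed by the layout table

    // Reproduces CallArgMem::getLaneOffset(): all VSZ lane copies of a value
    // are laid out contiguously before the next argument begins.
    template<typename CType>
    int getLaneOffset(int lane, int addr)
    {
        return addr * VSZ + sizeof(CType) * lane;
    }

    int main()
    {
        assert(getLaneOffset<int>(0, 0) == 0);       // return.0
        assert(getLaneOffset<int>(63, 0) == 252);    // return.63
        assert(getLaneOffset<int>(0, 4) == 256);     // arg1.0
        assert(getLaneOffset<int>(63, 4) == 508);    // arg1.63
        assert(getLaneOffset<double>(0, 8) == 512);  // arg2.0
        assert(getLaneOffset<double>(1, 8) == 520);  // arg2.1
        std::cout << "offsets match the layout table\n";
        return 0;
    }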
+ */ + VectorMask execMask; +}; + +class Wavefront : public SimObject +{ + public: + enum itype_e {I_ALU,I_GLOBAL,I_SHARED,I_FLAT,I_PRIVATE}; + enum status_e {S_STOPPED,S_RETURNING,S_RUNNING}; + + // Base pointer for array of instruction pointers + uint64_t base_ptr; + + uint32_t old_barrier_cnt; + uint32_t barrier_cnt; + uint32_t barrier_id; + uint32_t barrier_slots; + status_e status; + // HW slot id where the WF is mapped to inside a SIMD unit + int wfSlotId; + int kern_id; + // SIMD unit where the WV has been scheduled + int simdId; + // pointer to parent CU + ComputeUnit *computeUnit; + + std::deque<GPUDynInstPtr> instructionBuffer; + + bool pendingFetch; + bool dropFetch; + + // Condition Register State (for HSAIL simulations only) + class ConditionRegisterState *condRegState; + // number of single precision VGPRs required by WF + uint32_t maxSpVgprs; + // number of double precision VGPRs required by WF + uint32_t maxDpVgprs; + // map virtual to physical vector register + uint32_t remap(uint32_t vgprIndex, uint32_t size, uint8_t mode=0); + void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs); + bool isGmInstruction(GPUDynInstPtr ii); + bool isLmInstruction(GPUDynInstPtr ii); + bool isOldestInstGMem(); + bool isOldestInstLMem(); + bool isOldestInstPrivMem(); + bool isOldestInstFlatMem(); + bool isOldestInstALU(); + bool isOldestInstBarrier(); + // used for passing spill address to DDInstGPU + uint64_t last_addr[VSZ]; + uint32_t workitemid[3][VSZ]; + uint32_t workitemFlatId[VSZ]; + uint32_t workgroupid[3]; + uint32_t workgroupsz[3]; + uint32_t gridsz[3]; + uint32_t wg_id; + uint32_t wg_sz; + uint32_t dynwaveid; + uint32_t maxdynwaveid; + uint32_t dispatchid; + // outstanding global+local memory requests + uint32_t outstanding_reqs; + // memory requests between scoreboard + // and execute stage not yet executed + uint32_t mem_reqs_in_pipe; + // outstanding global memory write requests + uint32_t outstanding_reqs_wr_gm; + // outstanding local memory write requests + uint32_t outstanding_reqs_wr_lm; + // outstanding global memory read requests + uint32_t outstanding_reqs_rd_gm; + // outstanding local memory read requests + uint32_t outstanding_reqs_rd_lm; + uint32_t rd_lm_reqs_in_pipe; + uint32_t rd_gm_reqs_in_pipe; + uint32_t wr_lm_reqs_in_pipe; + uint32_t wr_gm_reqs_in_pipe; + + int mem_trace_busy; + uint64_t last_trace; + // number of vector registers reserved by WF + int reservedVectorRegs; + // Index into the Vector Register File's namespace where the WF's registers + // will live while the WF is executed + uint32_t startVgprIndex; + + // Old value of destination gpr (for trace) + uint32_t old_vgpr[VSZ]; + // Id of destination gpr (for trace) + uint32_t old_vgpr_id; + // Tick count of last old_vgpr copy + uint64_t old_vgpr_tcnt; + + // Old value of destination gpr (for trace) + uint64_t old_dgpr[VSZ]; + // Id of destination gpr (for trace) + uint32_t old_dgpr_id; + // Tick count of last old_vgpr copy + uint64_t old_dgpr_tcnt; + + // Execution mask at wavefront start + VectorMask init_mask; + + // number of barriers this WF has joined + int bar_cnt[VSZ]; + int max_bar_cnt; + // Flag to stall a wave on barrier + bool stalledAtBarrier; + + // a pointer to the fraction of the LDS allocated + // to this workgroup (thus this wavefront) + LdsChunk *ldsChunk; + + // A pointer to the spill area + Addr spillBase; + // The size of the spill area + uint32_t spillSizePerItem; + // The vector width of the spill area + uint32_t spillWidth; + + // A pointer to the private memory area + 
Addr privBase; + // The size of the private memory area + uint32_t privSizePerItem; + + // A pointer ot the read-only memory area + Addr roBase; + // size of the read-only memory area + uint32_t roSize; + + // pointer to buffer for storing kernel arguments + uint8_t *kernelArgs; + // unique WF id over all WFs executed across all CUs + uint64_t wfDynId; + + // number of times instruction issue for this wavefront is blocked + // due to VRF port availability + Stats::Scalar numTimesBlockedDueVrfPortAvail; + // number of times an instruction of a WF is blocked from being issued + // due to WAR and WAW dependencies + Stats::Scalar numTimesBlockedDueWAXDependencies; + // number of times an instruction of a WF is blocked from being issued + // due to WAR and WAW dependencies + Stats::Scalar numTimesBlockedDueRAWDependencies; + // distribution of executed instructions based on their register + // operands; this is used to highlight the load on the VRF + Stats::Distribution srcRegOpDist; + Stats::Distribution dstRegOpDist; + + // Functions to operate on call argument memory + // argument memory for hsail call instruction + CallArgMem *callArgMem; + void + initCallArgMem(int func_args_size_per_item) + { + callArgMem = new CallArgMem(func_args_size_per_item); + } + + template<typename CType> + CType + readCallArgMem(int lane, int addr) + { + return *((CType*)(callArgMem->getLaneAddr<CType>(lane, addr))); + } + + template<typename CType> + void + writeCallArgMem(int lane, int addr, CType val) + { + callArgMem->setLaneAddr<CType>(lane, addr, val); + } + + typedef WavefrontParams Params; + Wavefront(const Params *p); + ~Wavefront(); + virtual void init(); + + void + setParent(ComputeUnit *cu) + { + computeUnit = cu; + } + + void start(uint64_t _wfDynId, uint64_t _base_ptr); + + void exec(); + void updateResources(); + int ready(itype_e type); + bool instructionBufferHasBranch(); + void regStats(); + VectorMask get_pred() { return execMask() & init_mask; } + + bool waitingAtBarrier(int lane); + + void pushToReconvergenceStack(uint32_t pc, uint32_t rpc, + const VectorMask& exec_mask); + + void popFromReconvergenceStack(); + + uint32_t pc() const; + + uint32_t rpc() const; + + VectorMask execMask() const; + + bool execMask(int lane) const; + + void pc(uint32_t new_pc); + + void discardFetch(); + + private: + /** + * Stack containing Control Flow Graph nodes (i.e., kernel instructions) + * to be visited by the wavefront, and the associated execution masks. The + * reconvergence stack grows every time the wavefront reaches a divergence + * point (branch instruction), and shrinks every time the wavefront + * reaches a reconvergence point (immediate post-dominator instruction). + */ + std::stack<std::unique_ptr<ReconvergenceStackEntry>> reconvergenceStack; +}; + +#endif // __WAVEFRONT_HH__ diff --git a/src/mem/protocol/GPU_RfO-SQC.sm b/src/mem/protocol/GPU_RfO-SQC.sm new file mode 100644 index 000000000..1e5f8df74 --- /dev/null +++ b/src/mem/protocol/GPU_RfO-SQC.sm @@ -0,0 +1,667 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Lisa Hsu + */ + +machine(MachineType:SQC, "GPU SQC (L1 I Cache)") + : Sequencer* sequencer; + CacheMemory * L1cache; + int TCC_select_num_bits; + Cycles issue_latency := 80; // time to send data down to TCC + Cycles l2_hit_latency := 18; + + MessageBuffer * requestFromSQC, network="To", virtual_network="1", vnet_type="request"; + MessageBuffer * responseFromSQC, network="To", virtual_network="3", vnet_type="response"; + MessageBuffer * unblockFromCore, network="To", virtual_network="5", vnet_type="unblock"; + + MessageBuffer * probeToSQC, network="From", virtual_network="1", vnet_type="request"; + MessageBuffer * responseToSQC, network="From", virtual_network="3", vnet_type="response"; + + MessageBuffer * mandatoryQueue; +{ + state_declaration(State, desc="SQC Cache States", default="SQC_State_I") { + I, AccessPermission:Invalid, desc="Invalid"; + S, AccessPermission:Read_Only, desc="Shared"; + + I_S, AccessPermission:Busy, desc="Invalid, issued RdBlkS, have not seen response yet"; + S_I, AccessPermission:Read_Only, desc="L1 replacement, waiting for clean WB ack"; + I_C, AccessPermission:Invalid, desc="Invalid, waiting for WBAck from TCCdir for canceled WB"; + } + + enumeration(Event, desc="SQC Events") { + // Core initiated + Fetch, desc="Fetch"; + + //TCC initiated + TCC_AckS, desc="TCC Ack to Core Request"; + TCC_AckWB, desc="TCC Ack for WB"; + TCC_NackWB, desc="TCC Nack for WB"; + + // Mem sys initiated + Repl, desc="Replacing block from cache"; + + // Probe Events + PrbInvData, desc="probe, return M data"; + PrbInv, desc="probe, no need for data"; + PrbShrData, desc="probe downgrade, return data"; + } + + enumeration(RequestType, desc="To communicate stats from transitions to recordStats") { + DataArrayRead, desc="Read the data array"; + DataArrayWrite, desc="Write the data array"; + TagArrayRead, desc="Read the data array"; + TagArrayWrite, desc="Write the data array"; + } + + + structure(Entry, desc="...", interface="AbstractCacheEntry") { + State CacheState, desc="cache state"; + bool Dirty, desc="Is the data dirty (diff than memory)?"; + DataBlock DataBlk, desc="data for the block"; + bool FromL2, default="false", desc="block just moved from L2"; + } + + structure(TBE, desc="...") { + State TBEState, desc="Transient state"; + DataBlock DataBlk, desc="data for the 
block, required for concurrent writebacks"; + bool Dirty, desc="Is the data dirty (different than memory)?"; + int NumPendingMsgs, desc="Number of acks/data messages that this processor is waiting for"; + bool Shared, desc="Victim hit by shared probe"; + } + + structure(TBETable, external="yes") { + TBE lookup(Addr); + void allocate(Addr); + void deallocate(Addr); + bool isPresent(Addr); + } + + TBETable TBEs, template="<SQC_TBE>", constructor="m_number_of_TBEs"; + int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()"; + + Tick clockEdge(); + Tick cyclesToTicks(Cycles c); + + void set_cache_entry(AbstractCacheEntry b); + void unset_cache_entry(); + void set_tbe(TBE b); + void unset_tbe(); + void wakeUpAllBuffers(); + void wakeUpBuffers(Addr a); + Cycles curCycle(); + + // Internal functions + Entry getCacheEntry(Addr address), return_by_pointer="yes" { + Entry cache_entry := static_cast(Entry, "pointer", L1cache.lookup(address)); + return cache_entry; + } + + DataBlock getDataBlock(Addr addr), return_by_ref="yes" { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return tbe.DataBlk; + } else { + return getCacheEntry(addr).DataBlk; + } + } + + State getState(TBE tbe, Entry cache_entry, Addr addr) { + if(is_valid(tbe)) { + return tbe.TBEState; + } else if (is_valid(cache_entry)) { + return cache_entry.CacheState; + } + return State:I; + } + + void setState(TBE tbe, Entry cache_entry, Addr addr, State state) { + if (is_valid(tbe)) { + tbe.TBEState := state; + } + + if (is_valid(cache_entry)) { + cache_entry.CacheState := state; + } + } + + AccessPermission getAccessPermission(Addr addr) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return SQC_State_to_permission(tbe.TBEState); + } + + Entry cache_entry := getCacheEntry(addr); + if(is_valid(cache_entry)) { + return SQC_State_to_permission(cache_entry.CacheState); + } + + return AccessPermission:NotPresent; + } + + void setAccessPermission(Entry cache_entry, Addr addr, State state) { + if (is_valid(cache_entry)) { + cache_entry.changePermission(SQC_State_to_permission(state)); + } + } + + void functionalRead(Addr addr, Packet *pkt) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + testAndRead(addr, tbe.DataBlk, pkt); + } else { + functionalMemoryRead(pkt); + } + } + + int functionalWrite(Addr addr, Packet *pkt) { + int num_functional_writes := 0; + + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + num_functional_writes := num_functional_writes + + testAndWrite(addr, tbe.DataBlk, pkt); + } + + num_functional_writes := num_functional_writes + functionalMemoryWrite(pkt); + return num_functional_writes; + } + + void recordRequestType(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + L1cache.recordRequestType(CacheRequestType:DataArrayRead, addr); + } else if (request_type == RequestType:DataArrayWrite) { + L1cache.recordRequestType(CacheRequestType:DataArrayWrite, addr); + } else if (request_type == RequestType:TagArrayRead) { + L1cache.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:TagArrayWrite) { + L1cache.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } + } + + bool checkResourceAvailable(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + return L1cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:DataArrayWrite) { + return L1cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if 
(request_type == RequestType:TagArrayRead) { + return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:TagArrayWrite) { + return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else { + error("Invalid RequestType type in checkResourceAvailable"); + return true; + } + } + + // Out Ports + + out_port(requestNetwork_out, CPURequestMsg, requestFromSQC); + out_port(responseNetwork_out, ResponseMsg, responseFromSQC); + out_port(unblockNetwork_out, UnblockMsg, unblockFromCore); + + // In Ports + + in_port(probeNetwork_in, TDProbeRequestMsg, probeToSQC) { + if (probeNetwork_in.isReady(clockEdge())) { + peek(probeNetwork_in, TDProbeRequestMsg, block_on="addr") { + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + + if (in_msg.Type == ProbeRequestType:PrbInv) { + if (in_msg.ReturnData) { + trigger(Event:PrbInvData, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.Type == ProbeRequestType:PrbDowngrade) { + assert(in_msg.ReturnData); + trigger(Event:PrbShrData, in_msg.addr, cache_entry, tbe); + } + } + } + } + + in_port(responseToSQC_in, ResponseMsg, responseToSQC) { + if (responseToSQC_in.isReady(clockEdge())) { + peek(responseToSQC_in, ResponseMsg, block_on="addr") { + + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + + if (in_msg.Type == CoherenceResponseType:TDSysResp) { + if (in_msg.State == CoherenceState:Shared) { + trigger(Event:TCC_AckS, in_msg.addr, cache_entry, tbe); + } else { + error("SQC should not receive TDSysResp other than CoherenceState:Shared"); + } + } else if (in_msg.Type == CoherenceResponseType:TDSysWBAck) { + trigger(Event:TCC_AckWB, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceResponseType:TDSysWBNack) { + trigger(Event:TCC_NackWB, in_msg.addr, cache_entry, tbe); + } else { + error("Unexpected Response Message to Core"); + } + } + } + } + + in_port(mandatoryQueue_in, RubyRequest, mandatoryQueue, desc="...") { + if (mandatoryQueue_in.isReady(clockEdge())) { + peek(mandatoryQueue_in, RubyRequest, block_on="LineAddress") { + Entry cache_entry := getCacheEntry(in_msg.LineAddress); + TBE tbe := TBEs.lookup(in_msg.LineAddress); + + assert(in_msg.Type == RubyRequestType:IFETCH); + if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) { + trigger(Event:Fetch, in_msg.LineAddress, cache_entry, tbe); + } else { + Addr victim := L1cache.cacheProbe(in_msg.LineAddress); + trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } + } + } + + // Actions + + action(ic_invCache, "ic", desc="invalidate cache") { + if(is_valid(cache_entry)) { + L1cache.deallocate(address); + } + unset_cache_entry(); + } + + action(nS_issueRdBlkS, "nS", desc="Issue RdBlkS") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:RdBlkS; + out_msg.Requestor := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := curCycle(); + } + } + + action(vc_victim, "vc", desc="Victimize E/S Data") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + 
TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.Type := CoherenceRequestType:VicClean; + out_msg.InitialRequestTime := curCycle(); + if (cache_entry.CacheState == State:S) { + out_msg.Shared := true; + } else { + out_msg.Shared := false; + } + out_msg.InitialRequestTime := curCycle(); + } + } + + action(a_allocate, "a", desc="allocate block") { + if (is_invalid(cache_entry)) { + set_cache_entry(L1cache.allocate(address, new Entry)); + } + } + + action(t_allocateTBE, "t", desc="allocate TBE Entry") { + check_allocate(TBEs); + assert(is_valid(cache_entry)); + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + tbe.DataBlk := cache_entry.DataBlk; // Data only used for WBs + tbe.Dirty := cache_entry.Dirty; + tbe.Shared := false; + } + + action(d_deallocateTBE, "d", desc="Deallocate TBE") { + TBEs.deallocate(address); + unset_tbe(); + } + + action(p_popMandatoryQueue, "pm", desc="Pop Mandatory Queue") { + mandatoryQueue_in.dequeue(clockEdge()); + } + + action(pr_popResponseQueue, "pr", desc="Pop Response Queue") { + responseToSQC_in.dequeue(clockEdge()); + } + + action(pp_popProbeQueue, "pp", desc="pop probe queue") { + probeNetwork_in.dequeue(clockEdge()); + } + + action(l_loadDone, "l", desc="local load done") { + assert(is_valid(cache_entry)); + sequencer.readCallback(address, cache_entry.DataBlk, + false, MachineType:L1Cache); + APPEND_TRANSITION_COMMENT(cache_entry.DataBlk); + } + + action(xl_loadDone, "xl", desc="remote load done") { + peek(responseToSQC_in, ResponseMsg) { + assert(is_valid(cache_entry)); + sequencer.readCallback(address, + cache_entry.DataBlk, + false, + machineIDToMachineType(in_msg.Sender), + in_msg.InitialRequestTime, + in_msg.ForwardRequestTime, + in_msg.ProbeRequestStartTime); + APPEND_TRANSITION_COMMENT(cache_entry.DataBlk); + } + } + + action(w_writeCache, "w", desc="write data to cache") { + peek(responseToSQC_in, ResponseMsg) { + assert(is_valid(cache_entry)); + cache_entry.DataBlk := in_msg.DataBlk; + cache_entry.Dirty := in_msg.Dirty; + } + } + + action(ss_sendStaleNotification, "ss", desc="stale data; nothing to writeback") { + peek(responseToSQC_in, ResponseMsg) { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:StaleNotif; + out_msg.Sender := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Response_Control; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + action(wb_data, "wb", desc="write back data") { + peek(responseToSQC_in, ResponseMsg) { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUData; + out_msg.Sender := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.DataBlk := tbe.DataBlk; + out_msg.Dirty := tbe.Dirty; + if (tbe.Shared) { + out_msg.NbReqShared := true; + } else { + out_msg.NbReqShared := false; + } + out_msg.State := CoherenceState:Shared; // faux info + out_msg.MessageSize := MessageSizeType:Writeback_Data; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to 
probes + out_msg.Sender := machineID; + // will this always be ok? probably not for multisocket + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.Dirty := false; + out_msg.Hit := false; + out_msg.Ntsl := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(pim_sendProbeResponseInvMs, "pim", desc="send probe ack inv, no data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes + out_msg.Sender := machineID; + // will this always be ok? probably not for multisocket + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.Dirty := false; + out_msg.Ntsl := true; + out_msg.Hit := false; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(prm_sendProbeResponseMiss, "prm", desc="send probe ack PrbShrData, no data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes + out_msg.Sender := machineID; + // will this always be ok? probably not for multisocket + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.Dirty := false; // only true if sending back data i think + out_msg.Hit := false; + out_msg.Ntsl := false; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(pd_sendProbeResponseData, "pd", desc="send probe ack, with data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + assert(is_valid(cache_entry) || is_valid(tbe)); + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + // will this always be ok? probably not for multisocket + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.DataBlk := getDataBlock(address); + if (is_valid(tbe)) { + out_msg.Dirty := tbe.Dirty; + } else { + out_msg.Dirty := cache_entry.Dirty; + } + out_msg.Hit := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Data; + } + } + + action(pdm_sendProbeResponseDataMs, "pdm", desc="send probe ack, with data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + assert(is_valid(cache_entry) || is_valid(tbe)); + assert(is_valid(cache_entry)); + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + // will this always be ok? 
probably not for multisocket + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.DataBlk := getDataBlock(address); + if (is_valid(tbe)) { + out_msg.Dirty := tbe.Dirty; + } else { + out_msg.Dirty := cache_entry.Dirty; + } + out_msg.Hit := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Data; + } + } + + action(sf_setSharedFlip, "sf", desc="hit by shared probe, status may be different") { + assert(is_valid(tbe)); + tbe.Shared := true; + } + + action(uu_sendUnblock, "uu", desc="state changed, unblock") { + enqueue(unblockNetwork_out, UnblockMsg, issue_latency) { + out_msg.addr := address; + out_msg.Sender := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Unblock_Control; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(yy_recycleProbeQueue, "yy", desc="recycle probe queue") { + probeNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(zz_recycleMandatoryQueue, "\z", desc="recycle mandatory queue") { + mandatoryQueue_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + // Transitions + + // transitions from base + transition(I, Fetch, I_S) {TagArrayRead, TagArrayWrite} { + a_allocate; + nS_issueRdBlkS; + p_popMandatoryQueue; + } + + // simple hit transitions + transition(S, Fetch) {TagArrayRead, DataArrayRead} { + l_loadDone; + p_popMandatoryQueue; + } + + // recycles from transients + transition({I_S, S_I, I_C}, {Fetch, Repl}) {} { + zz_recycleMandatoryQueue; + } + + transition(S, Repl, S_I) {TagArrayRead} { + t_allocateTBE; + vc_victim; + ic_invCache; + } + + // TCC event + transition(I_S, TCC_AckS, S) {DataArrayRead, DataArrayWrite} { + w_writeCache; + xl_loadDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(S_I, TCC_NackWB, I){TagArrayWrite} { + d_deallocateTBE; + pr_popResponseQueue; + } + + transition(S_I, TCC_AckWB, I) {TagArrayWrite} { + wb_data; + d_deallocateTBE; + pr_popResponseQueue; + } + + transition(I_C, TCC_AckWB, I){TagArrayWrite} { + ss_sendStaleNotification; + d_deallocateTBE; + pr_popResponseQueue; + } + + transition(I_C, TCC_NackWB, I) {TagArrayWrite} { + d_deallocateTBE; + pr_popResponseQueue; + } + + // Probe transitions + transition({S, I}, PrbInvData, I) {TagArrayRead, TagArrayWrite} { + pd_sendProbeResponseData; + ic_invCache; + pp_popProbeQueue; + } + + transition(I_C, PrbInvData, I_C) { + pi_sendProbeResponseInv; + ic_invCache; + pp_popProbeQueue; + } + + transition({S, I}, PrbInv, I) {TagArrayRead, TagArrayWrite} { + pi_sendProbeResponseInv; + ic_invCache; + pp_popProbeQueue; + } + + transition({S}, PrbShrData, S) {DataArrayRead} { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + transition({I, I_C}, PrbShrData) {TagArrayRead} { + prm_sendProbeResponseMiss; + pp_popProbeQueue; + } + + transition(I_C, PrbInv, I_C){ + pi_sendProbeResponseInv; + ic_invCache; + pp_popProbeQueue; + } + + transition(I_S, {PrbInv, PrbInvData}) {} { + pi_sendProbeResponseInv; + ic_invCache; + a_allocate; // but make sure there is room for incoming data when it arrives + pp_popProbeQueue; + } + + transition(I_S, PrbShrData) {} { + prm_sendProbeResponseMiss; + pp_popProbeQueue; + } + + transition(S_I, PrbInvData, I_C) {TagArrayWrite} { + pi_sendProbeResponseInv; + ic_invCache; + pp_popProbeQueue; + } + + transition(S_I, PrbInv, I_C) {TagArrayWrite} { + pi_sendProbeResponseInv; + 
ic_invCache; + pp_popProbeQueue; + } + + transition(S_I, PrbShrData) {DataArrayRead} { + pd_sendProbeResponseData; + sf_setSharedFlip; + pp_popProbeQueue; + } +} diff --git a/src/mem/protocol/GPU_RfO-TCC.sm b/src/mem/protocol/GPU_RfO-TCC.sm new file mode 100644 index 000000000..cfddb3f00 --- /dev/null +++ b/src/mem/protocol/GPU_RfO-TCC.sm @@ -0,0 +1,1199 @@ +/* + * Copyright (c) 2010-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Lisa Hsu + */ + +machine(MachineType:TCC, "TCC Cache") + : CacheMemory * L2cache; + WireBuffer * w_reqToTCCDir; + WireBuffer * w_respToTCCDir; + WireBuffer * w_TCCUnblockToTCCDir; + WireBuffer * w_reqToTCC; + WireBuffer * w_probeToTCC; + WireBuffer * w_respToTCC; + int TCC_select_num_bits; + Cycles l2_request_latency := 1; + Cycles l2_response_latency := 20; + + // To the general response network + MessageBuffer * responseFromTCC, network="To", virtual_network="3", vnet_type="response"; + + // From the general response network + MessageBuffer * responseToTCC, network="From", virtual_network="3", vnet_type="response"; + +{ + // EVENTS + enumeration(Event, desc="TCC Events") { + // Requests coming from the Cores + RdBlk, desc="CPU RdBlk event"; + RdBlkM, desc="CPU RdBlkM event"; + RdBlkS, desc="CPU RdBlkS event"; + CtoD, desc="Change to Dirty request"; + WrVicBlk, desc="L1 Victim (dirty)"; + WrVicBlkShared, desc="L1 Victim (dirty)"; + ClVicBlk, desc="L1 Victim (clean)"; + ClVicBlkShared, desc="L1 Victim (clean)"; + + CPUData, desc="WB data from CPU"; + CPUDataShared, desc="WB data from CPU, NBReqShared 1"; + StaleWB, desc="Stale WB, No data"; + + L2_Repl, desc="L2 Replacement"; + + // Probes + PrbInvData, desc="Invalidating probe, return dirty data"; + PrbInv, desc="Invalidating probe, no need to return data"; + PrbShrData, desc="Downgrading probe, return data"; + + // Coming from Memory Controller + WBAck, desc="ack from memory"; + + CancelWB, desc="Cancel WB from L2"; + } + + // STATES + state_declaration(State, desc="TCC State", default="TCC_State_I") { + M, AccessPermission:Read_Write, desc="Modified"; // No other cache has copy, memory stale + O, AccessPermission:Read_Only, desc="Owned"; // Correct most recent copy, others may exist in S + E, AccessPermission:Read_Write, desc="Exclusive"; // Correct, most recent, and only copy (and == Memory) + S, AccessPermission:Read_Only, desc="Shared"; // Correct, most recent. 
If no one in O, then == Memory + I, AccessPermission:Invalid, desc="Invalid"; + + I_M, AccessPermission:Busy, desc="Invalid, received WrVicBlk, sent Ack, waiting for Data"; + I_O, AccessPermission:Busy, desc="Invalid, received WrVicBlk, sent Ack, waiting for Data"; + I_E, AccessPermission:Busy, desc="Invalid, receive ClVicBlk, sent Ack, waiting for Data"; + I_S, AccessPermission:Busy, desc="Invalid, receive ClVicBlk, sent Ack, waiting for Data"; + S_M, AccessPermission:Busy, desc="received WrVicBlk, sent Ack, waiting for Data, then go to M"; + S_O, AccessPermission:Busy, desc="received WrVicBlkShared, sent Ack, waiting for Data, then go to O"; + S_E, AccessPermission:Busy, desc="Shared, received ClVicBlk, sent Ack, waiting for Data, then go to E"; + S_S, AccessPermission:Busy, desc="Shared, received ClVicBlk, sent Ack, waiting for Data, then go to S"; + E_M, AccessPermission:Busy, desc="received WrVicBlk, sent Ack, waiting for Data, then go to O"; + E_O, AccessPermission:Busy, desc="received WrVicBlkShared, sent Ack, waiting for Data, then go to O"; + E_E, AccessPermission:Busy, desc="received WrVicBlk, sent Ack, waiting for Data, then go to O"; + E_S, AccessPermission:Busy, desc="Shared, received WrVicBlk, sent Ack, waiting for Data"; + O_M, AccessPermission:Busy, desc="..."; + O_O, AccessPermission:Busy, desc="..."; + O_E, AccessPermission:Busy, desc="..."; + M_M, AccessPermission:Busy, desc="..."; + M_O, AccessPermission:Busy, desc="..."; + M_E, AccessPermission:Busy, desc="..."; + M_S, AccessPermission:Busy, desc="..."; + D_I, AccessPermission:Invalid, desc="drop WB data on the floor when receive"; + MOD_I, AccessPermission:Busy, desc="drop WB data on the floor, waiting for WBAck from Mem"; + MO_I, AccessPermission:Busy, desc="M or O, received L2_Repl, waiting for WBAck from Mem"; + ES_I, AccessPermission:Busy, desc="E or S, received L2_Repl, waiting for WBAck from Mem"; + I_C, AccessPermission:Invalid, desc="sent cancel, just waiting to receive mem wb ack so nothing gets confused"; + } + + enumeration(RequestType, desc="To communicate stats from transitions to recordStats") { + DataArrayRead, desc="Read the data array"; + DataArrayWrite, desc="Write the data array"; + TagArrayRead, desc="Read the data array"; + TagArrayWrite, desc="Write the data array"; + } + + + // STRUCTURES + + structure(Entry, desc="...", interface="AbstractCacheEntry") { + State CacheState, desc="cache state"; + bool Dirty, desc="Is the data dirty (diff from memory?)"; + DataBlock DataBlk, desc="Data for the block"; + } + + structure(TBE, desc="...") { + State TBEState, desc="Transient state"; + DataBlock DataBlk, desc="data for the block"; + bool Dirty, desc="Is the data dirty?"; + bool Shared, desc="Victim hit by shared probe"; + MachineID From, desc="Waiting for writeback from..."; + } + + structure(TBETable, external="yes") { + TBE lookup(Addr); + void allocate(Addr); + void deallocate(Addr); + bool isPresent(Addr); + } + + TBETable TBEs, template="<TCC_TBE>", constructor="m_number_of_TBEs"; + int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()"; + + void set_cache_entry(AbstractCacheEntry b); + void unset_cache_entry(); + void set_tbe(TBE b); + void unset_tbe(); + void wakeUpAllBuffers(); + void wakeUpBuffers(Addr a); + + + // FUNCTION DEFINITIONS + Tick clockEdge(); + Tick cyclesToTicks(Cycles c); + + Entry getCacheEntry(Addr addr), return_by_pointer="yes" { + return static_cast(Entry, "pointer", L2cache.lookup(addr)); + } + + DataBlock getDataBlock(Addr addr), return_by_ref="yes" { + return 
getCacheEntry(addr).DataBlk; + } + + bool presentOrAvail(Addr addr) { + return L2cache.isTagPresent(addr) || L2cache.cacheAvail(addr); + } + + State getState(TBE tbe, Entry cache_entry, Addr addr) { + if (is_valid(tbe)) { + return tbe.TBEState; + } else if (is_valid(cache_entry)) { + return cache_entry.CacheState; + } + return State:I; + } + + void setState(TBE tbe, Entry cache_entry, Addr addr, State state) { + if (is_valid(tbe)) { + tbe.TBEState := state; + } + + if (is_valid(cache_entry)) { + cache_entry.CacheState := state; + } + } + + AccessPermission getAccessPermission(Addr addr) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return TCC_State_to_permission(tbe.TBEState); + } + + Entry cache_entry := getCacheEntry(addr); + if(is_valid(cache_entry)) { + return TCC_State_to_permission(cache_entry.CacheState); + } + + return AccessPermission:NotPresent; + } + + void setAccessPermission(Entry cache_entry, Addr addr, State state) { + if (is_valid(cache_entry)) { + cache_entry.changePermission(TCC_State_to_permission(state)); + } + } + + void functionalRead(Addr addr, Packet *pkt) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + testAndRead(addr, tbe.DataBlk, pkt); + } else { + functionalMemoryRead(pkt); + } + } + + int functionalWrite(Addr addr, Packet *pkt) { + int num_functional_writes := 0; + + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + num_functional_writes := num_functional_writes + + testAndWrite(addr, tbe.DataBlk, pkt); + } + + num_functional_writes := num_functional_writes + functionalMemoryWrite(pkt); + return num_functional_writes; + } + + void recordRequestType(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + L2cache.recordRequestType(CacheRequestType:DataArrayRead, addr); + } else if (request_type == RequestType:DataArrayWrite) { + L2cache.recordRequestType(CacheRequestType:DataArrayWrite, addr); + } else if (request_type == RequestType:TagArrayRead) { + L2cache.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:TagArrayWrite) { + L2cache.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } + } + + bool checkResourceAvailable(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:DataArrayWrite) { + return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:TagArrayRead) { + return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:TagArrayWrite) { + return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else { + error("Invalid RequestType type in checkResourceAvailable"); + return true; + } + } + + + + // OUT PORTS + out_port(w_requestNetwork_out, CPURequestMsg, w_reqToTCCDir); + out_port(w_TCCResp_out, ResponseMsg, w_respToTCCDir); + out_port(responseNetwork_out, ResponseMsg, responseFromTCC); + out_port(w_unblockNetwork_out, UnblockMsg, w_TCCUnblockToTCCDir); + + // IN PORTS + in_port(TDResponse_in, ResponseMsg, w_respToTCC) { + if (TDResponse_in.isReady(clockEdge())) { + peek(TDResponse_in, ResponseMsg) { + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + if (in_msg.Type == CoherenceResponseType:TDSysWBAck) { + trigger(Event:WBAck, in_msg.addr, cache_entry, tbe); + } + else { + DPRINTF(RubySlicc, "%s\n", in_msg); + 
error("Error on TDResponse Type"); + } + } + } + } + + // Response Network + in_port(responseNetwork_in, ResponseMsg, responseToTCC) { + if (responseNetwork_in.isReady(clockEdge())) { + peek(responseNetwork_in, ResponseMsg) { + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + if (in_msg.Type == CoherenceResponseType:CPUData) { + if (in_msg.NbReqShared) { + trigger(Event:CPUDataShared, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:CPUData, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.Type == CoherenceResponseType:StaleNotif) { + trigger(Event:StaleWB, in_msg.addr, cache_entry, tbe); + } else { + DPRINTF(RubySlicc, "%s\n", in_msg); + error("Error on TDResponse Type"); + } + } + } + } + + // probe network + in_port(probeNetwork_in, TDProbeRequestMsg, w_probeToTCC) { + if (probeNetwork_in.isReady(clockEdge())) { + peek(probeNetwork_in, TDProbeRequestMsg) { + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + if (in_msg.Type == ProbeRequestType:PrbInv) { + if (in_msg.ReturnData) { + trigger(Event:PrbInvData, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.Type == ProbeRequestType:PrbDowngrade) { + if (in_msg.ReturnData) { + trigger(Event:PrbShrData, in_msg.addr, cache_entry, tbe); + } else { + error("Don't think I should get any of these"); + } + } + } + } + } + + // Request Network + in_port(requestNetwork_in, CPURequestMsg, w_reqToTCC) { + if (requestNetwork_in.isReady(clockEdge())) { + peek(requestNetwork_in, CPURequestMsg) { + assert(in_msg.Destination.isElement(machineID)); + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + if (in_msg.Type == CoherenceRequestType:RdBlk) { + trigger(Event:RdBlk, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:RdBlkS) { + trigger(Event:RdBlkS, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:RdBlkM) { + trigger(Event:RdBlkM, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:VicClean) { + if (presentOrAvail(in_msg.addr)) { + if (in_msg.Shared) { + trigger(Event:ClVicBlkShared, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:ClVicBlk, in_msg.addr, cache_entry, tbe); + } + } else { + Addr victim := L2cache.cacheProbe(in_msg.addr); + trigger(Event:L2_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } else if (in_msg.Type == CoherenceRequestType:VicDirty) { + if (presentOrAvail(in_msg.addr)) { + if (in_msg.Shared) { + trigger(Event:WrVicBlkShared, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:WrVicBlk, in_msg.addr, cache_entry, tbe); + } + } else { + Addr victim := L2cache.cacheProbe(in_msg.addr); + trigger(Event:L2_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } else { + requestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + } + } + } + + // BEGIN ACTIONS + + action(i_invL2, "i", desc="invalidate TCC cache block") { + if (is_valid(cache_entry)) { + L2cache.deallocate(address); + } + unset_cache_entry(); + } + + action(rm_sendResponseM, "rm", desc="send Modified response") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, l2_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysResp; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.DataBlk := 
cache_entry.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := cache_entry.Dirty; + out_msg.State := CoherenceState:Modified; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + action(rs_sendResponseS, "rs", desc="send Shared response") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, l2_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysResp; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := cache_entry.Dirty; + out_msg.State := CoherenceState:Shared; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + + action(r_requestToTD, "r", desc="Miss in L2, pass on") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(w_requestNetwork_out, CPURequestMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Type := in_msg.Type; + out_msg.Requestor := in_msg.Requestor; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.Shared := false; // unneeded for this request + out_msg.MessageSize := in_msg.MessageSize; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + action(t_allocateTBE, "t", desc="allocate TBE Entry") { + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + if (is_valid(cache_entry)) { + tbe.DataBlk := cache_entry.DataBlk; // Data only for WBs + tbe.Dirty := cache_entry.Dirty; + } + tbe.From := machineID; + } + + action(dt_deallocateTBE, "dt", desc="deallocate TBE Entry") { + TBEs.deallocate(address); + unset_tbe(); + } + + action(vc_vicClean, "vc", desc="Victimize Clean L2 data") { + enqueue(w_requestNetwork_out, CPURequestMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:VicClean; + out_msg.Requestor := machineID; + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Response_Data; + } + } + + action(vd_vicDirty, "vd", desc="Victimize dirty L2 data") { + enqueue(w_requestNetwork_out, CPURequestMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:VicDirty; + out_msg.Requestor := machineID; + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Response_Data; + } + } + + action(w_sendResponseWBAck, "w", desc="send WB Ack") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, l2_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysWBAck; + out_msg.Destination.add(in_msg.Requestor); + out_msg.Sender := machineID; + out_msg.MessageSize := MessageSizeType:Writeback_Control; + } + } + } + + action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") { + enqueue(w_TCCResp_out, ResponseMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // TCC and CPUs respond in same way to probes + out_msg.Sender := machineID; + // will this always be ok? 
probably not for multisocket + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.Dirty := false; + out_msg.Hit := false; + out_msg.Ntsl := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(ph_sendProbeResponseHit, "ph", desc="send probe ack, no data") { + enqueue(w_TCCResp_out, ResponseMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // TCC and CPUs respond in same way to probes + out_msg.Sender := machineID; + // will this always be ok? probably not for multisocket + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.Dirty := false; + out_msg.Hit := true; + out_msg.Ntsl := false; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(pm_sendProbeResponseMiss, "pm", desc="send probe ack, no data") { + enqueue(w_TCCResp_out, ResponseMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // TCC and CPUs respond in same way to probes + out_msg.Sender := machineID; + // will this always be ok? probably not for multisocket + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.Dirty := false; + out_msg.Hit := false; + out_msg.Ntsl := false; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(pd_sendProbeResponseData, "pd", desc="send probe ack, with data") { + enqueue(w_TCCResp_out, ResponseMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // TCC and CPUs respond in same way to probes + out_msg.Sender := machineID; + // will this always be ok? 
probably not for multisocket + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.DataBlk := cache_entry.DataBlk; + //assert(cache_entry.Dirty); Not needed in TCC where TCC can supply clean data + out_msg.Dirty := cache_entry.Dirty; + out_msg.Hit := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Data; + } + } + + action(pdt_sendProbeResponseDataFromTBE, "pdt", desc="send probe ack with data") { + enqueue(w_TCCResp_out, ResponseMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.DataBlk := tbe.DataBlk; + //assert(tbe.Dirty); + out_msg.Dirty := tbe.Dirty; + out_msg.Hit := true; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.State := CoherenceState:NA; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(mc_cancelMemWriteback, "mc", desc="send writeback cancel to memory") { + enqueue(w_requestNetwork_out, CPURequestMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:WrCancel; + out_msg.Requestor := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Request_Control; + } + } + + action(a_allocateBlock, "a", desc="allocate TCC block") { + if (is_invalid(cache_entry)) { + set_cache_entry(L2cache.allocate(address, new Entry)); + } + } + + action(d_writeData, "d", desc="write data to TCC") { + peek(responseNetwork_in, ResponseMsg) { + if (in_msg.Dirty) { + cache_entry.Dirty := in_msg.Dirty; + } + cache_entry.DataBlk := in_msg.DataBlk; + DPRINTF(RubySlicc, "Writing to TCC: %s\n", in_msg); + } + } + + action(rd_copyDataFromRequest, "rd", desc="write data to TCC") { + peek(requestNetwork_in, CPURequestMsg) { + cache_entry.DataBlk := in_msg.DataBlk; + cache_entry.Dirty := true; + } + } + + action(f_setFrom, "f", desc="set who WB is expected to come from") { + peek(requestNetwork_in, CPURequestMsg) { + tbe.From := in_msg.Requestor; + } + } + + action(rf_resetFrom, "rf", desc="reset From") { + tbe.From := machineID; + } + + action(wb_data, "wb", desc="write back data") { + enqueue(w_TCCResp_out, ResponseMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUData; + out_msg.Sender := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.DataBlk := tbe.DataBlk; + out_msg.Dirty := tbe.Dirty; + if (tbe.Shared) { + out_msg.NbReqShared := true; + } else { + out_msg.NbReqShared := false; + } + out_msg.State := CoherenceState:Shared; // faux info + out_msg.MessageSize := MessageSizeType:Writeback_Data; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(wt_writeDataToTBE, "wt", desc="write WB data to TBE") { + peek(responseNetwork_in, ResponseMsg) { + tbe.DataBlk := in_msg.DataBlk; + tbe.Dirty := in_msg.Dirty; + } + } + + action(uo_sendUnblockOwner, "uo", desc="state changed to E, M, or O, unblock") { + enqueue(w_unblockNetwork_out, UnblockMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Sender := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := 
MessageSizeType:Unblock_Control; + out_msg.currentOwner := true; + out_msg.valid := true; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(us_sendUnblockSharer, "us", desc="state changed to S , unblock") { + enqueue(w_unblockNetwork_out, UnblockMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Sender := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Unblock_Control; + out_msg.currentOwner := false; + out_msg.valid := true; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(un_sendUnblockNotValid, "un", desc="state changed toI, unblock") { + enqueue(w_unblockNetwork_out, UnblockMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Sender := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Unblock_Control; + out_msg.currentOwner := false; + out_msg.valid := false; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(ut_updateTag, "ut", desc="update Tag (i.e. set MRU)") { + L2cache.setMRU(address); + } + + action(p_popRequestQueue, "p", desc="pop request queue") { + requestNetwork_in.dequeue(clockEdge()); + } + + action(pr_popResponseQueue, "pr", desc="pop response queue") { + responseNetwork_in.dequeue(clockEdge()); + } + + action(pn_popTDResponseQueue, "pn", desc="pop TD response queue") { + TDResponse_in.dequeue(clockEdge()); + } + + action(pp_popProbeQueue, "pp", desc="pop probe queue") { + probeNetwork_in.dequeue(clockEdge()); + } + + action(zz_recycleRequestQueue, "\z", desc="recycle request queue") { + requestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + + // END ACTIONS + + // BEGIN TRANSITIONS + + // transitions from base + + transition({I, I_C}, {RdBlk, RdBlkS, RdBlkM, CtoD}){TagArrayRead} { + // TCCdir already knows that the block is not here. This is to allocate and get the block. 
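+    // The TCC allocates nothing here: r_requestToTD forwards the message to
+    // TCCdir with the original requestor preserved, the request queue entry
+    // is popped, and the local state is left unchanged.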
+ r_requestToTD; + p_popRequestQueue; + } + +// check + transition({M, O}, RdBlk, O){TagArrayRead, TagArrayWrite} { + rs_sendResponseS; + ut_updateTag; + // detect 2nd chancing + p_popRequestQueue; + } + +//check + transition({E, S}, RdBlk, S){TagArrayRead, TagArrayWrite} { + rs_sendResponseS; + ut_updateTag; + // detect 2nd chancing + p_popRequestQueue; + } + +// check + transition({M, O}, RdBlkS, O){TagArrayRead, TagArrayWrite} { + rs_sendResponseS; + ut_updateTag; + // detect 2nd chance sharing + p_popRequestQueue; + } + +//check + transition({E, S}, RdBlkS, S){TagArrayRead, TagArrayWrite} { + rs_sendResponseS; + ut_updateTag; + // detect 2nd chance sharing + p_popRequestQueue; + } + +// check + transition(M, RdBlkM, I){TagArrayRead, TagArrayWrite} { + rm_sendResponseM; + i_invL2; + p_popRequestQueue; + } + + //check + transition(E, RdBlkM, I){TagArrayRead, TagArrayWrite} { + rm_sendResponseM; + i_invL2; + p_popRequestQueue; + } + +// check + transition({I}, WrVicBlk, I_M){TagArrayRead} { + a_allocateBlock; + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(I_C, {WrVicBlk, WrVicBlkShared, ClVicBlk, ClVicBlkShared}) { + zz_recycleRequestQueue; + } + +//check + transition({I}, WrVicBlkShared, I_O) {TagArrayRead}{ + a_allocateBlock; + t_allocateTBE; + f_setFrom; +// rd_copyDataFromRequest; + w_sendResponseWBAck; + p_popRequestQueue; + } + +//check + transition(S, WrVicBlkShared, S_O){TagArrayRead} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + +// a stale writeback + transition(S, WrVicBlk, S_S){TagArrayRead} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + +// a stale writeback + transition(E, WrVicBlk, E_E){TagArrayRead} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + +// a stale writeback + transition(E, WrVicBlkShared, E_E){TagArrayRead} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + +// a stale writeback + transition(O, WrVicBlk, O_O){TagArrayRead} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + +// a stale writeback + transition(O, WrVicBlkShared, O_O){TagArrayRead} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + +// a stale writeback + transition(M, WrVicBlk, M_M){TagArrayRead} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + +// a stale writeback + transition(M, WrVicBlkShared, M_O){TagArrayRead} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + +//check + transition({I}, ClVicBlk, I_E){TagArrayRead} { + t_allocateTBE; + f_setFrom; + a_allocateBlock; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition({I}, ClVicBlkShared, I_S){TagArrayRead} { + t_allocateTBE; + f_setFrom; + a_allocateBlock; + w_sendResponseWBAck; + p_popRequestQueue; + } + +//check + transition(S, ClVicBlkShared, S_S){TagArrayRead} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + +// a stale writeback + transition(E, ClVicBlk, E_E){TagArrayRead} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + +// a stale writeback + transition(E, ClVicBlkShared, E_S){TagArrayRead} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + +// a stale writeback + transition(O, ClVicBlk, O_O){TagArrayRead} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + +// check. 
Original L3 ahd it going from O to O_S. Something can go from O to S only on writeback. + transition(O, ClVicBlkShared, O_O){TagArrayRead} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + +// a stale writeback + transition(M, ClVicBlk, M_E){TagArrayRead} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + +// a stale writeback + transition(M, ClVicBlkShared, M_S){TagArrayRead} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + + transition({MO_I}, {RdBlk, RdBlkS, RdBlkM, CtoD}) { + a_allocateBlock; + t_allocateTBE; + f_setFrom; + r_requestToTD; + p_popRequestQueue; + } + + transition(MO_I, {WrVicBlkShared, WrVicBlk, ClVicBlk, ClVicBlkShared}, MOD_I) { + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(I_M, CPUData, M){TagArrayWrite} { + uo_sendUnblockOwner; + dt_deallocateTBE; + d_writeData; + pr_popResponseQueue; + } + + transition(I_M, CPUDataShared, O){TagArrayWrite, DataArrayWrite} { + uo_sendUnblockOwner; + dt_deallocateTBE; + d_writeData; + pr_popResponseQueue; + } + + transition(I_O, {CPUData, CPUDataShared}, O){TagArrayWrite, DataArrayWrite} { + uo_sendUnblockOwner; + dt_deallocateTBE; + d_writeData; + pr_popResponseQueue; + } + + transition(I_E, CPUData, E){TagArrayWrite, DataArrayWrite} { + uo_sendUnblockOwner; + dt_deallocateTBE; + d_writeData; + pr_popResponseQueue; + } + + transition(I_E, CPUDataShared, S){TagArrayWrite, DataArrayWrite} { + us_sendUnblockSharer; + dt_deallocateTBE; + d_writeData; + pr_popResponseQueue; + } + + transition(I_S, {CPUData, CPUDataShared}, S){TagArrayWrite, DataArrayWrite} { + us_sendUnblockSharer; + dt_deallocateTBE; + d_writeData; + pr_popResponseQueue; + } + + transition(S_M, CPUDataShared, O){TagArrayWrite, DataArrayWrite} { + uo_sendUnblockOwner; + dt_deallocateTBE; + d_writeData; + ut_updateTag; // update tag on writeback hits. + pr_popResponseQueue; + } + + transition(S_O, {CPUData, CPUDataShared}, O){TagArrayWrite, DataArrayWrite} { + uo_sendUnblockOwner; + dt_deallocateTBE; + d_writeData; + ut_updateTag; // update tag on writeback hits. + pr_popResponseQueue; + } + + transition(S_E, CPUDataShared, S){TagArrayWrite, DataArrayWrite} { + us_sendUnblockSharer; + dt_deallocateTBE; + d_writeData; + ut_updateTag; // update tag on writeback hits. + pr_popResponseQueue; + } + + transition(S_S, {CPUData, CPUDataShared}, S){TagArrayWrite, DataArrayWrite} { + us_sendUnblockSharer; + dt_deallocateTBE; + d_writeData; + ut_updateTag; // update tag on writeback hits. + pr_popResponseQueue; + } + + transition(O_E, CPUDataShared, O){TagArrayWrite, DataArrayWrite} { + uo_sendUnblockOwner; + dt_deallocateTBE; + d_writeData; + ut_updateTag; // update tag on writeback hits. + pr_popResponseQueue; + } + + transition(O_O, {CPUData, CPUDataShared}, O){TagArrayWrite, DataArrayWrite} { + uo_sendUnblockOwner; + dt_deallocateTBE; + d_writeData; + ut_updateTag; // update tag on writeback hits. 
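+    // The unblock above (uo_sendUnblockOwner) tells TCCdir that this TCC is
+    // still the owner after absorbing the core's writeback data.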
+ pr_popResponseQueue; + } + + transition({D_I}, {CPUData, CPUDataShared}, I){TagArrayWrite} { + un_sendUnblockNotValid; + dt_deallocateTBE; + pr_popResponseQueue; + } + + transition(MOD_I, {CPUData, CPUDataShared}, MO_I) { + un_sendUnblockNotValid; + rf_resetFrom; + pr_popResponseQueue; + } + + transition({O,S,I}, CPUData) { + pr_popResponseQueue; + } + + transition({M, O}, L2_Repl, MO_I){TagArrayRead, DataArrayRead} { + t_allocateTBE; + vd_vicDirty; + i_invL2; + } + + transition({E, S,}, L2_Repl, ES_I){TagArrayRead, DataArrayRead} { + t_allocateTBE; + vc_vicClean; + i_invL2; + } + + transition({I_M, I_O, S_M, S_O, E_M, E_O}, L2_Repl) { + zz_recycleRequestQueue; + } + + transition({O_M, O_O, O_E, M_M, M_O, M_E, M_S}, L2_Repl) { + zz_recycleRequestQueue; + } + + transition({I_E, I_S, S_E, S_S, E_E, E_S}, L2_Repl) { + zz_recycleRequestQueue; + } + + transition({M, O}, PrbInvData, I){TagArrayRead, TagArrayWrite} { + pd_sendProbeResponseData; + i_invL2; + pp_popProbeQueue; + } + + transition(I, PrbInvData){TagArrayRead, TagArrayWrite} { + pi_sendProbeResponseInv; + pp_popProbeQueue; + } + + transition({E, S}, PrbInvData, I){TagArrayRead, TagArrayWrite} { + pd_sendProbeResponseData; + i_invL2; + pp_popProbeQueue; + } + + transition({M, O, E, S, I}, PrbInv, I){TagArrayRead, TagArrayWrite} { + pi_sendProbeResponseInv; + i_invL2; // nothing will happen in I + pp_popProbeQueue; + } + + transition({M, O}, PrbShrData, O){TagArrayRead, TagArrayWrite} { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + transition({E, S}, PrbShrData, S){TagArrayRead, TagArrayWrite} { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + transition(I, PrbShrData){TagArrayRead} { + pm_sendProbeResponseMiss; + pp_popProbeQueue; + } + + transition(MO_I, PrbInvData, I_C) { + pdt_sendProbeResponseDataFromTBE; + pp_popProbeQueue; + } + + transition(ES_I, PrbInvData, I_C) { + pi_sendProbeResponseInv; + pp_popProbeQueue; + } + + transition({ES_I,MO_I}, PrbInv, I_C) { + pi_sendProbeResponseInv; + pp_popProbeQueue; + } + + transition({ES_I, MO_I}, PrbShrData) { + pdt_sendProbeResponseDataFromTBE; + pp_popProbeQueue; + } + + transition(I_C, {PrbInvData, PrbInv}) { + pi_sendProbeResponseInv; + pp_popProbeQueue; + } + + transition(I_C, PrbShrData) { + pm_sendProbeResponseMiss; + pp_popProbeQueue; + } + + transition(MOD_I, WBAck, D_I) { + pn_popTDResponseQueue; + } + + transition(MO_I, WBAck, I){TagArrayWrite} { + dt_deallocateTBE; + pn_popTDResponseQueue; + } + + // this can only be a spurious CPUData from a shared block. 
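+  // The dirty victim's data was already copied into the TBE by t_allocateTBE
+  // when the L2_Repl fired, so the late writeback can simply be dropped.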
+ transition(MO_I, CPUData) { + pr_popResponseQueue; + } + + transition(ES_I, WBAck, I){TagArrayWrite} { + dt_deallocateTBE; + pn_popTDResponseQueue; + } + + transition(I_C, {WBAck}, I){TagArrayWrite} { + dt_deallocateTBE; + pn_popTDResponseQueue; + } + + transition({I_M, I_O, I_E, I_S}, StaleWB, I){TagArrayWrite} { + un_sendUnblockNotValid; + dt_deallocateTBE; + i_invL2; + pr_popResponseQueue; + } + + transition({S_S, S_O, S_M, S_E}, StaleWB, S){TagArrayWrite} { + us_sendUnblockSharer; + dt_deallocateTBE; + pr_popResponseQueue; + } + + transition({E_M, E_O, E_E, E_S}, StaleWB, E){TagArrayWrite} { + uo_sendUnblockOwner; + dt_deallocateTBE; + pr_popResponseQueue; + } + + transition({O_M, O_O, O_E}, StaleWB, O){TagArrayWrite} { + uo_sendUnblockOwner; + dt_deallocateTBE; + pr_popResponseQueue; + } + + transition({M_M, M_O, M_E, M_S}, StaleWB, M){TagArrayWrite} { + uo_sendUnblockOwner; + dt_deallocateTBE; + pr_popResponseQueue; + } + + transition(D_I, StaleWB, I) {TagArrayWrite}{ + un_sendUnblockNotValid; + dt_deallocateTBE; + pr_popResponseQueue; + } + + transition(MOD_I, StaleWB, MO_I) { + un_sendUnblockNotValid; + rf_resetFrom; + pr_popResponseQueue; + } + +} diff --git a/src/mem/protocol/GPU_RfO-TCCdir.sm b/src/mem/protocol/GPU_RfO-TCCdir.sm new file mode 100644 index 000000000..8f58d6ebb --- /dev/null +++ b/src/mem/protocol/GPU_RfO-TCCdir.sm @@ -0,0 +1,2672 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Mithuna Thottethodi + */ + +machine(MachineType:TCCdir, "AMD read-for-ownership directory for TCC (aka GPU L2)") +: CacheMemory * directory; + // Convention: wire buffers are prefixed with "w_" for clarity + WireBuffer * w_reqToTCCDir; + WireBuffer * w_respToTCCDir; + WireBuffer * w_TCCUnblockToTCCDir; + WireBuffer * w_reqToTCC; + WireBuffer * w_probeToTCC; + WireBuffer * w_respToTCC; + int TCC_select_num_bits; + Cycles response_latency := 5; + Cycles directory_latency := 6; + Cycles issue_latency := 120; + + // From the TCPs or SQCs + MessageBuffer * requestFromTCP, network="From", virtual_network="1", vnet_type="request"; + MessageBuffer * responseFromTCP, network="From", virtual_network="3", vnet_type="response"; + MessageBuffer * unblockFromTCP, network="From", virtual_network="5", vnet_type="unblock"; + + // To the Cores. TCC deals only with TCPs/SQCs. CP cores do not communicate directly with TCC. + MessageBuffer * probeToCore, network="To", virtual_network="1", vnet_type="request"; + MessageBuffer * responseToCore, network="To", virtual_network="3", vnet_type="response"; + + // From the NB + MessageBuffer * probeFromNB, network="From", virtual_network="0", vnet_type="request"; + MessageBuffer * responseFromNB, network="From", virtual_network="2", vnet_type="response"; + // To the NB + MessageBuffer * requestToNB, network="To", virtual_network="0", vnet_type="request"; + MessageBuffer * responseToNB, network="To", virtual_network="2", vnet_type="response"; + MessageBuffer * unblockToNB, network="To", virtual_network="4", vnet_type="unblock"; + + MessageBuffer * triggerQueue, random="false"; +{ + // STATES + state_declaration(State, desc="Directory states", default="TCCdir_State_I") { + // Base states + I, AccessPermission:Invalid, desc="Invalid"; + S, AccessPermission:Invalid, desc="Shared"; + E, AccessPermission:Invalid, desc="Shared"; + O, AccessPermission:Invalid, desc="Owner"; + M, AccessPermission:Invalid, desc="Modified"; + + CP_I, AccessPermission:Invalid, desc="Blocked, must send data after acks are in, going to invalid"; + B_I, AccessPermission:Invalid, desc="Blocked, need not send data after acks are in, going to invalid"; + CP_O, AccessPermission:Invalid, desc="Blocked, must send data after acks are in, going to owned"; + CP_S, AccessPermission:Invalid, desc="Blocked, must send data after acks are in, going to shared"; + CP_OM, AccessPermission:Invalid, desc="Blocked, must send data after acks are in, going to O_M"; + CP_SM, AccessPermission:Invalid, desc="Blocked, must send data after acks are in, going to S_M"; + CP_ISM, AccessPermission:Invalid, desc="Blocked, must send data after acks are in, going to I_M"; + CP_IOM, AccessPermission:Invalid, desc="Blocked, must send data after acks are in, going to I_M"; + CP_OSIW, AccessPermission:Invalid, desc="Blocked, must send data after acks+CancelWB are in, going to I_C"; + + + // Transient states and busy states used for handling side (TCC-facing) interactions + BW_S, AccessPermission:Invalid, desc="Blocked, Awaiting TCC unblock"; + BW_E, AccessPermission:Invalid, desc="Blocked, Awaiting TCC unblock"; + BW_O, AccessPermission:Invalid, desc="Blocked, Awaiting TCC unblock"; + BW_M, AccessPermission:Invalid, desc="Blocked, Awaiting TCC unblock"; + + // Transient states and busy states used for handling upward (TCP-facing) interactions + I_M, AccessPermission:Invalid, desc="Invalid, issued RdBlkM, have not seen response yet"; + I_ES, AccessPermission:Invalid, desc="Invalid, issued RdBlk, have not seen response 
yet"; + I_S, AccessPermission:Invalid, desc="Invalid, issued RdBlkS, have not seen response yet"; + BBS_S, AccessPermission:Invalid, desc="Blocked, going from S to S"; + BBO_O, AccessPermission:Invalid, desc="Blocked, going from O to O"; + BBM_M, AccessPermission:Invalid, desc="Blocked, going from M to M, waiting for data to forward"; + BBM_O, AccessPermission:Invalid, desc="Blocked, going from M to O, waiting for data to forward"; + BB_M, AccessPermission:Invalid, desc="Blocked, going from M to M, waiting for unblock"; + BB_O, AccessPermission:Invalid, desc="Blocked, going from M to O, waiting for unblock"; + BB_OO, AccessPermission:Invalid, desc="Blocked, going from O to O (adding sharers), waiting for unblock"; + BB_S, AccessPermission:Invalid, desc="Blocked, going to S, waiting for (possible multiple) unblock(s)"; + BBS_M, AccessPermission:Invalid, desc="Blocked, going from S or O to M"; + BBO_M, AccessPermission:Invalid, desc="Blocked, going from S or O to M"; + BBS_UM, AccessPermission:Invalid, desc="Blocked, going from S or O to M via upgrade"; + BBO_UM, AccessPermission:Invalid, desc="Blocked, going from S or O to M via upgrade"; + S_M, AccessPermission:Invalid, desc="Shared, issued CtoD, have not seen response yet"; + O_M, AccessPermission:Invalid, desc="Shared, issued CtoD, have not seen response yet"; + + // + BBB_S, AccessPermission:Invalid, desc="Blocked, going to S after core unblock"; + BBB_M, AccessPermission:Invalid, desc="Blocked, going to M after core unblock"; + BBB_E, AccessPermission:Invalid, desc="Blocked, going to E after core unblock"; + + VES_I, AccessPermission:Invalid, desc="TCC replacement, waiting for clean WB ack"; + VM_I, AccessPermission:Invalid, desc="TCC replacement, waiting for dirty WB ack"; + VO_I, AccessPermission:Invalid, desc="TCC replacement, waiting for dirty WB ack"; + VO_S, AccessPermission:Invalid, desc="TCC owner replacement, waiting for dirty WB ack"; + + ES_I, AccessPermission:Invalid, desc="L1 replacement, waiting for clean WB ack"; + MO_I, AccessPermission:Invalid, desc="L1 replacement, waiting for dirty WB ack"; + + I_C, AccessPermission:Invalid, desc="Invalid, waiting for WBAck from NB for canceled WB"; + I_W, AccessPermission:Invalid, desc="Invalid, waiting for WBAck from NB; canceled WB raced with directory invalidation"; + + // Recall States + BRWD_I, AccessPermission:Invalid, desc="Recalling, waiting for WBAck and Probe Data responses"; + BRW_I, AccessPermission:Read_Write, desc="Recalling, waiting for WBAck"; + BRD_I, AccessPermission:Invalid, desc="Recalling, waiting for Probe Data responses"; + + } + + enumeration(RequestType, desc="To communicate stats from transitions to recordStats") { + DataArrayRead, desc="Read the data array"; + DataArrayWrite, desc="Write the data array"; + TagArrayRead, desc="Read the data array"; + TagArrayWrite, desc="Write the data array"; + } + + + + // EVENTS + enumeration(Event, desc="TCC Directory Events") { + // Upward facing events (TCCdir w.r.t. 
TCP/SQC and TCC behaves like NBdir behaves with TCP/SQC and L3 + + // Directory Recall + Recall, desc="directory cache is full"; + // CPU requests + CPUWrite, desc="Initial req from core, sent to TCC"; + NoCPUWrite, desc="Initial req from core, but non-exclusive clean data; can be discarded"; + CPUWriteCancel, desc="Initial req from core, sent to TCC"; + + // Requests from the TCPs + RdBlk, desc="RdBlk event"; + RdBlkM, desc="RdBlkM event"; + RdBlkS, desc="RdBlkS event"; + CtoD, desc="Change to Dirty request"; + + // TCC writebacks + VicDirty, desc="..."; + VicDirtyLast, desc="..."; + VicClean, desc="..."; + NoVic, desc="..."; + StaleVic, desc="..."; + CancelWB, desc="TCC got invalidating probe, canceled WB"; + + // Probe Responses from TCP/SQCs + CPUPrbResp, desc="Probe response from TCP/SQC"; + TCCPrbResp, desc="Probe response from TCC"; + + ProbeAcksComplete, desc="All acks received"; + ProbeAcksCompleteReissue, desc="All acks received, changing CtoD to reissue"; + + CoreUnblock, desc="unblock from TCP/SQC"; + LastCoreUnblock, desc="Last unblock from TCP/SQC"; + TCCUnblock, desc="unblock from TCC (current owner)"; + TCCUnblock_Sharer, desc="unblock from TCC (a sharer, not owner)"; + TCCUnblock_NotValid,desc="unblock from TCC (not valid...caused by stale writebacks)"; + + // Downward facing events + + // NB initiated + NB_AckS, desc="NB Ack to TCC Request"; + NB_AckE, desc="NB Ack to TCC Request"; + NB_AckM, desc="NB Ack to TCC Request"; + NB_AckCtoD, desc="NB Ack to TCC Request"; + NB_AckWB, desc="NB Ack for clean WB"; + + + // Incoming Probes from NB + PrbInvData, desc="Invalidating probe, return dirty data"; + PrbInv, desc="Invalidating probe, no need to return data"; + PrbShrData, desc="Downgrading probe, return data"; + } + + + // TYPES + + // Entry for directory + structure(Entry, desc="...", interface='AbstractCacheEntry') { + State CacheState, desc="Cache state (Cache of directory entries)"; + DataBlock DataBlk, desc="data for the block"; + NetDest Sharers, desc="Sharers for this block"; + NetDest Owner, desc="Owner of this block"; + NetDest MergedSharers, desc="Read sharers who are merged on a request"; + int WaitingUnblocks, desc="Number of acks we're waiting for"; + } + + structure(TBE, desc="...") { + State TBEState, desc="Transient state"; + DataBlock DataBlk, desc="DataBlk"; + bool Dirty, desc="Is the data dirty?"; + MachineID Requestor, desc="requestor"; + int NumPendingAcks, desc="num acks expected"; + MachineID OriginalRequestor, desc="Original Requestor"; + MachineID UntransferredOwner, desc = "Untransferred owner for an upgrade transaction"; + bool UntransferredOwnerExists, desc = "1 if Untransferred owner exists for an upgrade transaction"; + bool Cached, desc="data hit in Cache"; + bool Shared, desc="victim hit by shared probe"; + bool Upgrade, desc="An upgrade request in progress"; + bool CtoD, desc="Saved sysack info"; + CoherenceState CohState, desc="Saved sysack info"; + MessageSizeType MessageSize, desc="Saved sysack info"; + MachineID Sender, desc="sender"; + } + + structure(TBETable, external = "yes") { + TBE lookup(Addr); + void allocate(Addr); + void deallocate(Addr); + bool isPresent(Addr); + } + + // ** OBJECTS ** + TBETable TBEs, template="<TCCdir_TBE>", constructor="m_number_of_TBEs"; + int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()"; + NetDest TCC_dir_subtree; + NetDest temp; + + Tick clockEdge(); + Tick cyclesToTicks(Cycles c); + + void set_cache_entry(AbstractCacheEntry b); + void unset_cache_entry(); + void set_tbe(TBE b); + void 
unset_tbe(); + + + bool presentOrAvail(Addr addr) { + return directory.isTagPresent(addr) || directory.cacheAvail(addr); + } + + Entry getCacheEntry(Addr addr), return_by_pointer="yes" { + return static_cast(Entry, "pointer", directory.lookup(addr)); + } + + DataBlock getDataBlock(Addr addr), return_by_ref="yes" { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return tbe.DataBlk; + } else { + assert(false); + return getCacheEntry(addr).DataBlk; + } + } + + State getState(TBE tbe, Entry cache_entry, Addr addr) { + if(is_valid(tbe)) { + return tbe.TBEState; + } else if (is_valid(cache_entry)) { + return cache_entry.CacheState; + } + return State:I; + } + + void setAccessPermission(Entry cache_entry, Addr addr, State state) { + if (is_valid(cache_entry)) { + cache_entry.changePermission(TCCdir_State_to_permission(state)); + } + } + + AccessPermission getAccessPermission(Addr addr) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return TCCdir_State_to_permission(tbe.TBEState); + } + + Entry cache_entry := getCacheEntry(addr); + if(is_valid(cache_entry)) { + return TCCdir_State_to_permission(cache_entry.CacheState); + } + + return AccessPermission:NotPresent; + } + + void functionalRead(Addr addr, Packet *pkt) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + testAndRead(addr, tbe.DataBlk, pkt); + } else { + functionalMemoryRead(pkt); + } + } + + int functionalWrite(Addr addr, Packet *pkt) { + int num_functional_writes := 0; + + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + num_functional_writes := num_functional_writes + + testAndWrite(addr, tbe.DataBlk, pkt); + } + + num_functional_writes := num_functional_writes + functionalMemoryWrite(pkt); + return num_functional_writes; + } + + void setState(TBE tbe, Entry cache_entry, Addr addr, State state) { + if (is_valid(tbe)) { + tbe.TBEState := state; + } + + if (is_valid(cache_entry)) { + cache_entry.CacheState := state; + + if (state == State:S) { + assert(cache_entry.Owner.count() == 0); + } + + if (state == State:O) { + assert(cache_entry.Owner.count() == 1); + assert(cache_entry.Sharers.isSuperset(cache_entry.Owner) == false); + } + + if (state == State:M) { + assert(cache_entry.Owner.count() == 1); + assert(cache_entry.Sharers.count() == 0); + } + + if (state == State:E) { + assert(cache_entry.Owner.count() == 0); + assert(cache_entry.Sharers.count() == 1); + } + } + } + + + + void recordRequestType(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + directory.recordRequestType(CacheRequestType:DataArrayRead, addr); + } else if (request_type == RequestType:DataArrayWrite) { + directory.recordRequestType(CacheRequestType:DataArrayWrite, addr); + } else if (request_type == RequestType:TagArrayRead) { + directory.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:TagArrayWrite) { + directory.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } + } + + bool checkResourceAvailable(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + return directory.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:DataArrayWrite) { + return directory.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:TagArrayRead) { + return directory.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:TagArrayWrite) { + return 
directory.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else { + error("Invalid RequestType type in checkResourceAvailable"); + return true; + } + } + + // ** OUT_PORTS ** + + // Three classes of ports + // Class 1: downward facing network links to NB + out_port(requestToNB_out, CPURequestMsg, requestToNB); + out_port(responseToNB_out, ResponseMsg, responseToNB); + out_port(unblockToNB_out, UnblockMsg, unblockToNB); + + + // Class 2: upward facing ports to GPU cores + out_port(probeToCore_out, TDProbeRequestMsg, probeToCore); + out_port(responseToCore_out, ResponseMsg, responseToCore); + + // Class 3: sideward facing ports (on "wirebuffer" links) to TCC + out_port(w_requestTCC_out, CPURequestMsg, w_reqToTCC); + out_port(w_probeTCC_out, NBProbeRequestMsg, w_probeToTCC); + out_port(w_respTCC_out, ResponseMsg, w_respToTCC); + + + // local trigger port + out_port(triggerQueue_out, TriggerMsg, triggerQueue); + + // + // request queue going to NB + // + + // ** IN_PORTS ** + + // Trigger Queue + in_port(triggerQueue_in, TriggerMsg, triggerQueue, rank=8) { + if (triggerQueue_in.isReady(clockEdge())) { + peek(triggerQueue_in, TriggerMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + assert(is_valid(tbe)); + Entry cache_entry := getCacheEntry(in_msg.addr); + if ((in_msg.Type == TriggerType:AcksComplete) && (tbe.Upgrade == false)) { + trigger(Event:ProbeAcksComplete, in_msg.addr, cache_entry, tbe); + } else if ((in_msg.Type == TriggerType:AcksComplete) && (tbe.Upgrade == true)) { + trigger(Event:ProbeAcksCompleteReissue, in_msg.addr, cache_entry, tbe); + } + } + } + } + + // Unblock Networks (TCCdir can receive unblocks from TCC, TCPs) + // Port on first (of three) wire buffers from TCC + in_port(w_TCCUnblock_in, UnblockMsg, w_TCCUnblockToTCCDir, rank=7) { + if (w_TCCUnblock_in.isReady(clockEdge())) { + peek(w_TCCUnblock_in, UnblockMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + if (in_msg.currentOwner) { + trigger(Event:TCCUnblock, in_msg.addr, cache_entry, tbe); + } else if (in_msg.valid) { + trigger(Event:TCCUnblock_Sharer, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:TCCUnblock_NotValid, in_msg.addr, cache_entry, tbe); + } + } + } + } + + in_port(unblockNetwork_in, UnblockMsg, unblockFromTCP, rank=6) { + if (unblockNetwork_in.isReady(clockEdge())) { + peek(unblockNetwork_in, UnblockMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + if(cache_entry.WaitingUnblocks == 1) { + trigger(Event:LastCoreUnblock, in_msg.addr, cache_entry, tbe); + } + else { + trigger(Event:CoreUnblock, in_msg.addr, cache_entry, tbe); + } + } + } + } + + + //Responses from TCC, and Cores + // Port on second (of three) wire buffers from TCC + in_port(w_TCCResponse_in, ResponseMsg, w_respToTCCDir, rank=5) { + if (w_TCCResponse_in.isReady(clockEdge())) { + peek(w_TCCResponse_in, ResponseMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + if (in_msg.Type == CoherenceResponseType:CPUPrbResp) { + trigger(Event:TCCPrbResp, in_msg.addr, cache_entry, tbe); + } + } + } + } + + in_port(responseNetwork_in, ResponseMsg, responseFromTCP, rank=4) { + if (responseNetwork_in.isReady(clockEdge())) { + peek(responseNetwork_in, ResponseMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + if (in_msg.Type == CoherenceResponseType:CPUPrbResp) { + trigger(Event:CPUPrbResp, in_msg.addr, cache_entry, tbe); + } + } + } + } + + + // 
Port on third (of three) wire buffers from TCC + in_port(w_TCCRequest_in, CPURequestMsg, w_reqToTCCDir, rank=3) { + if(w_TCCRequest_in.isReady(clockEdge())) { + peek(w_TCCRequest_in, CPURequestMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + if (in_msg.Type == CoherenceRequestType:WrCancel) { + trigger(Event:CancelWB, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:VicDirty) { + if (is_valid(cache_entry) && cache_entry.Owner.isElement(in_msg.Requestor)) { + // if modified, or owner with no other sharers + if ((cache_entry.CacheState == State:M) || (cache_entry.Sharers.count() == 0)) { + assert(cache_entry.Owner.count()==1); + trigger(Event:VicDirtyLast, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:VicDirty, in_msg.addr, cache_entry, tbe); + } + } else { + trigger(Event:StaleVic, in_msg.addr, cache_entry, tbe); + } + } else { + if (in_msg.Type == CoherenceRequestType:VicClean) { + if (is_valid(cache_entry) && cache_entry.Sharers.isElement(in_msg.Requestor)) { + if (cache_entry.Sharers.count() == 1) { + // Last copy, victimize to L3 + trigger(Event:VicClean, in_msg.addr, cache_entry, tbe); + } else { + // Either not the last copy or stall. No need to victimmize + // remove sharer from sharer list + assert(cache_entry.Sharers.count() > 1); + trigger(Event:NoVic, in_msg.addr, cache_entry, tbe); + } + } else { + trigger(Event:StaleVic, in_msg.addr, cache_entry, tbe); + } + } + } + } + } + } + + in_port(responseFromNB_in, ResponseMsg, responseFromNB, rank=2) { + if (responseFromNB_in.isReady(clockEdge())) { + peek(responseFromNB_in, ResponseMsg, block_on="addr") { + + TBE tbe := TBEs.lookup(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + if (in_msg.Type == CoherenceResponseType:NBSysResp) { + if (in_msg.State == CoherenceState:Modified) { + if (in_msg.CtoD) { + trigger(Event:NB_AckCtoD, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:NB_AckM, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.State == CoherenceState:Shared) { + trigger(Event:NB_AckS, in_msg.addr, cache_entry, tbe); + } else if (in_msg.State == CoherenceState:Exclusive) { + trigger(Event:NB_AckE, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.Type == CoherenceResponseType:NBSysWBAck) { + trigger(Event:NB_AckWB, in_msg.addr, cache_entry, tbe); + } else { + error("Unexpected Response Message to Core"); + } + } + } + } + + // Finally handling incoming requests (from TCP) and probes (from NB). 
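+  // These two ports carry the lowest ranks (probe = 1, core request = 0);
+  // the trigger, unblock, response, and TCC-side ports above use higher
+  // ranks so that in-flight transactions drain before new work is admitted.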
+ + in_port(probeNetwork_in, NBProbeRequestMsg, probeFromNB, rank=1) { + if (probeNetwork_in.isReady(clockEdge())) { + peek(probeNetwork_in, NBProbeRequestMsg) { + DPRINTF(RubySlicc, "%s\n", in_msg); + DPRINTF(RubySlicc, "machineID: %s\n", machineID); + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + + if (in_msg.Type == ProbeRequestType:PrbInv) { + if (in_msg.ReturnData) { + trigger(Event:PrbInvData, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.Type == ProbeRequestType:PrbDowngrade) { + assert(in_msg.ReturnData); + trigger(Event:PrbShrData, in_msg.addr, cache_entry, tbe); + } + } + } + } + + + in_port(coreRequestNetwork_in, CPURequestMsg, requestFromTCP, rank=0) { + if (coreRequestNetwork_in.isReady(clockEdge())) { + peek(coreRequestNetwork_in, CPURequestMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + if (presentOrAvail(in_msg.addr)) { + if (in_msg.Type == CoherenceRequestType:VicDirty) { + trigger(Event:CPUWrite, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:VicClean) { + if (is_valid(cache_entry) && cache_entry.Owner.isElement(in_msg.Requestor)) { + trigger(Event:CPUWrite, in_msg.addr, cache_entry, tbe); + } else if(is_valid(cache_entry) && (cache_entry.Sharers.count() + cache_entry.Owner.count() ) >1) { + trigger(Event:NoCPUWrite, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:CPUWrite, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.Type == CoherenceRequestType:RdBlk) { + trigger(Event:RdBlk, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:RdBlkS) { + trigger(Event:RdBlkS, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:RdBlkM) { + trigger(Event:RdBlkM, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:WrCancel) { + trigger(Event:CPUWriteCancel, in_msg.addr, cache_entry, tbe); + } + } else { + // All requests require a directory entry + Addr victim := directory.cacheProbe(in_msg.addr); + trigger(Event:Recall, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } + } + } + + + + + // Actions + + //Downward facing actions + + action(c_clearOwner, "c", desc="Clear the owner field") { + cache_entry.Owner.clear(); + } + + action(rS_removeRequesterFromSharers, "rS", desc="Remove unblocker from sharer list") { + peek(unblockNetwork_in, UnblockMsg) { + cache_entry.Sharers.remove(in_msg.Sender); + } + } + + action(rT_removeTCCFromSharers, "rT", desc="Remove TCC from sharer list") { + peek(w_TCCRequest_in, CPURequestMsg) { + cache_entry.Sharers.remove(in_msg.Requestor); + } + } + + action(rO_removeOriginalRequestorFromSharers, "rO", desc="Remove replacing core from sharer list") { + peek(coreRequestNetwork_in, CPURequestMsg) { + cache_entry.Sharers.remove(in_msg.Requestor); + } + } + + action(rC_removeCoreFromSharers, "rC", desc="Remove replacing core from sharer list") { + peek(coreRequestNetwork_in, CPURequestMsg) { + cache_entry.Sharers.remove(in_msg.Requestor); + } + } + + action(rCo_removeCoreFromOwner, "rCo", desc="Remove replacing core from sharer list") { + // Note that under some cases this action will try to remove a stale owner + peek(coreRequestNetwork_in, CPURequestMsg) { + cache_entry.Owner.remove(in_msg.Requestor); + } + } + + action(rR_removeResponderFromSharers, "rR", desc="Remove responder from sharer list") { + peek(responseNetwork_in, ResponseMsg) { + 
cache_entry.Sharers.remove(in_msg.Sender); + } + } + + action(nC_sendNullWBAckToCore, "nC", desc = "send a null WB Ack to release core") { + peek(coreRequestNetwork_in, CPURequestMsg) { + enqueue(responseToCore_out, ResponseMsg, 1) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysWBNack; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.MessageSize := in_msg.MessageSize; + } + } + } + + action(nT_sendNullWBAckToTCC, "nT", desc = "send a null WB Ack to release TCC") { + peek(w_TCCRequest_in, CPURequestMsg) { + enqueue(w_respTCC_out, ResponseMsg, 1) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysWBAck; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.MessageSize := in_msg.MessageSize; + } + } + } + + action(eto_moveExSharerToOwner, "eto", desc="move the current exclusive sharer to owner") { + assert(cache_entry.Sharers.count() == 1); + assert(cache_entry.Owner.count() == 0); + cache_entry.Owner := cache_entry.Sharers; + cache_entry.Sharers.clear(); + APPEND_TRANSITION_COMMENT(" new owner "); + APPEND_TRANSITION_COMMENT(cache_entry.Owner); + } + + action(aT_addTCCToSharers, "aT", desc="Add TCC to sharer list") { + peek(w_TCCUnblock_in, UnblockMsg) { + cache_entry.Sharers.add(in_msg.Sender); + } + } + + action(as_addToSharers, "as", desc="Add unblocker to sharer list") { + peek(unblockNetwork_in, UnblockMsg) { + cache_entry.Sharers.add(in_msg.Sender); + } + } + + action(c_moveOwnerToSharer, "cc", desc="Move owner to sharers") { + cache_entry.Sharers.addNetDest(cache_entry.Owner); + cache_entry.Owner.clear(); + } + + action(cc_clearSharers, "\c", desc="Clear the sharers field") { + cache_entry.Sharers.clear(); + } + + action(e_ownerIsUnblocker, "e", desc="The owner is now the unblocker") { + peek(unblockNetwork_in, UnblockMsg) { + cache_entry.Owner.clear(); + cache_entry.Owner.add(in_msg.Sender); + APPEND_TRANSITION_COMMENT(" tcp_ub owner "); + APPEND_TRANSITION_COMMENT(cache_entry.Owner); + } + } + + action(eT_ownerIsUnblocker, "eT", desc="TCC (unblocker) is now owner") { + peek(w_TCCUnblock_in, UnblockMsg) { + cache_entry.Owner.clear(); + cache_entry.Owner.add(in_msg.Sender); + APPEND_TRANSITION_COMMENT(" tcc_ub owner "); + APPEND_TRANSITION_COMMENT(cache_entry.Owner); + } + } + + action(ctr_copyTCCResponseToTBE, "ctr", desc="Copy TCC probe response data to TBE") { + peek(w_TCCResponse_in, ResponseMsg) { + // Overwrite data if tbe does not hold dirty data. Stop once it is dirty. + if(tbe.Dirty == false) { + tbe.DataBlk := in_msg.DataBlk; + tbe.Dirty := in_msg.Dirty; + tbe.Sender := in_msg.Sender; + } + DPRINTF(RubySlicc, "%s\n", (tbe.DataBlk)); + } + } + + action(ccr_copyCoreResponseToTBE, "ccr", desc="Copy core probe response data to TBE") { + peek(responseNetwork_in, ResponseMsg) { + // Overwrite data if tbe does not hold dirty data. Stop once it is dirty. 
+ if(tbe.Dirty == false) { + tbe.DataBlk := in_msg.DataBlk; + tbe.Dirty := in_msg.Dirty; + + if(tbe.Sender == machineID) { + tbe.Sender := in_msg.Sender; + } + } + DPRINTF(RubySlicc, "%s\n", (tbe.DataBlk)); + } + } + + action(cd_clearDirtyBitTBE, "cd", desc="Clear Dirty bit in TBE") { + tbe.Dirty := false; + } + + action(n_issueRdBlk, "n-", desc="Issue RdBlk") { + enqueue(requestToNB_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:RdBlk; + out_msg.Requestor := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Request_Control; + } + } + + action(nS_issueRdBlkS, "nS", desc="Issue RdBlkS") { + enqueue(requestToNB_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:RdBlkS; + out_msg.Requestor := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Request_Control; + } + } + + action(nM_issueRdBlkM, "nM", desc="Issue RdBlkM") { + enqueue(requestToNB_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:RdBlkM; + out_msg.Requestor := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Request_Control; + } + } + + action(rU_rememberUpgrade, "rU", desc="Remember that this was an upgrade") { + tbe.Upgrade := true; + } + + action(ruo_rememberUntransferredOwner, "ruo", desc="Remember the untransferred owner") { + peek(responseNetwork_in, ResponseMsg) { + if(in_msg.UntransferredOwner == true) { + tbe.UntransferredOwner := in_msg.Sender; + tbe.UntransferredOwnerExists := true; + } + DPRINTF(RubySlicc, "%s\n", (in_msg)); + } + } + + action(ruoT_rememberUntransferredOwnerTCC, "ruoT", desc="Remember the untransferred owner") { + peek(w_TCCResponse_in, ResponseMsg) { + if(in_msg.UntransferredOwner == true) { + tbe.UntransferredOwner := in_msg.Sender; + tbe.UntransferredOwnerExists := true; + } + DPRINTF(RubySlicc, "%s\n", (in_msg)); + } + } + + action(vd_victim, "vd", desc="Victimize M/O Data") { + enqueue(requestToNB_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.Type := CoherenceRequestType:VicDirty; + if (cache_entry.CacheState == State:O) { + out_msg.Shared := true; + } else { + out_msg.Shared := false; + } + out_msg.Dirty := true; + } + } + + action(vc_victim, "vc", desc="Victimize E/S Data") { + enqueue(requestToNB_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.Type := CoherenceRequestType:VicClean; + if (cache_entry.CacheState == State:S) { + out_msg.Shared := true; + } else { + out_msg.Shared := false; + } + out_msg.Dirty := false; + } + } + + + action(sT_sendRequestToTCC, "sT", desc="send request to TCC") { + peek(coreRequestNetwork_in, CPURequestMsg) { + enqueue(w_requestTCC_out, CPURequestMsg, 1) { + out_msg.addr := address; + out_msg.Type := in_msg.Type; + out_msg.Requestor := in_msg.Requestor; + out_msg.DataBlk := in_msg.DataBlk; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.Shared := in_msg.Shared; + out_msg.MessageSize := 
in_msg.MessageSize; + } + APPEND_TRANSITION_COMMENT(" requestor "); + APPEND_TRANSITION_COMMENT(in_msg.Requestor); + + } + } + + + action(sc_probeShrCoreData, "sc", desc="probe shared cores, return data") { + MachineID tcc := mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits); + + temp := cache_entry.Sharers; + temp.addNetDest(cache_entry.Owner); + if (temp.isElement(tcc)) { + temp.remove(tcc); + } + if (temp.count() > 0) { + enqueue(probeToCore_out, TDProbeRequestMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbDowngrade; + out_msg.ReturnData := true; + out_msg.MessageSize := MessageSizeType:Control; + out_msg.Destination := temp; + tbe.NumPendingAcks := temp.count(); + if(cache_entry.CacheState == State:M) { + assert(tbe.NumPendingAcks == 1); + } + DPRINTF(RubySlicc, "%s\n", (out_msg)); + } + } + } + + action(ls2_probeShrL2Data, "ls2", desc="local probe downgrade L2, return data") { + MachineID tcc := mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits); + if ((cache_entry.Sharers.isElement(tcc)) || (cache_entry.Owner.isElement(tcc))) { + enqueue(w_probeTCC_out, TDProbeRequestMsg, 1) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbDowngrade; + out_msg.ReturnData := true; + out_msg.MessageSize := MessageSizeType:Control; + out_msg.Destination.add(tcc); + tbe.NumPendingAcks := tbe.NumPendingAcks + 1; + DPRINTF(RubySlicc, "%s\n", out_msg); + + } + } + } + + action(s2_probeShrL2Data, "s2", desc="probe shared L2, return data") { + MachineID tcc := mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits); + if ((cache_entry.Sharers.isElement(tcc)) || (cache_entry.Owner.isElement(tcc))) { + enqueue(w_probeTCC_out, TDProbeRequestMsg, 1) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbDowngrade; + out_msg.ReturnData := true; + out_msg.MessageSize := MessageSizeType:Control; + out_msg.Destination.add(tcc); + tbe.NumPendingAcks := tbe.NumPendingAcks + 1; + DPRINTF(RubySlicc, "%s\n", out_msg); + + } + } + } + + action(ldc_probeInvCoreData, "ldc", desc="local probe to inv cores, return data") { + MachineID tcc := mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits); + peek(coreRequestNetwork_in, CPURequestMsg) { + NetDest dest:= cache_entry.Sharers; + dest.addNetDest(cache_entry.Owner); + if(dest.isElement(tcc)){ + dest.remove(tcc); + } + dest.remove(in_msg.Requestor); + tbe.NumPendingAcks := dest.count(); + if (dest.count()>0){ + enqueue(probeToCore_out, TDProbeRequestMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbInv; + out_msg.ReturnData := true; + out_msg.MessageSize := MessageSizeType:Control; + + out_msg.Destination.addNetDest(dest); + if(cache_entry.CacheState == State:M) { + assert(tbe.NumPendingAcks == 1); + } + + DPRINTF(RubySlicc, "%s\n", (out_msg)); + } + } + } + } + + action(ld2_probeInvL2Data, "ld2", desc="local probe inv L2, return data") { + MachineID tcc := mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits); + if ((cache_entry.Sharers.isElement(tcc)) || (cache_entry.Owner.isElement(tcc))) { + enqueue(w_probeTCC_out, TDProbeRequestMsg, 1) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbInv; + out_msg.ReturnData := true; + out_msg.MessageSize := MessageSizeType:Control; + out_msg.Destination.add(tcc); + tbe.NumPendingAcks := tbe.NumPendingAcks + 1; + DPRINTF(RubySlicc, "%s\n", 
out_msg); + + } + } + } + + action(dc_probeInvCoreData, "dc", desc="probe inv cores + TCC, return data") { + MachineID tcc := mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits); + enqueue(probeToCore_out, TDProbeRequestMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbInv; + out_msg.ReturnData := true; + out_msg.MessageSize := MessageSizeType:Control; + + out_msg.Destination.addNetDest(cache_entry.Sharers); + out_msg.Destination.addNetDest(cache_entry.Owner); + tbe.NumPendingAcks := cache_entry.Sharers.count() + cache_entry.Owner.count(); + if(cache_entry.CacheState == State:M) { + assert(tbe.NumPendingAcks == 1); + } + if (out_msg.Destination.isElement(tcc)) { + out_msg.Destination.remove(tcc); + tbe.NumPendingAcks := tbe.NumPendingAcks - 1; + } + + DPRINTF(RubySlicc, "%s\n", (out_msg)); + } + } + + action(d2_probeInvL2Data, "d2", desc="probe inv L2, return data") { + MachineID tcc := mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits); + if ((cache_entry.Sharers.isElement(tcc)) || (cache_entry.Owner.isElement(tcc))) { + enqueue(w_probeTCC_out, TDProbeRequestMsg, 1) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbInv; + out_msg.ReturnData := true; + out_msg.MessageSize := MessageSizeType:Control; + out_msg.Destination.add(tcc); + tbe.NumPendingAcks := tbe.NumPendingAcks + 1; + DPRINTF(RubySlicc, "%s\n", out_msg); + + } + } + } + + action(lpc_probeInvCore, "lpc", desc="local probe inv cores, no data") { + peek(coreRequestNetwork_in, CPURequestMsg) { + TCC_dir_subtree.broadcast(MachineType:TCP); + TCC_dir_subtree.broadcast(MachineType:SQC); + + temp := cache_entry.Sharers; + temp := temp.OR(cache_entry.Owner); + TCC_dir_subtree := TCC_dir_subtree.AND(temp); + tbe.NumPendingAcks := TCC_dir_subtree.count(); + if(cache_entry.CacheState == State:M) { + assert(tbe.NumPendingAcks == 1); + } + if(TCC_dir_subtree.isElement(in_msg.Requestor)) { + TCC_dir_subtree.remove(in_msg.Requestor); + tbe.NumPendingAcks := tbe.NumPendingAcks - 1; + } + + if(TCC_dir_subtree.count() > 0) { + enqueue(probeToCore_out, TDProbeRequestMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbInv; + out_msg.ReturnData := false; + out_msg.MessageSize := MessageSizeType:Control; + out_msg.localCtoD := true; + + out_msg.Destination.addNetDest(TCC_dir_subtree); + + DPRINTF(RubySlicc, "%s\n", (out_msg)); + } + } + } + } + + action(ipc_probeInvCore, "ipc", desc="probe inv cores, no data") { + TCC_dir_subtree.broadcast(MachineType:TCP); + TCC_dir_subtree.broadcast(MachineType:SQC); + + temp := cache_entry.Sharers; + temp := temp.OR(cache_entry.Owner); + TCC_dir_subtree := TCC_dir_subtree.AND(temp); + tbe.NumPendingAcks := TCC_dir_subtree.count(); + if(TCC_dir_subtree.count() > 0) { + + enqueue(probeToCore_out, TDProbeRequestMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbInv; + out_msg.ReturnData := false; + out_msg.MessageSize := MessageSizeType:Control; + + out_msg.Destination.addNetDest(TCC_dir_subtree); + if(cache_entry.CacheState == State:M) { + assert(tbe.NumPendingAcks == 1); + } + + DPRINTF(RubySlicc, "%s\n", (out_msg)); + } + } + } + + action(i2_probeInvL2, "i2", desc="probe inv L2, no data") { + MachineID tcc := mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits); + if ((cache_entry.Sharers.isElement(tcc)) || (cache_entry.Owner.isElement(tcc))) { + enqueue(w_probeTCC_out, 
TDProbeRequestMsg, 1) { + tbe.NumPendingAcks := tbe.NumPendingAcks + 1; + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbInv; + out_msg.ReturnData := false; + out_msg.MessageSize := MessageSizeType:Control; + out_msg.Destination.add(tcc); + DPRINTF(RubySlicc, "%s\n", out_msg); + + } + } + } + + action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") { + enqueue(responseToNB_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // TCC, L3 respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.Dirty := false; + out_msg.Hit := false; + out_msg.Ntsl := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(pim_sendProbeResponseInvMs, "pim", desc="send probe ack inv, no data") { + enqueue(responseToNB_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and TCC respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.Dirty := false; + out_msg.Ntsl := true; + out_msg.Hit := false; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(prm_sendProbeResponseMiss, "prm", desc="send probe ack PrbShrData, no data") { + enqueue(responseToNB_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and TCC respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.Dirty := false; // only true if sending back data i think + out_msg.Hit := false; + out_msg.Ntsl := false; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + + + action(pd_sendProbeResponseData, "pd", desc="send probe ack, with data") { + enqueue(responseToNB_out, ResponseMsg, issue_latency) { + assert(is_valid(cache_entry) || is_valid(tbe)); + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.DataBlk := getDataBlock(address); + if (is_valid(tbe)) { + out_msg.Dirty := tbe.Dirty; + } + out_msg.Hit := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Data; + } + } + + + action(pdm_sendProbeResponseDataMs, "pdm", desc="send probe ack, with data") { + enqueue(responseToNB_out, ResponseMsg, issue_latency) { + assert(is_valid(cache_entry) || is_valid(tbe)); + assert(is_valid(cache_entry)); + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.DataBlk := getDataBlock(address); + if (is_valid(tbe)) { + out_msg.Dirty := tbe.Dirty; + } + out_msg.Hit := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Data; + } + } + + action(mc_cancelWB, "mc", desc="send writeback cancel to NB directory") { + enqueue(requestToNB_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:WrCancel; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.Requestor := machineID; + out_msg.MessageSize := MessageSizeType:Request_Control; + } + } + + 
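The probe fan-out actions above (sc_probeShrCoreData, ldc_probeInvCoreData, dc_probeInvCoreData, i2_probeInvL2 and their L2 variants) share one bookkeeping pattern: tbe.NumPendingAcks is sized to the probe destination set when the probes are sent, and every CPUPrbResp or TCCPrbResp later decrements it (x_decrementAcks and o_checkForAckCompletion, defined further down) until an AcksComplete trigger fires and the ProbeAcksComplete event unblocks the transaction. A minimal C++ sketch of that pattern follows; it is illustrative only, not part of the patch, and all names are invented.

    #include <cassert>
    #include <cstdio>
    #include <set>
    #include <string>

    // Per-transaction state, standing in for the SLICC TBE.
    struct TBE {
        int numPendingAcks = 0;
    };

    // Fan-out: one ack is expected from every probe destination (requestor excluded).
    int sendProbes(TBE &tbe, std::set<std::string> dests, const std::string &requestor) {
        dests.erase(requestor);
        tbe.numPendingAcks = static_cast<int>(dests.size());
        for (const auto &d : dests)
            std::printf("probe -> %s\n", d.c_str());
        return tbe.numPendingAcks;
    }

    // Collection: each CPUPrbResp/TCCPrbResp decrements the counter; reaching zero
    // is what o_checkForAckCompletion turns into an AcksComplete trigger message.
    bool receiveAck(TBE &tbe) {
        assert(tbe.numPendingAcks > 0);
        --tbe.numPendingAcks;
        return tbe.numPendingAcks == 0;
    }

    int main() {
        TBE tbe;
        int expected = sendProbes(tbe, {"TCP0", "TCP1", "SQC0"}, "TCP0");
        for (int i = 0; i < expected; ++i) {
            if (receiveAck(tbe))
                std::printf("ProbeAcksComplete\n");
        }
    }

Because the counter lives in the per-address TBE, only the probed address is blocked while acks are outstanding; transactions for other addresses continue to flow.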
action(sCS_sendCollectiveResponseS, "sCS", desc="send shared response to all merged TCP/SQC") { + enqueue(responseToCore_out, ResponseMsg, 1) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysResp; + out_msg.Sender := tbe.Sender; + out_msg.DataBlk := tbe.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.CtoD := false; + out_msg.State := CoherenceState:Shared; + out_msg.Destination.addNetDest(cache_entry.MergedSharers); + out_msg.Shared := tbe.Shared; + out_msg.Dirty := tbe.Dirty; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(sS_sendResponseS, "sS", desc="send shared response to TCP/SQC") { + enqueue(responseToCore_out, ResponseMsg, 1) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysResp; + out_msg.Sender := tbe.Sender; + out_msg.DataBlk := tbe.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.CtoD := false; + out_msg.State := CoherenceState:Shared; + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.Shared := tbe.Shared; + out_msg.Dirty := tbe.Dirty; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(sM_sendResponseM, "sM", desc="send response to TCP/SQC") { + enqueue(responseToCore_out, ResponseMsg, 1) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysResp; + out_msg.Sender := tbe.Sender; + out_msg.DataBlk := tbe.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.CtoD := false; + out_msg.State := CoherenceState:Modified; + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.Shared := tbe.Shared; + out_msg.Dirty := tbe.Dirty; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + + + action(fw2_forwardWBAck, "fw2", desc="forward WBAck to TCC") { + peek(responseFromNB_in, ResponseMsg) { + if(tbe.OriginalRequestor != machineID) { + enqueue(w_respTCC_out, ResponseMsg, 1) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysWBAck; + out_msg.Sender := machineID; + //out_msg.DataBlk := tbe.DataBlk; + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.MessageSize := in_msg.MessageSize; + } + } + } + } + + action(sa_saveSysAck, "sa", desc="Save SysAck ") { + peek(responseFromNB_in, ResponseMsg) { + tbe.Dirty := in_msg.Dirty; + if (tbe.Dirty == false) { + tbe.DataBlk := in_msg.DataBlk; + } + else { + tbe.DataBlk := tbe.DataBlk; + } + tbe.CtoD := in_msg.CtoD; + tbe.CohState := in_msg.State; + tbe.Shared := in_msg.Shared; + tbe.MessageSize := in_msg.MessageSize; + } + } + + action(fsa_forwardSavedAck, "fsa", desc="forward saved SysAck to TCP or SQC") { + enqueue(responseToCore_out, ResponseMsg, 1) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysResp; + out_msg.Sender := machineID; + if (tbe.Dirty == false) { + out_msg.DataBlk := tbe.DataBlk; + } + else { + out_msg.DataBlk := tbe.DataBlk; + } + out_msg.CtoD := tbe.CtoD; + out_msg.State := tbe.CohState; + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.Shared := tbe.Shared; + out_msg.MessageSize := tbe.MessageSize; + out_msg.Dirty := tbe.Dirty; + out_msg.Sender := tbe.Sender; + } + } + + action(fa_forwardSysAck, "fa", desc="forward SysAck to TCP or SQC") { + peek(responseFromNB_in, ResponseMsg) { + enqueue(responseToCore_out, ResponseMsg, 1) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysResp; + out_msg.Sender := machineID; + if (tbe.Dirty == false) { + out_msg.DataBlk := in_msg.DataBlk; + tbe.Sender := machineID; + } + else { + out_msg.DataBlk := tbe.DataBlk; + } + out_msg.CtoD := 
in_msg.CtoD; + out_msg.State := in_msg.State; + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.Shared := in_msg.Shared; + out_msg.MessageSize := in_msg.MessageSize; + out_msg.Dirty := in_msg.Dirty; + out_msg.Sender := tbe.Sender; + DPRINTF(RubySlicc, "%s\n", (out_msg.DataBlk)); + } + } + } + + action(pso_probeSharedDataOwner, "pso", desc="probe shared data at owner") { + MachineID tcc := mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits); + if (cache_entry.Owner.isElement(tcc)) { + enqueue(w_probeTCC_out, TDProbeRequestMsg, 1) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbDowngrade; + out_msg.ReturnData := true; + out_msg.MessageSize := MessageSizeType:Control; + out_msg.Destination.add(tcc); + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + else { // i.e., owner is a core + enqueue(probeToCore_out, TDProbeRequestMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbDowngrade; + out_msg.ReturnData := true; + out_msg.MessageSize := MessageSizeType:Control; + out_msg.Destination.addNetDest(cache_entry.Owner); + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + tbe.NumPendingAcks := 1; + } + + action(i_popIncomingRequestQueue, "i", desc="Pop incoming request queue") { + coreRequestNetwork_in.dequeue(clockEdge()); + } + + action(j_popIncomingUnblockQueue, "j", desc="Pop incoming unblock queue") { + unblockNetwork_in.dequeue(clockEdge()); + } + + action(pk_popResponseQueue, "pk", desc="Pop response queue") { + responseNetwork_in.dequeue(clockEdge()); + } + + action(pp_popProbeQueue, "pp", desc="Pop incoming probe queue") { + probeNetwork_in.dequeue(clockEdge()); + } + + action(pR_popResponseFromNBQueue, "pR", desc="Pop incoming Response queue From NB") { + responseFromNB_in.dequeue(clockEdge()); + } + + action(pt_popTriggerQueue, "pt", desc="pop trigger queue") { + triggerQueue_in.dequeue(clockEdge()); + } + + action(pl_popTCCRequestQueue, "pl", desc="pop TCC request queue") { + w_TCCRequest_in.dequeue(clockEdge()); + } + + action(plr_popTCCResponseQueue, "plr", desc="pop TCC response queue") { + w_TCCResponse_in.dequeue(clockEdge()); + } + + action(plu_popTCCUnblockQueue, "plu", desc="pop TCC unblock queue") { + w_TCCUnblock_in.dequeue(clockEdge()); + } + + + action(m_addUnlockerToSharers, "m", desc="Add the unlocker to the sharer list") { + peek(unblockNetwork_in, UnblockMsg) { + cache_entry.Sharers.add(in_msg.Sender); + cache_entry.MergedSharers.remove(in_msg.Sender); + assert(cache_entry.WaitingUnblocks >= 0); + cache_entry.WaitingUnblocks := cache_entry.WaitingUnblocks - 1; + } + } + + action(q_addOutstandingMergedSharer, "q", desc="Increment outstanding requests") { + peek(coreRequestNetwork_in, CPURequestMsg) { + cache_entry.MergedSharers.add(in_msg.Requestor); + cache_entry.WaitingUnblocks := cache_entry.WaitingUnblocks + 1; + } + } + + action(uu_sendUnblock, "uu", desc="state changed, unblock") { + enqueue(unblockToNB_out, UnblockMsg, issue_latency) { + out_msg.addr := address; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Unblock_Control; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(zz_recycleRequest, "\z", desc="Recycle the request queue") { + coreRequestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(yy_recycleTCCRequestQueue, "yy", desc="recycle yy request queue") { + w_TCCRequest_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(xz_recycleResponseQueue, "xz", 
desc="recycle response queue") { + responseNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(xx_recycleTCCResponseQueue, "xx", desc="recycle TCC response queue") { + w_TCCResponse_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(vv_recycleTCCUnblockQueue, "vv", desc="Recycle the probe request queue") { + w_TCCUnblock_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(xy_recycleUnblockQueue, "xy", desc="Recycle the probe request queue") { + w_TCCUnblock_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(ww_recycleProbeRequest, "ww", desc="Recycle the probe request queue") { + probeNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(x_decrementAcks, "x", desc="decrement Acks pending") { + tbe.NumPendingAcks := tbe.NumPendingAcks - 1; + } + + action(o_checkForAckCompletion, "o", desc="check for ack completion") { + if (tbe.NumPendingAcks == 0) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:AcksComplete; + } + } + APPEND_TRANSITION_COMMENT(" tbe acks "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + } + + action(tp_allocateTBE, "tp", desc="allocate TBE Entry for upward transactions") { + check_allocate(TBEs); + peek(probeNetwork_in, NBProbeRequestMsg) { + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + tbe.Dirty := false; + tbe.NumPendingAcks := 0; + tbe.UntransferredOwnerExists := false; + } + } + + action(tv_allocateTBE, "tv", desc="allocate TBE Entry for TCC transactions") { + check_allocate(TBEs); + peek(w_TCCRequest_in, CPURequestMsg) { + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + tbe.DataBlk := in_msg.DataBlk; // Data only for WBs + tbe.Dirty := false; + tbe.OriginalRequestor := in_msg.Requestor; + tbe.NumPendingAcks := 0; + tbe.UntransferredOwnerExists := false; + } + } + + action(t_allocateTBE, "t", desc="allocate TBE Entry") { + check_allocate(TBEs);//check whether resources are full + peek(coreRequestNetwork_in, CPURequestMsg) { + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + tbe.DataBlk := cache_entry.DataBlk; // Data only for WBs + tbe.Dirty := false; + tbe.Upgrade := false; + tbe.OriginalRequestor := in_msg.Requestor; + tbe.NumPendingAcks := 0; + tbe.UntransferredOwnerExists := false; + tbe.Sender := machineID; + } + } + + action(tr_allocateTBE, "tr", desc="allocate TBE Entry for recall") { + check_allocate(TBEs);//check whether resources are full + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + tbe.DataBlk := cache_entry.DataBlk; // Data only for WBs + tbe.Dirty := false; + tbe.Upgrade := false; + tbe.OriginalRequestor := machineID; //Recall request, Self initiated + tbe.NumPendingAcks := 0; + tbe.UntransferredOwnerExists := false; + } + + action(dt_deallocateTBE, "dt", desc="Deallocate TBE entry") { + TBEs.deallocate(address); + unset_tbe(); + } + + + action(d_allocateDir, "d", desc="allocate Directory Cache") { + if (is_invalid(cache_entry)) { + set_cache_entry(directory.allocate(address, new Entry)); + } + } + + action(dd_deallocateDir, "dd", desc="deallocate Directory Cache") { + if (is_valid(cache_entry)) { + directory.deallocate(address); + } + unset_cache_entry(); + } + + action(ss_sendStaleNotification, "ss", desc="stale data; nothing to writeback") { + enqueue(responseToNB_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:StaleNotif; + 
out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.Sender := machineID; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(wb_data, "wb", desc="write back data") { + enqueue(responseToNB_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUData; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.DataBlk := tbe.DataBlk; + out_msg.Dirty := tbe.Dirty; + if (tbe.Shared) { + out_msg.NbReqShared := true; + } else { + out_msg.NbReqShared := false; + } + out_msg.State := CoherenceState:Shared; // faux info + out_msg.MessageSize := MessageSizeType:Writeback_Data; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(sf_setSharedFlip, "sf", desc="hit by shared probe, status may be different") { + assert(is_valid(tbe)); + tbe.Shared := true; + } + + action(y_writeDataToTBE, "y", desc="write Probe Data to TBE") { + peek(responseNetwork_in, ResponseMsg) { + if (!tbe.Dirty || in_msg.Dirty) { + tbe.DataBlk := in_msg.DataBlk; + tbe.Dirty := in_msg.Dirty; + } + if (in_msg.Hit) { + tbe.Cached := true; + } + } + } + + action(ty_writeTCCDataToTBE, "ty", desc="write TCC Probe Data to TBE") { + peek(w_TCCResponse_in, ResponseMsg) { + if (!tbe.Dirty || in_msg.Dirty) { + tbe.DataBlk := in_msg.DataBlk; + tbe.Dirty := in_msg.Dirty; + } + if (in_msg.Hit) { + tbe.Cached := true; + } + } + } + + + action(ut_updateTag, "ut", desc="update Tag (i.e. set MRU)") { + directory.setMRU(address); + } + + // TRANSITIONS + + // Handling TCP/SQC requests (similar to how NB dir handles TCC events with some changes to account for stateful directory). + + + // transitions from base + transition(I, RdBlk, I_ES){TagArrayRead} { + d_allocateDir; + t_allocateTBE; + n_issueRdBlk; + i_popIncomingRequestQueue; + } + + transition(I, RdBlkS, I_S){TagArrayRead} { + d_allocateDir; + t_allocateTBE; + nS_issueRdBlkS; + i_popIncomingRequestQueue; + } + + + transition(I_S, NB_AckS, BBB_S) { + fa_forwardSysAck; + pR_popResponseFromNBQueue; + } + + transition(I_ES, NB_AckS, BBB_S) { + fa_forwardSysAck; + pR_popResponseFromNBQueue; + } + + transition(I_ES, NB_AckE, BBB_E) { + fa_forwardSysAck; + pR_popResponseFromNBQueue; + } + + transition({S_M, O_M}, {NB_AckCtoD,NB_AckM}, BBB_M) { + fa_forwardSysAck; + pR_popResponseFromNBQueue; + } + + transition(I_M, NB_AckM, BBB_M) { + fa_forwardSysAck; + pR_popResponseFromNBQueue; + } + + transition(BBB_M, CoreUnblock, M){TagArrayWrite} { + c_clearOwner; + cc_clearSharers; + e_ownerIsUnblocker; + uu_sendUnblock; + dt_deallocateTBE; + j_popIncomingUnblockQueue; + } + + transition(BBB_S, CoreUnblock, S){TagArrayWrite} { + as_addToSharers; + uu_sendUnblock; + dt_deallocateTBE; + j_popIncomingUnblockQueue; + } + + transition(BBB_E, CoreUnblock, E){TagArrayWrite} { + as_addToSharers; + uu_sendUnblock; + dt_deallocateTBE; + j_popIncomingUnblockQueue; + } + + + transition(I, RdBlkM, I_M){TagArrayRead} { + d_allocateDir; + t_allocateTBE; + nM_issueRdBlkM; + i_popIncomingRequestQueue; + } + + // + transition(S, {RdBlk, RdBlkS}, BBS_S){TagArrayRead} { + t_allocateTBE; + sc_probeShrCoreData; + s2_probeShrL2Data; + q_addOutstandingMergedSharer; + i_popIncomingRequestQueue; + } + // Merging of read sharing into a single request + transition(BBS_S, {RdBlk, RdBlkS}) { + q_addOutstandingMergedSharer; + i_popIncomingRequestQueue; + } + // Wait for probe acks to be complete + transition(BBS_S, CPUPrbResp) { + ccr_copyCoreResponseToTBE; + x_decrementAcks; + 
o_checkForAckCompletion; + pk_popResponseQueue; + } + + transition(BBS_S, TCCPrbResp) { + ctr_copyTCCResponseToTBE; + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + + // Window for merging complete with this transition + // Send responses to all outstanding + transition(BBS_S, ProbeAcksComplete, BB_S) { + sCS_sendCollectiveResponseS; + pt_popTriggerQueue; + } + + transition(BB_S, CoreUnblock, BB_S) { + m_addUnlockerToSharers; + j_popIncomingUnblockQueue; + } + + transition(BB_S, LastCoreUnblock, S) { + m_addUnlockerToSharers; + dt_deallocateTBE; + j_popIncomingUnblockQueue; + } + + transition(O, {RdBlk, RdBlkS}, BBO_O){TagArrayRead} { + t_allocateTBE; + pso_probeSharedDataOwner; + q_addOutstandingMergedSharer; + i_popIncomingRequestQueue; + } + // Merging of read sharing into a single request + transition(BBO_O, {RdBlk, RdBlkS}) { + q_addOutstandingMergedSharer; + i_popIncomingRequestQueue; + } + + // Wait for probe acks to be complete + transition(BBO_O, CPUPrbResp) { + ccr_copyCoreResponseToTBE; + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + + transition(BBO_O, TCCPrbResp) { + ctr_copyTCCResponseToTBE; + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + + // Window for merging complete with this transition + // Send responses to all outstanding + transition(BBO_O, ProbeAcksComplete, BB_OO) { + sCS_sendCollectiveResponseS; + pt_popTriggerQueue; + } + + transition(BB_OO, CoreUnblock) { + m_addUnlockerToSharers; + j_popIncomingUnblockQueue; + } + + transition(BB_OO, LastCoreUnblock, O){TagArrayWrite} { + m_addUnlockerToSharers; + dt_deallocateTBE; + j_popIncomingUnblockQueue; + } + + transition(S, CPUWrite, BW_S){TagArrayRead} { + t_allocateTBE; + rC_removeCoreFromSharers; + sT_sendRequestToTCC; + i_popIncomingRequestQueue; + } + + transition(E, CPUWrite, BW_E){TagArrayRead} { + t_allocateTBE; + rC_removeCoreFromSharers; + sT_sendRequestToTCC; + i_popIncomingRequestQueue; + } + + transition(O, CPUWrite, BW_O){TagArrayRead} { + t_allocateTBE; + rCo_removeCoreFromOwner; + rC_removeCoreFromSharers; + sT_sendRequestToTCC; + i_popIncomingRequestQueue; + } + + transition(M, CPUWrite, BW_M){TagArrayRead} { + t_allocateTBE; + rCo_removeCoreFromOwner; + rC_removeCoreFromSharers; + sT_sendRequestToTCC; + i_popIncomingRequestQueue; + } + + transition(BW_S, TCCUnblock_Sharer, S){TagArrayWrite} { + aT_addTCCToSharers; + dt_deallocateTBE; + plu_popTCCUnblockQueue; + } + + transition(BW_S, TCCUnblock_NotValid, S){TagArrayWrite} { + dt_deallocateTBE; + plu_popTCCUnblockQueue; + } + + transition(BW_E, TCCUnblock, E){TagArrayWrite} { + cc_clearSharers; + aT_addTCCToSharers; + dt_deallocateTBE; + plu_popTCCUnblockQueue; + } + + transition(BW_E, TCCUnblock_NotValid, E) { + dt_deallocateTBE; + plu_popTCCUnblockQueue; + } + + transition(BW_M, TCCUnblock, M) { + c_clearOwner; + cc_clearSharers; + eT_ownerIsUnblocker; + dt_deallocateTBE; + plu_popTCCUnblockQueue; + } + + transition(BW_M, TCCUnblock_NotValid, M) { + // Note this transition should only be executed if we received a stale wb + dt_deallocateTBE; + plu_popTCCUnblockQueue; + } + + transition(BW_O, TCCUnblock, O) { + c_clearOwner; + eT_ownerIsUnblocker; + dt_deallocateTBE; + plu_popTCCUnblockQueue; + } + + transition(BW_O, TCCUnblock_NotValid, O) { + // Note this transition should only be executed if we received a stale wb + dt_deallocateTBE; + plu_popTCCUnblockQueue; + } + + // We lost the owner, likely due to an invalidation racing with an 'O' wb + transition(BW_O, 
TCCUnblock_Sharer, S) { + c_clearOwner; + aT_addTCCToSharers; + dt_deallocateTBE; + plu_popTCCUnblockQueue; + } + + transition({BW_M, BW_S, BW_E, BW_O}, {PrbInv,PrbInvData,PrbShrData}) { + ww_recycleProbeRequest; + } + + transition(BRWD_I, {PrbInvData, PrbInv, PrbShrData}) { + ww_recycleProbeRequest; + } + + // Three step process: locally invalidate others, issue CtoD, wait for NB_AckCtoD + transition(S, CtoD, BBS_UM) {TagArrayRead} { + t_allocateTBE; + lpc_probeInvCore; + i2_probeInvL2; + o_checkForAckCompletion; + i_popIncomingRequestQueue; + } + + transition(BBS_UM, CPUPrbResp, BBS_UM) { + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + + transition(BBS_UM, TCCPrbResp) { + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + + transition(BBS_UM, ProbeAcksComplete, S_M) { + rU_rememberUpgrade; + nM_issueRdBlkM; + pt_popTriggerQueue; + } + + // Three step process: locally invalidate others, issue CtoD, wait for NB_AckCtoD + transition(O, CtoD, BBO_UM){TagArrayRead} { + t_allocateTBE; + lpc_probeInvCore; + i2_probeInvL2; + o_checkForAckCompletion; + i_popIncomingRequestQueue; + } + + transition(BBO_UM, CPUPrbResp, BBO_UM) { + ruo_rememberUntransferredOwner; + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + + transition(BBO_UM, TCCPrbResp) { + ruoT_rememberUntransferredOwnerTCC; + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + + transition(BBO_UM, ProbeAcksComplete, O_M) { + rU_rememberUpgrade; + nM_issueRdBlkM; + pt_popTriggerQueue; + } + + transition({S,E}, RdBlkM, BBS_M){TagArrayWrite} { + t_allocateTBE; + ldc_probeInvCoreData; + ld2_probeInvL2Data; + o_checkForAckCompletion; + i_popIncomingRequestQueue; + } + + transition(BBS_M, CPUPrbResp) { + ccr_copyCoreResponseToTBE; + rR_removeResponderFromSharers; + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + + transition(BBS_M, TCCPrbResp) { + ctr_copyTCCResponseToTBE; + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + + transition(BBS_M, ProbeAcksComplete, S_M) { + nM_issueRdBlkM; + pt_popTriggerQueue; + } + + transition(O, RdBlkM, BBO_M){TagArrayRead} { + t_allocateTBE; + ldc_probeInvCoreData; + ld2_probeInvL2Data; + o_checkForAckCompletion; + i_popIncomingRequestQueue; + } + + transition(BBO_M, CPUPrbResp) { + ccr_copyCoreResponseToTBE; + rR_removeResponderFromSharers; + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + + transition(BBO_M, TCCPrbResp) { + ctr_copyTCCResponseToTBE; + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + + transition(BBO_M, ProbeAcksComplete, O_M) { + nM_issueRdBlkM; + pt_popTriggerQueue; + } + + // + transition(M, RdBlkM, BBM_M){TagArrayRead} { + t_allocateTBE; + ldc_probeInvCoreData; + ld2_probeInvL2Data; + i_popIncomingRequestQueue; + } + + transition(BBM_M, CPUPrbResp) { + ccr_copyCoreResponseToTBE; + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + + // TCP recalled block before receiving probe + transition({BBM_M, BBS_M, BBO_M}, {CPUWrite,NoCPUWrite}) { + zz_recycleRequest; + } + + transition(BBM_M, TCCPrbResp) { + ctr_copyTCCResponseToTBE; + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + + transition(BBM_M, ProbeAcksComplete, BB_M) { + sM_sendResponseM; + pt_popTriggerQueue; + } + + transition(BB_M, CoreUnblock, M){TagArrayWrite} { + e_ownerIsUnblocker; + dt_deallocateTBE; + j_popIncomingUnblockQueue; + } + + transition(M, {RdBlkS, RdBlk}, 
BBM_O){TagArrayRead} { + t_allocateTBE; + sc_probeShrCoreData; + s2_probeShrL2Data; + i_popIncomingRequestQueue; + } + + transition(E, {RdBlkS, RdBlk}, BBM_O){TagArrayRead} { + t_allocateTBE; + eto_moveExSharerToOwner; + sc_probeShrCoreData; + s2_probeShrL2Data; + i_popIncomingRequestQueue; + } + + transition(BBM_O, CPUPrbResp) { + ccr_copyCoreResponseToTBE; + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + transition(BBM_O, TCCPrbResp) { + ctr_copyTCCResponseToTBE; + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + transition(BBM_O, ProbeAcksComplete, BB_O) { + sS_sendResponseS; + pt_popTriggerQueue; + } + + transition(BB_O, CoreUnblock, O){TagArrayWrite} { + as_addToSharers; + dt_deallocateTBE; + j_popIncomingUnblockQueue; + } + + transition({BBO_O, BBM_M, BBS_S, BBM_O, BB_M, BB_O, BB_S, BBO_UM, BBS_UM, BBS_M, BBO_M, BB_OO}, {PrbInvData, PrbInv,PrbShrData}) { + ww_recycleProbeRequest; + } + + transition({BBM_O, BBS_S, CP_S, CP_O, CP_SM, CP_OM, BBO_O}, {CPUWrite,NoCPUWrite}) { + zz_recycleRequest; + } + + // stale CtoD raced with external invalidation + transition({I, CP_I, B_I, CP_IOM, CP_ISM, CP_OSIW, BRWD_I, BRW_I, BRD_I}, CtoD) { + i_popIncomingRequestQueue; + } + + // stale CtoD raced with internal RdBlkM + transition({BBM_M, BBS_M, BBO_M, BBB_M, BBS_UM, BBO_UM}, CtoD) { + i_popIncomingRequestQueue; + } + + transition({E, M}, CtoD) { + i_popIncomingRequestQueue; + } + + + // TCC-directory has sent out (and potentially received acks for) probes. + // TCP/SQC replacements (known by this point to be stale) are popped off. + transition({BBO_UM, BBS_UM}, {CPUWrite,NoCPUWrite}) { + nC_sendNullWBAckToCore; + i_popIncomingRequestQueue; + } + + transition(S_M, {NoCPUWrite, CPUWrite}) { + zz_recycleRequest; + } + + transition(O_M, {NoCPUWrite, CPUWrite}) { + zz_recycleRequest; + } + + + transition({BBM_M, BBS_M, BBO_M, BBO_UM, BBS_UM}, {VicDirty, VicClean, VicDirtyLast, NoVic}) { + nT_sendNullWBAckToTCC; + pl_popTCCRequestQueue; + } + + transition({CP_S, CP_O, CP_OM, CP_SM}, {VicDirty, VicClean, VicDirtyLast, CancelWB, NoVic}) { + yy_recycleTCCRequestQueue; + } + + // However, when TCCdir has sent out PrbSharedData, these cannot be ignored. + transition({BBS_S, BBO_O, BBM_O, S_M, O_M, BBB_M, BBB_S, BBB_E}, {VicDirty, VicClean, VicDirtyLast,CancelWB}) { + yy_recycleTCCRequestQueue; + } + + transition({BW_S,BW_E,BW_O, BW_M}, {VicDirty, VicClean, VicDirtyLast, NoVic}) { + yy_recycleTCCRequestQueue; + } + + transition({BW_S,BW_E,BW_O, BW_M}, CancelWB) { + nT_sendNullWBAckToTCC; + pl_popTCCRequestQueue; + } + + + // Recycle if waiting for unblocks. + transition({BB_M,BB_O,BB_S,BB_OO}, {VicDirty, VicClean, VicDirtyLast,NoVic,CancelWB}) { + yy_recycleTCCRequestQueue; + } + + transition({BBS_S, BBO_O}, NoVic) { + rT_removeTCCFromSharers; + nT_sendNullWBAckToTCC; + pl_popTCCRequestQueue; + } + + // Stale. Pop the message and send a dummy ack. 
+ transition({I_S, I_ES, I_M}, {VicDirty, VicClean, VicDirtyLast, NoVic}) { + nT_sendNullWBAckToTCC; + pl_popTCCRequestQueue; + } + + transition(M, VicDirtyLast, VM_I){TagArrayRead} { + tv_allocateTBE; + vd_victim; + pl_popTCCRequestQueue; + } + + transition(E, VicDirty, VM_I){TagArrayRead} { + tv_allocateTBE; + vd_victim; + pl_popTCCRequestQueue; + } + + transition(O, VicDirty, VO_S){TagArrayRead} { + tv_allocateTBE; + vd_victim; + pl_popTCCRequestQueue; + } + + transition(O, {VicDirtyLast, VicClean}, VO_I){TagArrayRead} { + tv_allocateTBE; + vd_victim; + pl_popTCCRequestQueue; + } + + transition({E, S}, VicClean, VES_I){TagArrayRead} { + tv_allocateTBE; + vc_victim; + pl_popTCCRequestQueue; + } + + transition({O, S}, NoVic){TagArrayRead} { + rT_removeTCCFromSharers; + nT_sendNullWBAckToTCC; + pl_popTCCRequestQueue; + } + + transition({O,S}, NoCPUWrite){TagArrayRead} { + rC_removeCoreFromSharers; + nC_sendNullWBAckToCore; + i_popIncomingRequestQueue; + } + + transition({M,E}, NoCPUWrite){TagArrayRead} { + rC_removeCoreFromSharers; + nC_sendNullWBAckToCore; + i_popIncomingRequestQueue; + } + + // This can only happen if it is race. (TCCdir sent out probes which caused this cancel in the first place.) + transition({VM_I, VES_I, VO_I}, CancelWB) { + pl_popTCCRequestQueue; + } + + transition({VM_I, VES_I, VO_I}, NB_AckWB, I){TagArrayWrite} { + c_clearOwner; + cc_clearSharers; + wb_data; + fw2_forwardWBAck; + dt_deallocateTBE; + dd_deallocateDir; + pR_popResponseFromNBQueue; + } + + transition(VO_S, NB_AckWB, S){TagArrayWrite} { + c_clearOwner; + wb_data; + fw2_forwardWBAck; + dt_deallocateTBE; + pR_popResponseFromNBQueue; + } + + transition(I_C, NB_AckWB, I){TagArrayWrite} { + c_clearOwner; + cc_clearSharers; + ss_sendStaleNotification; + fw2_forwardWBAck; + dt_deallocateTBE; + dd_deallocateDir; + pR_popResponseFromNBQueue; + } + + transition(I_W, NB_AckWB, I) { + ss_sendStaleNotification; + dt_deallocateTBE; + dd_deallocateDir; + pR_popResponseFromNBQueue; + } + + + + // Do not handle replacements, reads of any kind or writebacks from transients; recycle + transition({I_M, I_ES, I_S, MO_I, ES_I, S_M, O_M, VES_I, VO_I, VO_S, VM_I, I_C, I_W}, {RdBlkS,RdBlkM,RdBlk,CtoD}) { + zz_recycleRequest; + } + + transition( VO_S, NoCPUWrite) { + zz_recycleRequest; + } + + transition({BW_M, BW_S, BW_O, BW_E}, {RdBlkS,RdBlkM,RdBlk,CtoD,NoCPUWrite, CPUWrite}) { + zz_recycleRequest; + } + + transition({BBB_M, BBB_S, BBB_E, BB_O, BB_M, BB_S, BB_OO}, { RdBlk, RdBlkS, RdBlkM, CPUWrite, NoCPUWrite}) { + zz_recycleRequest; + } + + transition({BBB_S, BBB_E, BB_O, BB_S, BB_OO}, { CtoD}) { + zz_recycleRequest; + } + + transition({BBS_UM, BBO_UM, BBM_M, BBM_O, BBS_M, BBO_M}, { RdBlk, RdBlkS, RdBlkM}) { + zz_recycleRequest; + } + + transition(BBM_O, CtoD) { + zz_recycleRequest; + } + + transition({BBS_S, BBO_O}, {RdBlkM, CtoD}) { + zz_recycleRequest; + } + + transition({B_I, CP_I, CP_S, CP_O, CP_OM, CP_SM, CP_IOM, CP_ISM, CP_OSIW, BRWD_I, BRW_I, BRD_I}, {RdBlk, RdBlkS, RdBlkM}) { + zz_recycleRequest; + } + + transition({CP_O, CP_S, CP_OM}, CtoD) { + zz_recycleRequest; + } + + // Ignore replacement related messages after probe got in. + transition({CP_I, B_I, CP_IOM, CP_ISM, CP_OSIW, BRWD_I, BRW_I, BRD_I}, {CPUWrite, NoCPUWrite}) { + zz_recycleRequest; + } + + // Ignore replacement related messages after probes processed + transition({I, I_S, I_ES, I_M, I_C, I_W}, {CPUWrite,NoCPUWrite}) { + nC_sendNullWBAckToCore; + i_popIncomingRequestQueue; + } + // cannot ignore cancel... 
otherwise TCP/SQC will be stuck in I_C + transition({I, I_S, I_ES, I_M, I_C, I_W, S_M, M, O, E, S}, CPUWriteCancel){TagArrayRead} { + nC_sendNullWBAckToCore; + i_popIncomingRequestQueue; + } + + transition({CP_I, B_I, CP_IOM, CP_ISM, BRWD_I, BRW_I, BRD_I}, {NoVic, VicClean, VicDirty, VicDirtyLast}){ + nT_sendNullWBAckToTCC; + pl_popTCCRequestQueue; + } + + // Handling Probes from NB (General process: (1) propagate up, go to blocking state (2) process acks (3) on last ack downward.) + + // step 1 + transition({M, O, E, S}, PrbInvData, CP_I){TagArrayRead} { + tp_allocateTBE; + dc_probeInvCoreData; + d2_probeInvL2Data; + pp_popProbeQueue; + } + // step 2a + transition(CP_I, CPUPrbResp) { + y_writeDataToTBE; + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + // step 2b + transition(CP_I, TCCPrbResp) { + ty_writeTCCDataToTBE; + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + // step 3 + transition(CP_I, ProbeAcksComplete, I){TagArrayWrite} { + pd_sendProbeResponseData; + c_clearOwner; + cc_clearSharers; + dt_deallocateTBE; + dd_deallocateDir; + pt_popTriggerQueue; + } + + // step 1 + transition({M, O, E, S}, PrbInv, B_I){TagArrayWrite} { + tp_allocateTBE; + ipc_probeInvCore; + i2_probeInvL2; + pp_popProbeQueue; + } + // step 2 + transition(B_I, CPUPrbResp) { + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + // step 2b + transition(B_I, TCCPrbResp) { + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + // step 3 + transition(B_I, ProbeAcksComplete, I){TagArrayWrite} { + // send response down to NB + pi_sendProbeResponseInv; + c_clearOwner; + cc_clearSharers; + dt_deallocateTBE; + dd_deallocateDir; + pt_popTriggerQueue; + } + + + // step 1 + transition({M, O}, PrbShrData, CP_O){TagArrayRead} { + tp_allocateTBE; + sc_probeShrCoreData; + s2_probeShrL2Data; + pp_popProbeQueue; + } + + transition(E, PrbShrData, CP_O){TagArrayRead} { + tp_allocateTBE; + eto_moveExSharerToOwner; + sc_probeShrCoreData; + s2_probeShrL2Data; + pp_popProbeQueue; + } + // step 2 + transition(CP_O, CPUPrbResp) { + y_writeDataToTBE; + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + // step 2b + transition(CP_O, TCCPrbResp) { + ty_writeTCCDataToTBE; + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + // step 3 + transition(CP_O, ProbeAcksComplete, O){TagArrayWrite} { + // send response down to NB + pd_sendProbeResponseData; + dt_deallocateTBE; + pt_popTriggerQueue; + } + + //step 1 + transition(S, PrbShrData, CP_S) { + tp_allocateTBE; + sc_probeShrCoreData; + s2_probeShrL2Data; + pp_popProbeQueue; + } + // step 2 + transition(CP_S, CPUPrbResp) { + y_writeDataToTBE; + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + // step 2b + transition(CP_S, TCCPrbResp) { + ty_writeTCCDataToTBE; + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + // step 3 + transition(CP_S, ProbeAcksComplete, S) { + // send response down to NB + pd_sendProbeResponseData; + dt_deallocateTBE; + pt_popTriggerQueue; + } + + // step 1 + transition(O_M, PrbInvData, CP_IOM) { + dc_probeInvCoreData; + d2_probeInvL2Data; + pp_popProbeQueue; + } + // step 2a + transition(CP_IOM, CPUPrbResp) { + y_writeDataToTBE; + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + // step 2b + transition(CP_IOM, TCCPrbResp) { + ty_writeTCCDataToTBE; + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + // step 3 + transition(CP_IOM, 
ProbeAcksComplete, I_M) { + pdm_sendProbeResponseDataMs; + c_clearOwner; + cc_clearSharers; + cd_clearDirtyBitTBE; + pt_popTriggerQueue; + } + + transition(CP_IOM, ProbeAcksCompleteReissue, I){TagArrayWrite} { + pdm_sendProbeResponseDataMs; + c_clearOwner; + cc_clearSharers; + dt_deallocateTBE; + dd_deallocateDir; + pt_popTriggerQueue; + } + + // step 1 + transition(S_M, PrbInvData, CP_ISM) { + dc_probeInvCoreData; + d2_probeInvL2Data; + o_checkForAckCompletion; + pp_popProbeQueue; + } + // step 2a + transition(CP_ISM, CPUPrbResp) { + y_writeDataToTBE; + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + // step 2b + transition(CP_ISM, TCCPrbResp) { + ty_writeTCCDataToTBE; + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + // step 3 + transition(CP_ISM, ProbeAcksComplete, I_M) { + pdm_sendProbeResponseDataMs; + c_clearOwner; + cc_clearSharers; + cd_clearDirtyBitTBE; + + //dt_deallocateTBE; + pt_popTriggerQueue; + } + transition(CP_ISM, ProbeAcksCompleteReissue, I){TagArrayWrite} { + pim_sendProbeResponseInvMs; + c_clearOwner; + cc_clearSharers; + dt_deallocateTBE; + dd_deallocateDir; + pt_popTriggerQueue; + } + + // step 1 + transition({S_M, O_M}, {PrbInv}, CP_ISM) { + dc_probeInvCoreData; + d2_probeInvL2Data; + pp_popProbeQueue; + } + // next steps inherited from BS_ISM + + // Simpler cases + + transition({I_C, I_W}, {PrbInvData, PrbInv, PrbShrData}) { + pi_sendProbeResponseInv; + pp_popProbeQueue; + } + + //If the directory is certain that the block is not present, one can send an acknowledgement right away. + // No need for three step process. + transition(I, {PrbInv,PrbShrData,PrbInvData}){TagArrayRead} { + pi_sendProbeResponseInv; + pp_popProbeQueue; + } + + transition({I_M, I_ES, I_S}, {PrbInv, PrbInvData}) { + pi_sendProbeResponseInv; + pp_popProbeQueue; + } + + transition({I_M, I_ES, I_S}, PrbShrData) { + prm_sendProbeResponseMiss; + pp_popProbeQueue; + } + + //step 1 + transition(S_M, PrbShrData, CP_SM) { + sc_probeShrCoreData; + s2_probeShrL2Data; + o_checkForAckCompletion; + pp_popProbeQueue; + } + // step 2 + transition(CP_SM, CPUPrbResp) { + y_writeDataToTBE; + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + // step 2b + transition(CP_SM, TCCPrbResp) { + ty_writeTCCDataToTBE; + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + // step 3 + transition(CP_SM, {ProbeAcksComplete,ProbeAcksCompleteReissue}, S_M){DataArrayRead} { + // send response down to NB + pd_sendProbeResponseData; + pt_popTriggerQueue; + } + + //step 1 + transition(O_M, PrbShrData, CP_OM) { + sc_probeShrCoreData; + s2_probeShrL2Data; + pp_popProbeQueue; + } + // step 2 + transition(CP_OM, CPUPrbResp) { + y_writeDataToTBE; + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + // step 2b + transition(CP_OM, TCCPrbResp) { + ty_writeTCCDataToTBE; + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + // step 3 + transition(CP_OM, {ProbeAcksComplete,ProbeAcksCompleteReissue}, O_M) { + // send response down to NB + pd_sendProbeResponseData; + pt_popTriggerQueue; + } + + transition(BRW_I, PrbInvData, I_W) { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + transition({VM_I,VO_I}, PrbInvData, I_C) { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + transition(VES_I, {PrbInvData,PrbInv}, I_C) { + pi_sendProbeResponseInv; + pp_popProbeQueue; + } + + transition({VM_I, VO_I, BRW_I}, PrbInv, I_W) { + pi_sendProbeResponseInv; + pp_popProbeQueue; + } + + 
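The collection steps in these probe flows all funnel responses through y_writeDataToTBE and ty_writeTCCDataToTBE, which apply the same merge rule: a response may overwrite the data staged in the TBE only if the TBE does not already hold dirty data, or if the response itself is dirty, and a Hit additionally records that some cache above still holds the line. A short C++ sketch of that rule follows, with invented names; it is not part of the patch.

    #include <array>
    #include <cstdint>
    #include <cstdio>

    struct ProbeResp {
        std::array<uint8_t, 64> data{};   // one cache line
        bool dirty = false;
        bool hit = false;
    };

    struct TBE {
        std::array<uint8_t, 64> data{};
        bool dirty = false;
        bool cached = false;
    };

    // Mirrors y_writeDataToTBE / ty_writeTCCDataToTBE: clean responses never
    // clobber dirty data already staged in the TBE.
    void mergeProbeResponse(TBE &tbe, const ProbeResp &resp) {
        if (!tbe.dirty || resp.dirty) {
            tbe.data = resp.data;
            tbe.dirty = resp.dirty;
        }
        if (resp.hit)
            tbe.cached = true;            // some cache above still holds the line
    }

    int main() {
        TBE tbe;
        ProbeResp dirtyResp;  dirtyResp.dirty = true;  dirtyResp.hit = true;
        ProbeResp cleanResp;  cleanResp.hit = true;
        mergeProbeResponse(tbe, dirtyResp);
        mergeProbeResponse(tbe, cleanResp);   // data ignored: TBE is already dirty
        std::printf("dirty=%d cached=%d\n", tbe.dirty, tbe.cached);
    }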
transition({VM_I, VO_I, VO_S, VES_I, BRW_I}, PrbShrData) { + pd_sendProbeResponseData; + sf_setSharedFlip; + pp_popProbeQueue; + } + + transition(VO_S, PrbInvData, CP_OSIW) { + dc_probeInvCoreData; + d2_probeInvL2Data; + pp_popProbeQueue; + } + + transition(CP_OSIW, TCCPrbResp) { + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + transition(CP_OSIW, CPUPrbResp) { + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + + transition(CP_OSIW, ProbeAcksComplete, I_C) { + pd_sendProbeResponseData; + cd_clearDirtyBitTBE; + pt_popTriggerQueue; + } + + transition({I, S, E, O, M, CP_O, CP_S, CP_OM, CP_SM, CP_OSIW, BW_S, BW_E, BW_O, BW_M, I_M, I_ES, I_S, BBS_S, BBO_O, BBM_M, BBM_O, BB_M, BB_O, BB_OO, BB_S, BBS_M, BBO_M, BBO_UM, BBS_UM, S_M, O_M, BBB_S, BBB_M, BBB_E, VES_I, VM_I, VO_I, VO_S, ES_I, MO_I, I_C, I_W}, StaleVic) { + nT_sendNullWBAckToTCC; + pl_popTCCRequestQueue; + } + + transition({CP_I, B_I, CP_IOM, CP_ISM, BRWD_I, BRW_I, BRD_I}, StaleVic) { + nT_sendNullWBAckToTCC; + pl_popTCCRequestQueue; + } + + // Recall transitions + // Transient states still require the directory state + transition({M, O}, Recall, BRWD_I) { + tr_allocateTBE; + vd_victim; + dc_probeInvCoreData; + d2_probeInvL2Data; + } + + transition({E, S}, Recall, BRWD_I) { + tr_allocateTBE; + vc_victim; + dc_probeInvCoreData; + d2_probeInvL2Data; + } + + transition(I, Recall) { + dd_deallocateDir; + } + + transition({BRWD_I, BRD_I}, CPUPrbResp) { + y_writeDataToTBE; + x_decrementAcks; + o_checkForAckCompletion; + pk_popResponseQueue; + } + + transition({BRWD_I, BRD_I}, TCCPrbResp) { + ty_writeTCCDataToTBE; + x_decrementAcks; + o_checkForAckCompletion; + plr_popTCCResponseQueue; + } + + transition(BRWD_I, NB_AckWB, BRD_I) { + pR_popResponseFromNBQueue; + } + + transition(BRWD_I, ProbeAcksComplete, BRW_I) { + pt_popTriggerQueue; + } + + transition(BRW_I, NB_AckWB, I) { + wb_data; + dt_deallocateTBE; + dd_deallocateDir; + pR_popResponseFromNBQueue; + } + + transition(BRD_I, ProbeAcksComplete, I) { + wb_data; + dt_deallocateTBE; + dd_deallocateDir; + pt_popTriggerQueue; + } + + // Wait for a stable state for Recall + transition({BRWD_I,BRD_I,BRW_I,CP_O, CP_S, CP_OM, CP_SM, CP_OSIW, BW_S, BW_E, BW_O, BW_M, I_M, I_ES, I_S, BBS_S, BBO_O, BBM_M, BBM_O, BB_M, BB_O, BB_OO, BB_S, BBS_M, BBO_M, BBO_UM, BBS_UM, S_M, O_M, BBB_S, BBB_M, BBB_E, VES_I, VM_I, VO_I, VO_S, ES_I, MO_I, I_C, I_W, CP_I}, Recall) { + zz_recycleRequest; // stall and wait would be for the wrong address + ut_updateTag; // try to find an easier recall + } + +} diff --git a/src/mem/protocol/GPU_RfO-TCP.sm b/src/mem/protocol/GPU_RfO-TCP.sm new file mode 100644 index 000000000..6cf9224a6 --- /dev/null +++ b/src/mem/protocol/GPU_RfO-TCP.sm @@ -0,0 +1,1009 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Lisa Hsu + */ + +machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") + : GPUCoalescer* coalescer; + Sequencer* sequencer; + bool use_seq_not_coal; + CacheMemory * L1cache; + int TCC_select_num_bits; + Cycles issue_latency := 40; // time to send data down to TCC + Cycles l2_hit_latency := 18; + + MessageBuffer * requestFromTCP, network="To", virtual_network="1", vnet_type="request"; + MessageBuffer * responseFromTCP, network="To", virtual_network="3", vnet_type="response"; + MessageBuffer * unblockFromCore, network="To", virtual_network="5", vnet_type="unblock"; + + MessageBuffer * probeToTCP, network="From", virtual_network="1", vnet_type="request"; + MessageBuffer * responseToTCP, network="From", virtual_network="3", vnet_type="response"; + + MessageBuffer * mandatoryQueue; +{ + state_declaration(State, desc="TCP Cache States", default="TCP_State_I") { + I, AccessPermission:Invalid, desc="Invalid"; + S, AccessPermission:Read_Only, desc="Shared"; + E, AccessPermission:Read_Write, desc="Exclusive"; + O, AccessPermission:Read_Only, desc="Owner state in core, both clusters and other cores may be sharing line"; + M, AccessPermission:Read_Write, desc="Modified"; + + I_M, AccessPermission:Busy, desc="Invalid, issued RdBlkM, have not seen response yet"; + I_ES, AccessPermission:Busy, desc="Invalid, issued RdBlk, have not seen response yet"; + S_M, AccessPermission:Read_Only, desc="Shared, issued CtoD, have not seen response yet"; + O_M, AccessPermission:Read_Only, desc="Shared, issued CtoD, have not seen response yet"; + + ES_I, AccessPermission:Read_Only, desc="L1 replacement, waiting for clean WB ack"; + MO_I, AccessPermission:Read_Only, desc="L1 replacement, waiting for dirty WB ack"; + + MO_PI, AccessPermission:Read_Only, desc="L1 downgrade, waiting for CtoD ack (or ProbeInvalidateData)"; + + I_C, AccessPermission:Invalid, desc="Invalid, waiting for WBAck from TCC for canceled WB"; + } + + enumeration(Event, desc="TCP Events") { + // Core initiated + Load, desc="Load"; + Store, desc="Store"; + + // TCC initiated + TCC_AckS, desc="TCC Ack to Core Request"; + TCC_AckE, desc="TCC Ack to Core Request"; + TCC_AckM, desc="TCC Ack to Core Request"; + TCC_AckCtoD, desc="TCC Ack to Core Request"; + TCC_AckWB, desc="TCC Ack for clean WB"; + TCC_NackWB, desc="TCC Nack for clean WB"; + + // Mem sys initiated + Repl, desc="Replacing block from cache"; + + // Probe Events + PrbInvData, desc="probe, return O or M data"; + PrbInv, desc="probe, no need for data"; + LocalPrbInv, desc="local probe, no need for 
data"; + PrbShrData, desc="probe downgrade, return O or M data"; + } + + enumeration(RequestType, desc="To communicate stats from transitions to recordStats") { + DataArrayRead, desc="Read the data array"; + DataArrayWrite, desc="Write the data array"; + TagArrayRead, desc="Read the data array"; + TagArrayWrite, desc="Write the data array"; + } + + + structure(Entry, desc="...", interface="AbstractCacheEntry") { + State CacheState, desc="cache state"; + bool Dirty, desc="Is the data dirty (diff than memory)?"; + DataBlock DataBlk, desc="data for the block"; + bool FromL2, default="false", desc="block just moved from L2"; + } + + structure(TBE, desc="...") { + State TBEState, desc="Transient state"; + DataBlock DataBlk, desc="data for the block, required for concurrent writebacks"; + bool Dirty, desc="Is the data dirty (different than memory)?"; + int NumPendingMsgs, desc="Number of acks/data messages that this processor is waiting for"; + bool Shared, desc="Victim hit by shared probe"; + } + + structure(TBETable, external="yes") { + TBE lookup(Addr); + void allocate(Addr); + void deallocate(Addr); + bool isPresent(Addr); + } + + TBETable TBEs, template="<TCP_TBE>", constructor="m_number_of_TBEs"; + int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()"; + + Tick clockEdge(); + Tick cyclesToTicks(Cycles c); + + void set_cache_entry(AbstractCacheEntry b); + void unset_cache_entry(); + void set_tbe(TBE b); + void unset_tbe(); + void wakeUpAllBuffers(); + void wakeUpBuffers(Addr a); + Cycles curCycle(); + + // Internal functions + Entry getCacheEntry(Addr address), return_by_pointer="yes" { + Entry cache_entry := static_cast(Entry, "pointer", L1cache.lookup(address)); + return cache_entry; + } + + DataBlock getDataBlock(Addr addr), return_by_ref="yes" { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return tbe.DataBlk; + } else { + return getCacheEntry(addr).DataBlk; + } + } + + State getState(TBE tbe, Entry cache_entry, Addr addr) { + if(is_valid(tbe)) { + return tbe.TBEState; + } else if (is_valid(cache_entry)) { + return cache_entry.CacheState; + } + return State:I; + } + + void setState(TBE tbe, Entry cache_entry, Addr addr, State state) { + if (is_valid(tbe)) { + tbe.TBEState := state; + } + + if (is_valid(cache_entry)) { + cache_entry.CacheState := state; + } + } + + AccessPermission getAccessPermission(Addr addr) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return TCP_State_to_permission(tbe.TBEState); + } + + Entry cache_entry := getCacheEntry(addr); + if(is_valid(cache_entry)) { + return TCP_State_to_permission(cache_entry.CacheState); + } + + return AccessPermission:NotPresent; + } + + bool isValid(Addr addr) { + AccessPermission perm := getAccessPermission(addr); + if (perm == AccessPermission:NotPresent || + perm == AccessPermission:Invalid || + perm == AccessPermission:Busy) { + return false; + } else { + return true; + } + } + + void setAccessPermission(Entry cache_entry, Addr addr, State state) { + if (is_valid(cache_entry)) { + cache_entry.changePermission(TCP_State_to_permission(state)); + } + } + + void functionalRead(Addr addr, Packet *pkt) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + testAndRead(addr, tbe.DataBlk, pkt); + } else { + functionalMemoryRead(pkt); + } + } + + int functionalWrite(Addr addr, Packet *pkt) { + int num_functional_writes := 0; + + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + num_functional_writes := num_functional_writes + + testAndWrite(addr, tbe.DataBlk, pkt); + } + + num_functional_writes := 
num_functional_writes + functionalMemoryWrite(pkt); + return num_functional_writes; + } + + void recordRequestType(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + L1cache.recordRequestType(CacheRequestType:DataArrayRead, addr); + } else if (request_type == RequestType:DataArrayWrite) { + L1cache.recordRequestType(CacheRequestType:DataArrayWrite, addr); + } else if (request_type == RequestType:TagArrayRead) { + L1cache.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:TagArrayWrite) { + L1cache.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } + } + + bool checkResourceAvailable(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + return L1cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:DataArrayWrite) { + return L1cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:TagArrayRead) { + return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:TagArrayWrite) { + return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else { + error("Invalid RequestType type in checkResourceAvailable"); + return true; + } + } + + MachineType getCoherenceType(MachineID myMachID, + MachineID senderMachID) { + if(myMachID == senderMachID) { + return MachineType:TCP; + } else if(machineIDToMachineType(senderMachID) == MachineType:TCP) { + return MachineType:L1Cache_wCC; + } else if(machineIDToMachineType(senderMachID) == MachineType:TCC) { + return MachineType:TCC; + } else { + return MachineType:TCCdir; + } + } + + // Out Ports + + out_port(requestNetwork_out, CPURequestMsg, requestFromTCP); + out_port(responseNetwork_out, ResponseMsg, responseFromTCP); + out_port(unblockNetwork_out, UnblockMsg, unblockFromCore); + + // In Ports + + in_port(probeNetwork_in, TDProbeRequestMsg, probeToTCP) { + if (probeNetwork_in.isReady(clockEdge())) { + peek(probeNetwork_in, TDProbeRequestMsg, block_on="addr") { + DPRINTF(RubySlicc, "%s\n", in_msg); + DPRINTF(RubySlicc, "machineID: %s\n", machineID); + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + + if (in_msg.Type == ProbeRequestType:PrbInv) { + if (in_msg.ReturnData) { + trigger(Event:PrbInvData, in_msg.addr, cache_entry, tbe); + } else { + if(in_msg.localCtoD) { + trigger(Event:LocalPrbInv, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe); + } + } + } else if (in_msg.Type == ProbeRequestType:PrbDowngrade) { + assert(in_msg.ReturnData); + trigger(Event:PrbShrData, in_msg.addr, cache_entry, tbe); + } + } + } + } + + in_port(responseToTCP_in, ResponseMsg, responseToTCP) { + if (responseToTCP_in.isReady(clockEdge())) { + peek(responseToTCP_in, ResponseMsg, block_on="addr") { + + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + + if (in_msg.Type == CoherenceResponseType:TDSysResp) { + if (in_msg.State == CoherenceState:Modified) { + if (in_msg.CtoD) { + trigger(Event:TCC_AckCtoD, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:TCC_AckM, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.State == CoherenceState:Shared) { + trigger(Event:TCC_AckS, in_msg.addr, cache_entry, tbe); + } else if (in_msg.State == CoherenceState:Exclusive) { + trigger(Event:TCC_AckE, in_msg.addr, cache_entry, tbe); + } + } else if 
(in_msg.Type == CoherenceResponseType:TDSysWBAck) { + trigger(Event:TCC_AckWB, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceResponseType:TDSysWBNack) { + trigger(Event:TCC_NackWB, in_msg.addr, cache_entry, tbe); + } else { + error("Unexpected Response Message to Core"); + } + } + } + } + + in_port(mandatoryQueue_in, RubyRequest, mandatoryQueue, desc="...") { + if (mandatoryQueue_in.isReady(clockEdge())) { + peek(mandatoryQueue_in, RubyRequest, block_on="LineAddress") { + Entry cache_entry := getCacheEntry(in_msg.LineAddress); + TBE tbe := TBEs.lookup(in_msg.LineAddress); + DPRINTF(RubySlicc, "%s\n", in_msg); + if (in_msg.Type == RubyRequestType:LD) { + if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) { + trigger(Event:Load, in_msg.LineAddress, cache_entry, tbe); + } else { + Addr victim := L1cache.cacheProbe(in_msg.LineAddress); + trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } else { + if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) { + trigger(Event:Store, in_msg.LineAddress, cache_entry, tbe); + } else { + Addr victim := L1cache.cacheProbe(in_msg.LineAddress); + trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } + } + } + } + + // Actions + + action(ic_invCache, "ic", desc="invalidate cache") { + if(is_valid(cache_entry)) { + L1cache.deallocate(address); + } + unset_cache_entry(); + } + + action(n_issueRdBlk, "n", desc="Issue RdBlk") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:RdBlk; + out_msg.Requestor := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := curCycle(); + } + } + + action(nM_issueRdBlkM, "nM", desc="Issue RdBlkM") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:RdBlkM; + out_msg.Requestor := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := curCycle(); + } + } + + action(vd_victim, "vd", desc="Victimize M/O Data") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + assert(is_valid(cache_entry)); + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.Type := CoherenceRequestType:VicDirty; + out_msg.InitialRequestTime := curCycle(); + if (cache_entry.CacheState == State:O) { + out_msg.Shared := true; + } else { + out_msg.Shared := false; + } + out_msg.Dirty := cache_entry.Dirty; + } + } + + action(vc_victim, "vc", desc="Victimize E/S Data") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.Type := CoherenceRequestType:VicClean; + out_msg.InitialRequestTime := curCycle(); + if (cache_entry.CacheState == State:S) { + out_msg.Shared := true; + } else { + 
out_msg.Shared := false; + } + } + } + + action(a_allocate, "a", desc="allocate block") { + if (is_invalid(cache_entry)) { + set_cache_entry(L1cache.allocate(address, new Entry)); + } + } + + action(t_allocateTBE, "t", desc="allocate TBE Entry") { + check_allocate(TBEs); + assert(is_valid(cache_entry)); + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + tbe.DataBlk := cache_entry.DataBlk; // Data only used for WBs + tbe.Dirty := cache_entry.Dirty; + tbe.Shared := false; + } + + action(d_deallocateTBE, "d", desc="Deallocate TBE") { + TBEs.deallocate(address); + unset_tbe(); + } + + action(p_popMandatoryQueue, "pm", desc="Pop Mandatory Queue") { + mandatoryQueue_in.dequeue(clockEdge()); + } + + action(pr_popResponseQueue, "pr", desc="Pop Response Queue") { + responseToTCP_in.dequeue(clockEdge()); + } + + action(pp_popProbeQueue, "pp", desc="pop probe queue") { + probeNetwork_in.dequeue(clockEdge()); + } + + action(l_loadDone, "l", desc="local load done") { + assert(is_valid(cache_entry)); + if (use_seq_not_coal) { + sequencer.readCallback(address, cache_entry.DataBlk, + false, MachineType:TCP); + } else { + coalescer.readCallback(address, MachineType:TCP, cache_entry.DataBlk); + } + } + + action(xl_loadDone, "xl", desc="remote load done") { + peek(responseToTCP_in, ResponseMsg) { + assert(is_valid(cache_entry)); + if (use_seq_not_coal) { + coalescer.recordCPReadCallBack(machineID, in_msg.Sender); + sequencer.readCallback(address, + cache_entry.DataBlk, + false, + machineIDToMachineType(in_msg.Sender), + in_msg.InitialRequestTime, + in_msg.ForwardRequestTime, + in_msg.ProbeRequestStartTime); + } else { + MachineType cc_mach_type := getCoherenceType(machineID, + in_msg.Sender); + coalescer.readCallback(address, + cc_mach_type, + cache_entry.DataBlk, + in_msg.InitialRequestTime, + in_msg.ForwardRequestTime, + in_msg.ProbeRequestStartTime); + } + } + } + + action(s_storeDone, "s", desc="local store done") { + assert(is_valid(cache_entry)); + if (use_seq_not_coal) { + coalescer.recordCPWriteCallBack(machineID, machineID); + sequencer.writeCallback(address, cache_entry.DataBlk, + false, MachineType:TCP); + } else { + coalescer.writeCallback(address, MachineType:TCP, cache_entry.DataBlk); + } + cache_entry.Dirty := true; + } + + action(xs_storeDone, "xs", desc="remote store done") { + peek(responseToTCP_in, ResponseMsg) { + assert(is_valid(cache_entry)); + if (use_seq_not_coal) { + coalescer.recordCPWriteCallBack(machineID, in_msg.Sender); + sequencer.writeCallback(address, + cache_entry.DataBlk, + false, + machineIDToMachineType(in_msg.Sender), + in_msg.InitialRequestTime, + in_msg.ForwardRequestTime, + in_msg.ProbeRequestStartTime); + } else { + MachineType cc_mach_type := getCoherenceType(machineID, + in_msg.Sender); + coalescer.writeCallback(address, + cc_mach_type, + cache_entry.DataBlk, + in_msg.InitialRequestTime, + in_msg.ForwardRequestTime, + in_msg.ProbeRequestStartTime); + } + cache_entry.Dirty := true; + } + } + + action(w_writeCache, "w", desc="write data to cache") { + peek(responseToTCP_in, ResponseMsg) { + assert(is_valid(cache_entry)); + cache_entry.DataBlk := in_msg.DataBlk; + cache_entry.Dirty := in_msg.Dirty; + } + } + + action(ss_sendStaleNotification, "ss", desc="stale data; nothing to writeback") { + peek(responseToTCP_in, ResponseMsg) { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:StaleNotif; + out_msg.Sender := machineID; + 
out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Response_Control; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + action(wb_data, "wb", desc="write back data") { + peek(responseToTCP_in, ResponseMsg) { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUData; + out_msg.Sender := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.DataBlk := tbe.DataBlk; + out_msg.Dirty := tbe.Dirty; + if (tbe.Shared) { + out_msg.NbReqShared := true; + } else { + out_msg.NbReqShared := false; + } + out_msg.State := CoherenceState:Shared; // faux info + out_msg.MessageSize := MessageSizeType:Writeback_Data; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + action(piu_sendProbeResponseInvUntransferredOwnership, "piu", desc="send probe ack inv, no data, retain ownership") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // TCC, L3 respond in same way to probes + out_msg.Sender := machineID; + // will this always be ok? probably not for multisocket + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.Dirty := false; + out_msg.Hit := false; + out_msg.Ntsl := true; + out_msg.State := CoherenceState:NA; + out_msg.UntransferredOwner :=true; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // TCC, L3 respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.Dirty := false; + out_msg.Hit := false; + out_msg.Ntsl := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + out_msg.isValid := isValid(address); + } + } + + action(pim_sendProbeResponseInvMs, "pim", desc="send probe ack inv, no data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and TCC respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.Dirty := false; + out_msg.Ntsl := true; + out_msg.Hit := false; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + out_msg.isValid := isValid(address); + } + } + + action(prm_sendProbeResponseMiss, "prm", desc="send probe ack PrbShrData, no data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and TCC respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.Dirty := false; // only true if sending back data i think + out_msg.Hit := false; + out_msg.Ntsl := false; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + 
out_msg.isValid := isValid(address); + } + } + + action(pd_sendProbeResponseData, "pd", desc="send probe ack, with data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + assert(is_valid(cache_entry) || is_valid(tbe)); + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.DataBlk := getDataBlock(address); + if (is_valid(tbe)) { + out_msg.Dirty := tbe.Dirty; + } else { + out_msg.Dirty := cache_entry.Dirty; + } + out_msg.Hit := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.isValid := isValid(address); + APPEND_TRANSITION_COMMENT("Sending ack with dirty "); + APPEND_TRANSITION_COMMENT(out_msg.Dirty); + } + } + + action(pdm_sendProbeResponseDataMs, "pdm", desc="send probe ack, with data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + assert(is_valid(cache_entry) || is_valid(tbe)); + assert(is_valid(cache_entry)); + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.DataBlk := getDataBlock(address); + if (is_valid(tbe)) { + out_msg.Dirty := tbe.Dirty; + } else { + out_msg.Dirty := cache_entry.Dirty; + } + out_msg.Hit := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.isValid := isValid(address); + APPEND_TRANSITION_COMMENT("Sending ack with dirty "); + APPEND_TRANSITION_COMMENT(out_msg.Dirty); + DPRINTF(RubySlicc, "Data is %s\n", out_msg.DataBlk); + } + } + + action(sf_setSharedFlip, "sf", desc="hit by shared probe, status may be different") { + assert(is_valid(tbe)); + tbe.Shared := true; + } + + action(mru_updateMRU, "mru", desc="Touch block for replacement policy") { + L1cache.setMRU(address); + } + + action(uu_sendUnblock, "uu", desc="state changed, unblock") { + enqueue(unblockNetwork_out, UnblockMsg, issue_latency) { + out_msg.addr := address; + out_msg.Sender := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Unblock_Control; + out_msg.wasValid := isValid(address); + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(yy_recycleProbeQueue, "yy", desc="recycle probe queue") { + probeNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(zz_recycleMandatoryQueue, "\z", desc="recycle mandatory queue") { + mandatoryQueue_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + // Transitions + + // transitions from base + transition(I, Load, I_ES) {TagArrayRead} { + a_allocate; + n_issueRdBlk; + p_popMandatoryQueue; + } + + transition(I, Store, I_M) {TagArrayRead, TagArrayWrite} { + a_allocate; + nM_issueRdBlkM; + p_popMandatoryQueue; + } + + transition(S, Store, S_M) {TagArrayRead} { + mru_updateMRU; + nM_issueRdBlkM; + p_popMandatoryQueue; + } + + transition(E, Store, M) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + mru_updateMRU; + s_storeDone; + p_popMandatoryQueue; + } + + transition(O, Store, O_M) {TagArrayRead, DataArrayWrite} { + mru_updateMRU; + nM_issueRdBlkM; + p_popMandatoryQueue; + } + + transition(M, Store) {TagArrayRead, DataArrayWrite} { + mru_updateMRU; + s_storeDone; + p_popMandatoryQueue; + } + + 
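The transitions above define the request side of the TCP (the GPU L1 data cache): which transient state a Load or Store enters from each stable MOESI state and which request, if any, it sends toward the TCC directory. The table-driven Python sketch below is a simplified restatement of just that mapping, under the assumption that hits in S/E/O/M need no request; it is illustrative only and not derived mechanically from the SLICC.

```python
# Minimal sketch of the TCP's request side shown in the transitions above:
# which request (if any) a Load or Store issues from each stable MOESI
# state, and which transient state it enters. Simplified illustration.

REQUEST_SIDE = {
    # (state, op) : (next_state, request issued)
    ("I", "Load"):  ("I_ES", "RdBlk"),
    ("I", "Store"): ("I_M",  "RdBlkM"),
    ("S", "Store"): ("S_M",  "RdBlkM"),
    ("O", "Store"): ("O_M",  "RdBlkM"),
    ("E", "Store"): ("M",    None),   # silent upgrade, store completes locally
    ("M", "Store"): ("M",    None),   # hit, store completes locally
    ("S", "Load"):  ("S",    None),   # load hits in S/E/O/M need no request
    ("E", "Load"):  ("E",    None),
    ("O", "Load"):  ("O",    None),
    ("M", "Load"):  ("M",    None),
}


def issue(state, op):
    """Return (next_state, request) for a core access, per the table above."""
    return REQUEST_SIDE[(state, op)]


if __name__ == "__main__":
    assert issue("I", "Store") == ("I_M", "RdBlkM")
    assert issue("E", "Store") == ("M", None)
```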
// simple hit transitions + transition({S, E, O, M}, Load) {TagArrayRead, DataArrayRead} { + l_loadDone; + mru_updateMRU; + p_popMandatoryQueue; + } + + // recycles from transients + transition({I_M, I_ES, ES_I, MO_I, S_M, O_M, MO_PI, I_C}, {Load, Store, Repl}) {} { + zz_recycleMandatoryQueue; + } + + transition({S, E}, Repl, ES_I) {TagArrayRead} { + t_allocateTBE; + vc_victim; + ic_invCache; + } + + transition({O, M}, Repl, MO_I) {TagArrayRead, DataArrayRead} { + t_allocateTBE; + vd_victim; + ic_invCache; + } + + // TD event transitions + transition(I_M, {TCC_AckM, TCC_AckCtoD}, M) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + w_writeCache; + xs_storeDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(I_ES, TCC_AckS, S) {TagArrayWrite, DataArrayWrite} { + w_writeCache; + xl_loadDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(I_ES, TCC_AckE, E) {TagArrayWrite, DataArrayWrite} { + w_writeCache; + xl_loadDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition({S_M, O_M}, TCC_AckM, M) {TagArrayWrite, DataArrayWrite} { + xs_storeDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition({MO_I, ES_I}, TCC_NackWB, I){TagArrayWrite} { + d_deallocateTBE; + pr_popResponseQueue; + } + + transition({MO_I, ES_I}, TCC_AckWB, I) {TagArrayWrite, DataArrayRead} { + wb_data; + d_deallocateTBE; + pr_popResponseQueue; + } + + transition(I_C, TCC_AckWB, I) {TagArrayWrite} { + ss_sendStaleNotification; + d_deallocateTBE; + pr_popResponseQueue; + } + + transition(I_C, TCC_NackWB, I) {TagArrayWrite} { + d_deallocateTBE; + pr_popResponseQueue; + } + + // Probe transitions + transition({M, O}, PrbInvData, I) {TagArrayRead, TagArrayWrite} { + pd_sendProbeResponseData; + ic_invCache; + pp_popProbeQueue; + } + + transition(I, PrbInvData) {TagArrayRead, TagArrayWrite} { + prm_sendProbeResponseMiss; + pp_popProbeQueue; + } + + transition({E, S}, PrbInvData, I) {TagArrayRead, TagArrayWrite} { + pd_sendProbeResponseData; + ic_invCache; + pp_popProbeQueue; + } + + transition(I_C, PrbInvData, I_C) {} { + pi_sendProbeResponseInv; + ic_invCache; + pp_popProbeQueue; + } + + // Needed for TCC-based protocols. Must hold on to ownership till transfer complete + transition({M, O}, LocalPrbInv, MO_PI){TagArrayRead, TagArrayWrite} { + piu_sendProbeResponseInvUntransferredOwnership; + pp_popProbeQueue; + } + + // If there is a race and we see a probe invalidate, handle normally. 
+ transition(MO_PI, PrbInvData, I){TagArrayWrite} { + pd_sendProbeResponseData; + ic_invCache; + pp_popProbeQueue; + } + + transition(MO_PI, PrbInv, I){TagArrayWrite} { + pi_sendProbeResponseInv; + ic_invCache; + pp_popProbeQueue; + } + + // normal exit when ownership is successfully transferred + transition(MO_PI, TCC_AckCtoD, I) {TagArrayWrite} { + ic_invCache; + pr_popResponseQueue; + } + + transition({M, O, E, S, I}, PrbInv, I) {TagArrayRead, TagArrayWrite} { + pi_sendProbeResponseInv; + ic_invCache; + pp_popProbeQueue; + } + + transition({E, S, I}, LocalPrbInv, I){TagArrayRead, TagArrayWrite} { + pi_sendProbeResponseInv; + ic_invCache; + pp_popProbeQueue; + } + + + transition({M, E, O}, PrbShrData, O) {TagArrayRead, TagArrayWrite, DataArrayRead} { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + transition(MO_PI, PrbShrData) {DataArrayRead} { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + + transition(S, PrbShrData, S) {TagArrayRead, DataArrayRead} { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + transition({I, I_C}, PrbShrData) {TagArrayRead} { + prm_sendProbeResponseMiss; + pp_popProbeQueue; + } + + transition(I_C, PrbInv, I_C) {} { + pi_sendProbeResponseInv; + ic_invCache; + pp_popProbeQueue; + } + + transition({I_M, I_ES}, {PrbInv, PrbInvData}){TagArrayRead} { + pi_sendProbeResponseInv; + ic_invCache; + a_allocate; // but make sure there is room for incoming data when it arrives + pp_popProbeQueue; + } + + transition({I_M, I_ES}, PrbShrData) {} { + prm_sendProbeResponseMiss; + pp_popProbeQueue; + } + + transition(S_M, PrbInvData, I_M) {TagArrayRead} { + pim_sendProbeResponseInvMs; + ic_invCache; + a_allocate; + pp_popProbeQueue; + } + + transition(O_M, PrbInvData, I_M) {TagArrayRead,DataArrayRead} { + pdm_sendProbeResponseDataMs; + ic_invCache; + a_allocate; + pp_popProbeQueue; + } + + transition({S_M, O_M}, {PrbInv}, I_M) {TagArrayRead} { + pim_sendProbeResponseInvMs; + ic_invCache; + a_allocate; + pp_popProbeQueue; + } + + transition(S_M, {LocalPrbInv}, I_M) {TagArrayRead} { + pim_sendProbeResponseInvMs; + ic_invCache; + a_allocate; + pp_popProbeQueue; + } + + transition(O_M, LocalPrbInv, I_M) {TagArrayRead} { + piu_sendProbeResponseInvUntransferredOwnership; + ic_invCache; + a_allocate; + pp_popProbeQueue; + } + + transition({S_M, O_M}, PrbShrData) {DataArrayRead} { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + transition(ES_I, PrbInvData, I_C){ + pd_sendProbeResponseData; + ic_invCache; + pp_popProbeQueue; + } + + transition(MO_I, PrbInvData, I_C) {DataArrayRead} { + pd_sendProbeResponseData; + ic_invCache; + pp_popProbeQueue; + } + + transition(MO_I, PrbInv, I_C) { + pi_sendProbeResponseInv; + ic_invCache; + pp_popProbeQueue; + } + + transition(ES_I, PrbInv, I_C) { + pi_sendProbeResponseInv; + ic_invCache; + pp_popProbeQueue; + } + + transition(ES_I, PrbShrData, ES_I) {DataArrayRead} { + pd_sendProbeResponseData; + sf_setSharedFlip; + pp_popProbeQueue; + } + + transition(MO_I, PrbShrData, MO_I) {DataArrayRead} { + pd_sendProbeResponseData; + sf_setSharedFlip; + pp_popProbeQueue; + } + +} diff --git a/src/mem/protocol/GPU_RfO.slicc b/src/mem/protocol/GPU_RfO.slicc new file mode 100644 index 000000000..7773ce6e0 --- /dev/null +++ b/src/mem/protocol/GPU_RfO.slicc @@ -0,0 +1,11 @@ +protocol "GPU_AMD_Base"; +include "RubySlicc_interfaces.slicc"; +include "MOESI_AMD_Base-msg.sm"; +include "MOESI_AMD_Base-dir.sm"; +include "MOESI_AMD_Base-CorePair.sm"; +include "GPU_RfO-TCP.sm"; +include "GPU_RfO-SQC.sm"; +include "GPU_RfO-TCC.sm"; +include 
"GPU_RfO-TCCdir.sm"; +include "MOESI_AMD_Base-L3cache.sm"; +include "MOESI_AMD_Base-RegionBuffer.sm"; diff --git a/src/mem/protocol/GPU_VIPER-SQC.sm b/src/mem/protocol/GPU_VIPER-SQC.sm new file mode 100644 index 000000000..8d5b5699a --- /dev/null +++ b/src/mem/protocol/GPU_VIPER-SQC.sm @@ -0,0 +1,322 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Blake Hechtman + */ + +machine(MachineType:SQC, "GPU SQC (L1 I Cache)") + : Sequencer* sequencer; + CacheMemory * L1cache; + int TCC_select_num_bits; + Cycles issue_latency := 80; // time to send data down to TCC + Cycles l2_hit_latency := 18; // for 1MB L2, 20 for 2MB + + MessageBuffer * requestFromSQC, network="To", virtual_network="1", vnet_type="request"; + + MessageBuffer * probeToSQC, network="From", virtual_network="1", vnet_type="request"; + MessageBuffer * responseToSQC, network="From", virtual_network="3", vnet_type="response"; + + MessageBuffer * mandatoryQueue; +{ + state_declaration(State, desc="SQC Cache States", default="SQC_State_I") { + I, AccessPermission:Invalid, desc="Invalid"; + V, AccessPermission:Read_Only, desc="Valid"; + } + + enumeration(Event, desc="SQC Events") { + // Core initiated + Fetch, desc="Fetch"; + // Mem sys initiated + Repl, desc="Replacing block from cache"; + Data, desc="Received Data"; + } + + enumeration(RequestType, desc="To communicate stats from transitions to recordStats") { + DataArrayRead, desc="Read the data array"; + DataArrayWrite, desc="Write the data array"; + TagArrayRead, desc="Read the data array"; + TagArrayWrite, desc="Write the data array"; + } + + + structure(Entry, desc="...", interface="AbstractCacheEntry") { + State CacheState, desc="cache state"; + bool Dirty, desc="Is the data dirty (diff than memory)?"; + DataBlock DataBlk, desc="data for the block"; + bool FromL2, default="false", desc="block just moved from L2"; + } + + structure(TBE, desc="...") { + State TBEState, desc="Transient state"; + DataBlock DataBlk, desc="data for the block, required for concurrent writebacks"; + bool Dirty, desc="Is the data dirty (different than memory)?"; + int NumPendingMsgs, desc="Number of acks/data messages that this processor is waiting for"; + bool Shared, desc="Victim hit by shared probe"; + } + + structure(TBETable, external="yes") { + TBE lookup(Addr); + void allocate(Addr); + void deallocate(Addr); + bool isPresent(Addr); + } + + TBETable TBEs, template="<SQC_TBE>", constructor="m_number_of_TBEs"; + int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()"; + + void set_cache_entry(AbstractCacheEntry b); + void unset_cache_entry(); + void set_tbe(TBE b); + void unset_tbe(); + void wakeUpAllBuffers(); + void wakeUpBuffers(Addr a); + Cycles curCycle(); + + // Internal functions + Tick clockEdge(); + + Entry getCacheEntry(Addr address), return_by_pointer="yes" { + Entry cache_entry := static_cast(Entry, "pointer", L1cache.lookup(address)); + return cache_entry; + } + + DataBlock getDataBlock(Addr addr), return_by_ref="yes" { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return tbe.DataBlk; + } else { + return getCacheEntry(addr).DataBlk; + } + } + + State getState(TBE tbe, Entry cache_entry, Addr addr) { + if(is_valid(tbe)) { + return tbe.TBEState; + } else if (is_valid(cache_entry)) { + return cache_entry.CacheState; + } + return State:I; + } + + void setState(TBE tbe, Entry cache_entry, Addr addr, State state) { + if (is_valid(tbe)) { + tbe.TBEState := state; + } + + if (is_valid(cache_entry)) { + cache_entry.CacheState := state; + } + } + + void functionalRead(Addr addr, Packet *pkt) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + testAndRead(addr, tbe.DataBlk, pkt); + } else { + functionalMemoryRead(pkt); + } + } + + int functionalWrite(Addr addr, Packet *pkt) { + int num_functional_writes := 0; + + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + num_functional_writes := 
num_functional_writes + + testAndWrite(addr, tbe.DataBlk, pkt); + } + + num_functional_writes := num_functional_writes + + functionalMemoryWrite(pkt); + return num_functional_writes; + } + + AccessPermission getAccessPermission(Addr addr) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return SQC_State_to_permission(tbe.TBEState); + } + + Entry cache_entry := getCacheEntry(addr); + if(is_valid(cache_entry)) { + return SQC_State_to_permission(cache_entry.CacheState); + } + + return AccessPermission:NotPresent; + } + + void setAccessPermission(Entry cache_entry, Addr addr, State state) { + if (is_valid(cache_entry)) { + cache_entry.changePermission(SQC_State_to_permission(state)); + } + } + + void recordRequestType(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + L1cache.recordRequestType(CacheRequestType:DataArrayRead, addr); + } else if (request_type == RequestType:DataArrayWrite) { + L1cache.recordRequestType(CacheRequestType:DataArrayWrite, addr); + } else if (request_type == RequestType:TagArrayRead) { + L1cache.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:TagArrayWrite) { + L1cache.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } + } + + bool checkResourceAvailable(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + return L1cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:DataArrayWrite) { + return L1cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:TagArrayRead) { + return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:TagArrayWrite) { + return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else { + error("Invalid RequestType type in checkResourceAvailable"); + return true; + } + } + + // Out Ports + + out_port(requestNetwork_out, CPURequestMsg, requestFromSQC); + + // In Ports + + in_port(responseToSQC_in, ResponseMsg, responseToSQC) { + if (responseToSQC_in.isReady(clockEdge())) { + peek(responseToSQC_in, ResponseMsg, block_on="addr") { + + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + + if (in_msg.Type == CoherenceResponseType:TDSysResp) { + if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.addr)) { + trigger(Event:Data, in_msg.addr, cache_entry, tbe); + } else { + Addr victim := L1cache.cacheProbe(in_msg.addr); + trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } else { + error("Unexpected Response Message to Core"); + } + } + } + } + + in_port(mandatoryQueue_in, RubyRequest, mandatoryQueue, desc="...") { + if (mandatoryQueue_in.isReady(clockEdge())) { + peek(mandatoryQueue_in, RubyRequest, block_on="LineAddress") { + Entry cache_entry := getCacheEntry(in_msg.LineAddress); + TBE tbe := TBEs.lookup(in_msg.LineAddress); + + assert(in_msg.Type == RubyRequestType:IFETCH); + trigger(Event:Fetch, in_msg.LineAddress, cache_entry, tbe); + } + } + } + + // Actions + + action(ic_invCache, "ic", desc="invalidate cache") { + if(is_valid(cache_entry)) { + L1cache.deallocate(address); + } + unset_cache_entry(); + } + + action(nS_issueRdBlkS, "nS", desc="Issue RdBlkS") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:RdBlk; + out_msg.Requestor := machineID; + 
out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := curCycle(); + } + } + + action(a_allocate, "a", desc="allocate block") { + if (is_invalid(cache_entry)) { + set_cache_entry(L1cache.allocate(address, new Entry)); + } + } + + action(p_popMandatoryQueue, "pm", desc="Pop Mandatory Queue") { + mandatoryQueue_in.dequeue(clockEdge()); + } + + action(pr_popResponseQueue, "pr", desc="Pop Response Queue") { + responseToSQC_in.dequeue(clockEdge()); + } + + action(l_loadDone, "l", desc="local load done") { + assert(is_valid(cache_entry)); + sequencer.readCallback(address, cache_entry.DataBlk, false, MachineType:L1Cache); + APPEND_TRANSITION_COMMENT(cache_entry.DataBlk); + } + + action(w_writeCache, "w", desc="write data to cache") { + peek(responseToSQC_in, ResponseMsg) { + assert(is_valid(cache_entry)); + cache_entry.DataBlk := in_msg.DataBlk; + cache_entry.Dirty := false; + } + } + + // Transitions + + // transitions from base + transition({I, V}, Repl, I) {TagArrayRead, TagArrayWrite} { + ic_invCache + } + + transition(I, Data, V) {TagArrayRead, TagArrayWrite, DataArrayRead} { + a_allocate; + w_writeCache + l_loadDone; + pr_popResponseQueue; + } + + transition(I, Fetch) {TagArrayRead, TagArrayWrite} { + nS_issueRdBlkS; + p_popMandatoryQueue; + } + + // simple hit transitions + transition(V, Fetch) {TagArrayRead, DataArrayRead} { + l_loadDone; + p_popMandatoryQueue; + } +} diff --git a/src/mem/protocol/GPU_VIPER-TCC.sm b/src/mem/protocol/GPU_VIPER-TCC.sm new file mode 100644 index 000000000..f62df9f4f --- /dev/null +++ b/src/mem/protocol/GPU_VIPER-TCC.sm @@ -0,0 +1,739 @@ +/* + * Copyright (c) 2010-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Blake Hechtman + */ + +machine(MachineType:TCC, "TCC Cache") + : CacheMemory * L2cache; + bool WB; /*is this cache Writeback?*/ + Cycles l2_request_latency := 50; + Cycles l2_response_latency := 20; + + // From the TCPs or SQCs + MessageBuffer * requestFromTCP, network="From", virtual_network="1", vnet_type="request"; + // To the Cores. TCC deals only with TCPs/SQCs. + MessageBuffer * responseToCore, network="To", virtual_network="3", vnet_type="response"; + // From the NB + MessageBuffer * probeFromNB, network="From", virtual_network="0", vnet_type="request"; + MessageBuffer * responseFromNB, network="From", virtual_network="2", vnet_type="response"; + // To the NB + MessageBuffer * requestToNB, network="To", virtual_network="0", vnet_type="request"; + MessageBuffer * responseToNB, network="To", virtual_network="2", vnet_type="response"; + MessageBuffer * unblockToNB, network="To", virtual_network="4", vnet_type="unblock"; + + MessageBuffer * triggerQueue; + +{ + // EVENTS + enumeration(Event, desc="TCC Events") { + // Requests coming from the Cores + RdBlk, desc="RdBlk event"; + WrVicBlk, desc="L1 Write Through"; + WrVicBlkBack, desc="L1 Write Through(dirty cache)"; + Atomic, desc="Atomic Op"; + AtomicDone, desc="AtomicOps Complete"; + AtomicNotDone, desc="AtomicOps not Complete"; + Data, desc="data message"; + // Coming from this TCC + L2_Repl, desc="L2 Replacement"; + // Probes + PrbInv, desc="Invalidating probe"; + // Coming from Memory Controller + WBAck, desc="writethrough ack from memory"; + } + + // STATES + state_declaration(State, desc="TCC State", default="TCC_State_I") { + M, AccessPermission:Read_Write, desc="Modified(dirty cache only)"; + W, AccessPermission:Read_Write, desc="Written(dirty cache only)"; + V, AccessPermission:Read_Only, desc="Valid"; + I, AccessPermission:Invalid, desc="Invalid"; + IV, AccessPermission:Busy, desc="Waiting for Data"; + WI, AccessPermission:Busy, desc="Waiting on Writethrough Ack"; + A, AccessPermission:Busy, desc="Invalid waiting on atomic Data"; + } + + enumeration(RequestType, desc="To communicate stats from transitions to recordStats") { + DataArrayRead, desc="Read the data array"; + DataArrayWrite, desc="Write the data array"; + TagArrayRead, desc="Read the tag array"; + TagArrayWrite, desc="Write the tag array"; + } + + + // STRUCTURES + + structure(Entry, desc="...", interface="AbstractCacheEntry") { + State CacheState, desc="cache state"; + bool Dirty, desc="Is the data dirty (diff from memory?)"; + DataBlock DataBlk, desc="Data for the block"; + WriteMask writeMask, desc="Dirty byte mask"; + } + + structure(TBE, desc="...") { + State TBEState, desc="Transient state"; + DataBlock DataBlk, desc="data for the block"; + bool Dirty, desc="Is the data dirty?"; + bool Shared, desc="Victim hit by shared probe"; + MachineID From, desc="Waiting for writeback from..."; + NetDest Destination, desc="Data destination"; + int numAtomics, desc="number remaining atomics"; + } + + structure(TBETable, external="yes") { + TBE lookup(Addr); + void allocate(Addr); + void deallocate(Addr); + bool isPresent(Addr); + } + + TBETable TBEs, template="<TCC_TBE>", constructor="m_number_of_TBEs"; + + void set_cache_entry(AbstractCacheEntry b); + void unset_cache_entry(); + void set_tbe(TBE b); + void unset_tbe(); + void wakeUpAllBuffers(); + void wakeUpBuffers(Addr a); + + + // FUNCTION DEFINITIONS + Tick clockEdge(); + + Entry getCacheEntry(Addr addr), return_by_pointer="yes" { + return static_cast(Entry, "pointer", L2cache.lookup(addr)); + } + 
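The Entry structure above keeps a per-byte dirty mask (writeMask) alongside the data block; later actions (wdb_writeDirtyBytes, wb_writeBack) merge partial writes into it with copyPartial() and orMask() so that only the touched bytes are written through to the directory. The Python stand-in below sketches that merge, assuming a 64-byte block; it is not the Ruby WriteMask/DataBlock implementation.

```python
# Illustrative sketch of the per-byte dirty mask kept in each TCC Entry
# (WriteMask writeMask above) and used later by wdb_writeDirtyBytes /
# wb_writeBack via copyPartial() and orMask(). Pure-Python stand-in,
# assuming a 64-byte block; not the Ruby implementation.

BLOCK_SIZE = 64


class Block:
    def __init__(self):
        self.data = bytearray(BLOCK_SIZE)
        self.write_mask = [False] * BLOCK_SIZE  # which bytes are dirty

    def write_partial(self, new_data, mask):
        """copyPartial + orMask: merge only the bytes the write touched."""
        for i in range(BLOCK_SIZE):
            if mask[i]:
                self.data[i] = new_data[i]
                self.write_mask[i] = True  # accumulate dirty bytes

    def dirty_bytes(self):
        """What a write-through to the directory would need to carry."""
        return bytes(b for b, d in zip(self.data, self.write_mask) if d)


if __name__ == "__main__":
    blk = Block()
    wr = bytes(range(BLOCK_SIZE))
    mask = [i < 4 for i in range(BLOCK_SIZE)]   # store touched bytes 0..3
    blk.write_partial(wr, mask)
    assert blk.dirty_bytes() == bytes([0, 1, 2, 3])
```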
DataBlock getDataBlock(Addr addr), return_by_ref="yes" { + return getCacheEntry(addr).DataBlk; + } + + bool presentOrAvail(Addr addr) { + return L2cache.isTagPresent(addr) || L2cache.cacheAvail(addr); + } + + State getState(TBE tbe, Entry cache_entry, Addr addr) { + if (is_valid(tbe)) { + return tbe.TBEState; + } else if (is_valid(cache_entry)) { + return cache_entry.CacheState; + } + return State:I; + } + + void setState(TBE tbe, Entry cache_entry, Addr addr, State state) { + if (is_valid(tbe)) { + tbe.TBEState := state; + } + + if (is_valid(cache_entry)) { + cache_entry.CacheState := state; + } + } + + void functionalRead(Addr addr, Packet *pkt) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + testAndRead(addr, tbe.DataBlk, pkt); + } else { + functionalMemoryRead(pkt); + } + } + + int functionalWrite(Addr addr, Packet *pkt) { + int num_functional_writes := 0; + + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + num_functional_writes := num_functional_writes + + testAndWrite(addr, tbe.DataBlk, pkt); + } + + num_functional_writes := num_functional_writes + + functionalMemoryWrite(pkt); + return num_functional_writes; + } + + AccessPermission getAccessPermission(Addr addr) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return TCC_State_to_permission(tbe.TBEState); + } + + Entry cache_entry := getCacheEntry(addr); + if(is_valid(cache_entry)) { + return TCC_State_to_permission(cache_entry.CacheState); + } + + return AccessPermission:NotPresent; + } + + void setAccessPermission(Entry cache_entry, Addr addr, State state) { + if (is_valid(cache_entry)) { + cache_entry.changePermission(TCC_State_to_permission(state)); + } + } + + void recordRequestType(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + L2cache.recordRequestType(CacheRequestType:DataArrayRead, addr); + } else if (request_type == RequestType:DataArrayWrite) { + L2cache.recordRequestType(CacheRequestType:DataArrayWrite, addr); + } else if (request_type == RequestType:TagArrayRead) { + L2cache.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:TagArrayWrite) { + L2cache.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } + } + + bool checkResourceAvailable(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:DataArrayWrite) { + return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:TagArrayRead) { + return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:TagArrayWrite) { + return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else { + error("Invalid RequestType type in checkResourceAvailable"); + return true; + } + } + + + // ** OUT_PORTS ** + + // Three classes of ports + // Class 1: downward facing network links to NB + out_port(requestToNB_out, CPURequestMsg, requestToNB); + out_port(responseToNB_out, ResponseMsg, responseToNB); + out_port(unblockToNB_out, UnblockMsg, unblockToNB); + + // Class 2: upward facing ports to GPU cores + out_port(responseToCore_out, ResponseMsg, responseToCore); + + out_port(triggerQueue_out, TriggerMsg, triggerQueue); + // + // request queue going to NB + // + + +// ** IN_PORTS ** + in_port(triggerQueue_in, TiggerMsg, triggerQueue) { + if (triggerQueue_in.isReady(clockEdge())) { + 
peek(triggerQueue_in, TriggerMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + if (tbe.numAtomics == 0) { + trigger(Event:AtomicDone, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:AtomicNotDone, in_msg.addr, cache_entry, tbe); + } + } + } + } + + + + in_port(responseFromNB_in, ResponseMsg, responseFromNB) { + if (responseFromNB_in.isReady(clockEdge())) { + peek(responseFromNB_in, ResponseMsg, block_on="addr") { + TBE tbe := TBEs.lookup(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + if (in_msg.Type == CoherenceResponseType:NBSysResp) { + if(presentOrAvail(in_msg.addr)) { + trigger(Event:Data, in_msg.addr, cache_entry, tbe); + } else { + Addr victim := L2cache.cacheProbe(in_msg.addr); + trigger(Event:L2_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } else if (in_msg.Type == CoherenceResponseType:NBSysWBAck) { + trigger(Event:WBAck, in_msg.addr, cache_entry, tbe); + } else { + error("Unexpected Response Message to Core"); + } + } + } + } + + // Finally handling incoming requests (from TCP) and probes (from NB). + in_port(probeNetwork_in, NBProbeRequestMsg, probeFromNB) { + if (probeNetwork_in.isReady(clockEdge())) { + peek(probeNetwork_in, NBProbeRequestMsg) { + DPRINTF(RubySlicc, "%s\n", in_msg); + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe); + } + } + } + + in_port(coreRequestNetwork_in, CPURequestMsg, requestFromTCP, rank=0) { + if (coreRequestNetwork_in.isReady(clockEdge())) { + peek(coreRequestNetwork_in, CPURequestMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + if (in_msg.Type == CoherenceRequestType:WriteThrough) { + if(WB) { + if(presentOrAvail(in_msg.addr)) { + trigger(Event:WrVicBlkBack, in_msg.addr, cache_entry, tbe); + } else { + Addr victim := L2cache.cacheProbe(in_msg.addr); + trigger(Event:L2_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } else { + trigger(Event:WrVicBlk, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.Type == CoherenceRequestType:Atomic) { + trigger(Event:Atomic, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:RdBlk) { + trigger(Event:RdBlk, in_msg.addr, cache_entry, tbe); + } else { + DPRINTF(RubySlicc, "%s\n", in_msg); + error("Unexpected Response Message to Core"); + } + } + } + } + // BEGIN ACTIONS + + action(i_invL2, "i", desc="invalidate TCC cache block") { + if (is_valid(cache_entry)) { + L2cache.deallocate(address); + } + unset_cache_entry(); + } + + action(sd_sendData, "sd", desc="send Shared response") { + peek(coreRequestNetwork_in, CPURequestMsg) { + enqueue(responseToCore_out, ResponseMsg, l2_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysResp; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := false; + out_msg.State := CoherenceState:Shared; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + + action(sdr_sendDataResponse, "sdr", desc="send Shared response") { + enqueue(responseToCore_out, ResponseMsg, l2_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysResp; + out_msg.Sender := machineID; + out_msg.Destination := tbe.Destination; + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.MessageSize := 
MessageSizeType:Response_Data; + out_msg.Dirty := false; + out_msg.State := CoherenceState:Shared; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + enqueue(unblockToNB_out, UnblockMsg, 1) { + out_msg.addr := address; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Unblock_Control; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + + action(rd_requestData, "r", desc="Miss in L2, pass on") { + if(tbe.Destination.count()==1){ + peek(coreRequestNetwork_in, CPURequestMsg) { + enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Type := in_msg.Type; + out_msg.Requestor := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.Shared := false; // unneeded for this request + out_msg.MessageSize := in_msg.MessageSize; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + } + + action(w_sendResponseWBAck, "w", desc="send WB Ack") { + peek(responseFromNB_in, ResponseMsg) { + enqueue(responseToCore_out, ResponseMsg, l2_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysWBAck; + out_msg.Destination.clear(); + out_msg.Destination.add(in_msg.WTRequestor); + out_msg.Sender := machineID; + out_msg.MessageSize := MessageSizeType:Writeback_Control; + } + } + } + + action(swb_sendWBAck, "swb", desc="send WB Ack") { + peek(coreRequestNetwork_in, CPURequestMsg) { + enqueue(responseToCore_out, ResponseMsg, l2_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysWBAck; + out_msg.Destination.clear(); + out_msg.Destination.add(in_msg.Requestor); + out_msg.Sender := machineID; + out_msg.MessageSize := MessageSizeType:Writeback_Control; + } + } + } + + action(ar_sendAtomicResponse, "ar", desc="send Atomic Ack") { + peek(responseFromNB_in, ResponseMsg) { + enqueue(responseToCore_out, ResponseMsg, l2_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysResp; + out_msg.Destination.add(in_msg.WTRequestor); + out_msg.Sender := machineID; + out_msg.MessageSize := in_msg.MessageSize; + out_msg.DataBlk := in_msg.DataBlk; + } + } + } + + action(a_allocateBlock, "a", desc="allocate TCC block") { + if (is_invalid(cache_entry)) { + set_cache_entry(L2cache.allocate(address, new Entry)); + cache_entry.writeMask.clear(); + } + } + + action(t_allocateTBE, "t", desc="allocate TBE Entry") { + if (is_invalid(tbe)) { + check_allocate(TBEs); + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + tbe.Destination.clear(); + tbe.numAtomics := 0; + } + if (coreRequestNetwork_in.isReady(clockEdge())) { + peek(coreRequestNetwork_in, CPURequestMsg) { + if(in_msg.Type == CoherenceRequestType:RdBlk || in_msg.Type == CoherenceRequestType:Atomic){ + tbe.Destination.add(in_msg.Requestor); + } + } + } + } + + action(dt_deallocateTBE, "dt", desc="Deallocate TBE entry") { + tbe.Destination.clear(); + TBEs.deallocate(address); + unset_tbe(); + } + + action(wcb_writeCacheBlock, "wcb", desc="write data to TCC") { + peek(responseFromNB_in, ResponseMsg) { + cache_entry.DataBlk := in_msg.DataBlk; + DPRINTF(RubySlicc, "Writing to TCC: %s\n", in_msg); + } + } + + action(wdb_writeDirtyBytes, "wdb", desc="write data to TCC") { + peek(coreRequestNetwork_in, CPURequestMsg) { + cache_entry.DataBlk.copyPartial(in_msg.DataBlk,in_msg.writeMask); + cache_entry.writeMask.orMask(in_msg.writeMask); + DPRINTF(RubySlicc, "Writing to TCC: %s\n", in_msg); + } + } + + action(wt_writeThrough, "wt", desc="write back data") { + 
    peek(coreRequestNetwork_in, CPURequestMsg) {
+      enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) {
+        out_msg.addr := address;
+        out_msg.Requestor := machineID;
+        out_msg.WTRequestor := in_msg.Requestor;
+        out_msg.Destination.add(map_Address_to_Directory(address));
+        out_msg.MessageSize := MessageSizeType:Data;
+        out_msg.Type := CoherenceRequestType:WriteThrough;
+        out_msg.Dirty := true;
+        out_msg.DataBlk := in_msg.DataBlk;
+        out_msg.writeMask.orMask(in_msg.writeMask);
+      }
+    }
+  }
+
+  action(wb_writeBack, "wb", desc="write back data") {
+    enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) {
+      out_msg.addr := address;
+      out_msg.Requestor := machineID;
+      out_msg.WTRequestor := machineID;
+      out_msg.Destination.add(map_Address_to_Directory(address));
+      out_msg.MessageSize := MessageSizeType:Data;
+      out_msg.Type := CoherenceRequestType:WriteThrough;
+      out_msg.Dirty := true;
+      out_msg.DataBlk := cache_entry.DataBlk;
+      out_msg.writeMask.orMask(cache_entry.writeMask);
+    }
+  }
+
+  action(at_atomicThrough, "at", desc="write back data") {
+    peek(coreRequestNetwork_in, CPURequestMsg) {
+      enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) {
+        out_msg.addr := address;
+        out_msg.Requestor := machineID;
+        out_msg.WTRequestor := in_msg.Requestor;
+        out_msg.Destination.add(map_Address_to_Directory(address));
+        out_msg.MessageSize := MessageSizeType:Data;
+        out_msg.Type := CoherenceRequestType:Atomic;
+        out_msg.Dirty := true;
+        out_msg.writeMask.orMask(in_msg.writeMask);
+      }
+    }
+  }
+
+  action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") {
+    enqueue(responseToNB_out, ResponseMsg, 1) {
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:CPUPrbResp; // TCC, L3 respond in same way to probes
+      out_msg.Sender := machineID;
+      out_msg.Destination.add(map_Address_to_Directory(address));
+      out_msg.Dirty := false;
+      out_msg.Hit := false;
+      out_msg.Ntsl := true;
+      out_msg.State := CoherenceState:NA;
+      out_msg.MessageSize := MessageSizeType:Response_Control;
+    }
+  }
+  action(ut_updateTag, "ut", desc="update Tag (i.e. set MRU)") {
+    L2cache.setMRU(address);
+  }
+
+  action(p_popRequestQueue, "p", desc="pop request queue") {
+    coreRequestNetwork_in.dequeue(clockEdge());
+  }
+
+  action(pr_popResponseQueue, "pr", desc="pop response queue") {
+    responseFromNB_in.dequeue(clockEdge());
+  }
+
+  action(pp_popProbeQueue, "pp", desc="pop probe queue") {
+    probeNetwork_in.dequeue(clockEdge());
+  }
+
+  action(z_stall, "z", desc="stall") {
+    // built-in
+  }
+
+
+  action(ina_incrementNumAtomics, "ina", desc="inc num atomics") {
+    tbe.numAtomics := tbe.numAtomics + 1;
+  }
+
+
+  action(dna_decrementNumAtomics, "dna", desc="dec num atomics") {
+    tbe.numAtomics := tbe.numAtomics - 1;
+    if (tbe.numAtomics==0) {
+      enqueue(triggerQueue_out, TriggerMsg, 1) {
+        out_msg.addr := address;
+        out_msg.Type := TriggerType:AtomicDone;
+      }
+    }
+  }
+
+  action(ptr_popTriggerQueue, "ptr", desc="pop Trigger") {
+    triggerQueue_in.dequeue(clockEdge());
+  }
+
+  // END ACTIONS
+
+  // BEGIN TRANSITIONS
+  // transitions from base
+  // Assumptions for ArrayRead/Write
+  // TBE checked before tags
+  // Data Read/Write requires Tag Read
+
+  // Stalling transitions do NOT check the tag array...and if they do,
+  // they can cause a resource stall deadlock!
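The two bookkeeping actions above, ina_incrementNumAtomics and dna_decrementNumAtomics, implement a per-block outstanding-atomic counter: each Atomic forwarded toward the directory bumps tbe.numAtomics, each atomic response decrements it, and only the response that brings the count to zero enqueues a TriggerType:AtomicDone message, which is what eventually lets the block leave the A state. A minimal C++ sketch of that counting pattern follows; it is illustrative only, the class and member names are invented, and nothing in it is part of the patch.

    #include <cassert>
    #include <functional>
    #include <utility>

    // Track outstanding atomics for one cache block and fire a completion
    // callback (the analogue of enqueueing TriggerType:AtomicDone) only when
    // the last outstanding response has arrived.
    class AtomicCounter
    {
      public:
        explicit AtomicCounter(std::function<void()> on_done)
            : on_done_(std::move(on_done)) {}

        void issued() { ++num_atomics_; }      // cf. ina_incrementNumAtomics

        void completed()                       // cf. dna_decrementNumAtomics
        {
            assert(num_atomics_ > 0);
            if (--num_atomics_ == 0)
                on_done_();
        }

      private:
        int num_atomics_ = 0;
        std::function<void()> on_done_;
    };

The SLICC version keeps the count in the TBE and delivers the completion through the ordered triggerQueue, so AtomicDone is processed like any other event rather than as a direct callback.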
+ + transition(WI, {RdBlk, WrVicBlk, Atomic, WrVicBlkBack}) { //TagArrayRead} { + z_stall; + } + transition(A, {RdBlk, WrVicBlk, WrVicBlkBack}) { //TagArrayRead} { + z_stall; + } + transition(IV, {WrVicBlk, Atomic, WrVicBlkBack}) { //TagArrayRead} { + z_stall; + } + transition({M, V}, RdBlk) {TagArrayRead, DataArrayRead} { + sd_sendData; + ut_updateTag; + p_popRequestQueue; + } + transition(W, RdBlk, WI) {TagArrayRead, DataArrayRead} { + t_allocateTBE; + wb_writeBack; + } + + transition(I, RdBlk, IV) {TagArrayRead} { + t_allocateTBE; + rd_requestData; + p_popRequestQueue; + } + + transition(IV, RdBlk) { + t_allocateTBE; + rd_requestData; + p_popRequestQueue; + } + + transition({V, I},Atomic, A) {TagArrayRead} { + i_invL2; + t_allocateTBE; + at_atomicThrough; + ina_incrementNumAtomics; + p_popRequestQueue; + } + + transition(A, Atomic) { + at_atomicThrough; + ina_incrementNumAtomics; + p_popRequestQueue; + } + + transition({M, W}, Atomic, WI) {TagArrayRead} { + t_allocateTBE; + wb_writeBack; + } + + transition(I, WrVicBlk) {TagArrayRead} { + wt_writeThrough; + p_popRequestQueue; + } + + transition(V, WrVicBlk) {TagArrayRead, DataArrayWrite} { + ut_updateTag; + wdb_writeDirtyBytes; + wt_writeThrough; + p_popRequestQueue; + } + + transition({V, M}, WrVicBlkBack, M) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + ut_updateTag; + swb_sendWBAck; + wdb_writeDirtyBytes; + p_popRequestQueue; + } + + transition(W, WrVicBlkBack) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + ut_updateTag; + swb_sendWBAck; + wdb_writeDirtyBytes; + p_popRequestQueue; + } + + transition(I, WrVicBlkBack, W) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + a_allocateBlock; + ut_updateTag; + swb_sendWBAck; + wdb_writeDirtyBytes; + p_popRequestQueue; + } + + transition({W, M}, L2_Repl, WI) {TagArrayRead, DataArrayRead} { + t_allocateTBE; + wb_writeBack; + i_invL2; + } + + transition({I, V}, L2_Repl, I) {TagArrayRead, TagArrayWrite} { + i_invL2; + } + + transition({A, IV, WI}, L2_Repl) { + i_invL2; + } + + transition({I, V}, PrbInv, I) {TagArrayRead, TagArrayWrite} { + pi_sendProbeResponseInv; + pp_popProbeQueue; + } + + transition(M, PrbInv, W) {TagArrayRead, TagArrayWrite} { + pi_sendProbeResponseInv; + pp_popProbeQueue; + } + + transition(W, PrbInv) {TagArrayRead} { + pi_sendProbeResponseInv; + pp_popProbeQueue; + } + + transition({A, IV, WI}, PrbInv) { + pi_sendProbeResponseInv; + pp_popProbeQueue; + } + + transition(IV, Data, V) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + a_allocateBlock; + ut_updateTag; + wcb_writeCacheBlock; + sdr_sendDataResponse; + pr_popResponseQueue; + dt_deallocateTBE; + } + + transition(A, Data) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + a_allocateBlock; + ar_sendAtomicResponse; + dna_decrementNumAtomics; + pr_popResponseQueue; + } + + transition(A, AtomicDone, I) {TagArrayRead, TagArrayWrite} { + dt_deallocateTBE; + ptr_popTriggerQueue; + } + + transition(A, AtomicNotDone) {TagArrayRead} { + ptr_popTriggerQueue; + } + + //M,W should not see WBAck as the cache is in WB mode + //WBAcks do not need to check tags + transition({I, V, IV, A}, WBAck) { + w_sendResponseWBAck; + pr_popResponseQueue; + } + + transition(WI, WBAck,I) { + dt_deallocateTBE; + pr_popResponseQueue; + } +} diff --git a/src/mem/protocol/GPU_VIPER-TCP.sm b/src/mem/protocol/GPU_VIPER-TCP.sm new file mode 100644 index 000000000..d81196b17 --- /dev/null +++ b/src/mem/protocol/GPU_VIPER-TCP.sm @@ -0,0 +1,747 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Blake Hechtman + */ + +machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") + : VIPERCoalescer* coalescer; + Sequencer* sequencer; + bool use_seq_not_coal; + CacheMemory * L1cache; + bool WB; /*is this cache Writeback?*/ + bool disableL1; /* bypass L1 cache? 
*/ + int TCC_select_num_bits; + Cycles issue_latency := 40; // time to send data down to TCC + Cycles l2_hit_latency := 18; + + MessageBuffer * requestFromTCP, network="To", virtual_network="1", vnet_type="request"; + MessageBuffer * responseFromTCP, network="To", virtual_network="3", vnet_type="response"; + MessageBuffer * unblockFromCore, network="To", virtual_network="5", vnet_type="unblock"; + + MessageBuffer * probeToTCP, network="From", virtual_network="1", vnet_type="request"; + MessageBuffer * responseToTCP, network="From", virtual_network="3", vnet_type="response"; + MessageBuffer * mandatoryQueue; + +{ + state_declaration(State, desc="TCP Cache States", default="TCP_State_I") { + I, AccessPermission:Invalid, desc="Invalid"; + V, AccessPermission:Read_Only, desc="Valid"; + W, AccessPermission:Read_Write, desc="Written"; + M, AccessPermission:Read_Write, desc="Written and Valid"; + L, AccessPermission:Read_Write, desc="Local access is modifable"; + A, AccessPermission:Invalid, desc="Waiting on Atomic"; + } + + enumeration(Event, desc="TCP Events") { + // Core initiated + Load, desc="Load"; + Store, desc="Store to L1 (L1 is dirty)"; + StoreThrough, desc="Store directly to L2(L1 is clean)"; + StoreLocal, desc="Store to L1 but L1 is clean"; + Atomic, desc="Atomic"; + Flush, desc="Flush if dirty(wbL1 for Store Release)"; + Evict, desc="Evict if clean(invL1 for Load Acquire)"; + // Mem sys initiated + Repl, desc="Replacing block from cache"; + + // TCC initiated + TCC_Ack, desc="TCC Ack to Core Request"; + TCC_AckWB, desc="TCC Ack for WB"; + // Disable L1 cache + Bypass, desc="Bypass the entire L1 cache"; + } + + enumeration(RequestType, + desc="To communicate stats from transitions to recordStats") { + DataArrayRead, desc="Read the data array"; + DataArrayWrite, desc="Write the data array"; + TagArrayRead, desc="Read the data array"; + TagArrayWrite, desc="Write the data array"; + TagArrayFlash, desc="Flash clear the data array"; + } + + + structure(Entry, desc="...", interface="AbstractCacheEntry") { + State CacheState, desc="cache state"; + bool Dirty, desc="Is the data dirty (diff than memory)?"; + DataBlock DataBlk, desc="data for the block"; + bool FromL2, default="false", desc="block just moved from L2"; + WriteMask writeMask, desc="written bytes masks"; + } + + structure(TBE, desc="...") { + State TBEState, desc="Transient state"; + DataBlock DataBlk, desc="data for the block, required for concurrent writebacks"; + bool Dirty, desc="Is the data dirty (different than memory)?"; + int NumPendingMsgs,desc="Number of acks/data messages that this processor is waiting for"; + bool Shared, desc="Victim hit by shared probe"; + } + + structure(TBETable, external="yes") { + TBE lookup(Addr); + void allocate(Addr); + void deallocate(Addr); + bool isPresent(Addr); + } + + TBETable TBEs, template="<TCP_TBE>", constructor="m_number_of_TBEs"; + int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()"; + int WTcnt, default="0"; + int Fcnt, default="0"; + bool inFlush, default="false"; + + void set_cache_entry(AbstractCacheEntry b); + void unset_cache_entry(); + void set_tbe(TBE b); + void unset_tbe(); + void wakeUpAllBuffers(); + void wakeUpBuffers(Addr a); + Cycles curCycle(); + + // Internal functions + Tick clockEdge(); + Tick cyclesToTicks(Cycles c); + Entry getCacheEntry(Addr address), return_by_pointer="yes" { + Entry cache_entry := static_cast(Entry, "pointer", L1cache.lookup(address)); + return cache_entry; + } + + DataBlock getDataBlock(Addr addr), return_by_ref="yes" { + TBE 
tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return tbe.DataBlk; + } else { + return getCacheEntry(addr).DataBlk; + } + } + + State getState(TBE tbe, Entry cache_entry, Addr addr) { + if (is_valid(tbe)) { + return tbe.TBEState; + } else if (is_valid(cache_entry)) { + return cache_entry.CacheState; + } + return State:I; + } + + void setState(TBE tbe, Entry cache_entry, Addr addr, State state) { + if (is_valid(tbe)) { + tbe.TBEState := state; + } + + if (is_valid(cache_entry)) { + cache_entry.CacheState := state; + } + } + + void functionalRead(Addr addr, Packet *pkt) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + testAndRead(addr, tbe.DataBlk, pkt); + } else { + functionalMemoryRead(pkt); + } + } + + int functionalWrite(Addr addr, Packet *pkt) { + int num_functional_writes := 0; + + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + num_functional_writes := num_functional_writes + + testAndWrite(addr, tbe.DataBlk, pkt); + } + + num_functional_writes := num_functional_writes + + functionalMemoryWrite(pkt); + return num_functional_writes; + } + + AccessPermission getAccessPermission(Addr addr) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return TCP_State_to_permission(tbe.TBEState); + } + + Entry cache_entry := getCacheEntry(addr); + if(is_valid(cache_entry)) { + return TCP_State_to_permission(cache_entry.CacheState); + } + + return AccessPermission:NotPresent; + } + + bool isValid(Addr addr) { + AccessPermission perm := getAccessPermission(addr); + if (perm == AccessPermission:NotPresent || + perm == AccessPermission:Invalid || + perm == AccessPermission:Busy) { + return false; + } else { + return true; + } + } + + void setAccessPermission(Entry cache_entry, Addr addr, State state) { + if (is_valid(cache_entry)) { + cache_entry.changePermission(TCP_State_to_permission(state)); + } + } + + void recordRequestType(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + L1cache.recordRequestType(CacheRequestType:DataArrayRead, addr); + } else if (request_type == RequestType:DataArrayWrite) { + L1cache.recordRequestType(CacheRequestType:DataArrayWrite, addr); + } else if (request_type == RequestType:TagArrayRead) { + L1cache.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:TagArrayFlash) { + L1cache.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:TagArrayWrite) { + L1cache.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } + } + + bool checkResourceAvailable(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + return L1cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:DataArrayWrite) { + return L1cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:TagArrayRead) { + return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:TagArrayWrite) { + return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:TagArrayFlash) { + // FIXME should check once per cache, rather than once per cacheline + return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else { + error("Invalid RequestType type in checkResourceAvailable"); + return true; + } + } + + // Out Ports + + out_port(requestNetwork_out, CPURequestMsg, requestFromTCP); + + // In Ports + + 
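A pattern that repeats through the accessor functions above is TBE-first lookup: getDataBlock returns the copy held in the transient buffer entry when one exists and only then falls back to the cache array, and functionalRead likewise prefers the TBE before going to backing memory, because a block in transition may only be current in its TBE. The short C++ sketch below shows that precedence as one combined lookup; it is only an illustration under that assumption, and the Addr/DataBlock/TBE/Entry types are stand-ins rather than gem5 classes.

    #include <cstdint>
    #include <unordered_map>

    using Addr = std::uint64_t;
    using DataBlock = std::uint64_t;  // stand-in for a full cache line

    struct TBE   { DataBlock data; };
    struct Entry { DataBlock data; };

    // TBE-first precedence: a block that is mid-transition is only
    // guaranteed to be current in its transient buffer entry.
    DataBlock
    readBlock(Addr addr,
              const std::unordered_map<Addr, TBE> &tbes,
              const std::unordered_map<Addr, Entry> &cache,
              DataBlock (*memoryRead)(Addr))
    {
        if (auto it = tbes.find(addr); it != tbes.end())
            return it->second.data;   // like the is_valid(tbe) branch
        if (auto it = cache.find(addr); it != cache.end())
            return it->second.data;   // like getCacheEntry(addr).DataBlk
        return memoryRead(addr);      // like functionalMemoryRead(pkt)
    }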
in_port(responseToTCP_in, ResponseMsg, responseToTCP) { + if (responseToTCP_in.isReady(clockEdge())) { + peek(responseToTCP_in, ResponseMsg, block_on="addr") { + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + if (in_msg.Type == CoherenceResponseType:TDSysResp) { + // disable L1 cache + if (disableL1) { + trigger(Event:Bypass, in_msg.addr, cache_entry, tbe); + } else { + if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.addr)) { + trigger(Event:TCC_Ack, in_msg.addr, cache_entry, tbe); + } else { + Addr victim := L1cache.cacheProbe(in_msg.addr); + trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } + } else if (in_msg.Type == CoherenceResponseType:TDSysWBAck || + in_msg.Type == CoherenceResponseType:NBSysWBAck) { + trigger(Event:TCC_AckWB, in_msg.addr, cache_entry, tbe); + } else { + error("Unexpected Response Message to Core"); + } + } + } + } + + in_port(mandatoryQueue_in, RubyRequest, mandatoryQueue, desc="...") { + if (mandatoryQueue_in.isReady(clockEdge())) { + peek(mandatoryQueue_in, RubyRequest, block_on="LineAddress") { + Entry cache_entry := getCacheEntry(in_msg.LineAddress); + TBE tbe := TBEs.lookup(in_msg.LineAddress); + DPRINTF(RubySlicc, "%s\n", in_msg); + if (in_msg.Type == RubyRequestType:LD) { + trigger(Event:Load, in_msg.LineAddress, cache_entry, tbe); + } else if (in_msg.Type == RubyRequestType:ATOMIC) { + trigger(Event:Atomic, in_msg.LineAddress, cache_entry, tbe); + } else if (in_msg.Type == RubyRequestType:ST) { + if(disableL1) { + trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe); + } else { + if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) { + if (in_msg.segment == HSASegment:SPILL) { + trigger(Event:StoreLocal, in_msg.LineAddress, cache_entry, tbe); + } else if (WB) { + trigger(Event:Store, in_msg.LineAddress, cache_entry, tbe); + } else { + trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe); + } + } else { + Addr victim := L1cache.cacheProbe(in_msg.LineAddress); + trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } // end if (disableL1) + } else if (in_msg.Type == RubyRequestType:FLUSH) { + trigger(Event:Flush, in_msg.LineAddress, cache_entry, tbe); + } else if (in_msg.Type == RubyRequestType:REPLACEMENT){ + trigger(Event:Evict, in_msg.LineAddress, cache_entry, tbe); + } else { + error("Unexpected Request Message from VIC"); + if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) { + if (WB) { + trigger(Event:Store, in_msg.LineAddress, cache_entry, tbe); + } else { + trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe); + } + } else { + Addr victim := L1cache.cacheProbe(in_msg.LineAddress); + trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } + } + } + } + + // Actions + + action(ic_invCache, "ic", desc="invalidate cache") { + if(is_valid(cache_entry)) { + cache_entry.writeMask.clear(); + L1cache.deallocate(address); + } + unset_cache_entry(); + } + + action(n_issueRdBlk, "n", desc="Issue RdBlk") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:RdBlk; + out_msg.Requestor := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := curCycle(); + } + } + + action(rb_bypassDone, "rb", desc="bypass L1 of read access") { + 
peek(responseToTCP_in, ResponseMsg) { + DataBlock tmp:= in_msg.DataBlk; + if (use_seq_not_coal) { + sequencer.readCallback(address, tmp, false, MachineType:L1Cache); + } else { + coalescer.readCallback(address, MachineType:L1Cache, tmp); + } + if(is_valid(cache_entry)) { + unset_cache_entry(); + } + } + } + + action(wab_bypassDone, "wab", desc="bypass L1 of write access") { + peek(responseToTCP_in, ResponseMsg) { + DataBlock tmp := in_msg.DataBlk; + if (use_seq_not_coal) { + sequencer.writeCallback(address, tmp, false, MachineType:L1Cache); + } else { + coalescer.writeCallback(address, MachineType:L1Cache, tmp); + } + } + } + + action(norl_issueRdBlkOrloadDone, "norl", desc="local load done") { + peek(mandatoryQueue_in, RubyRequest){ + if (cache_entry.writeMask.cmpMask(in_msg.writeMask)) { + if (use_seq_not_coal) { + sequencer.readCallback(address, cache_entry.DataBlk, false, MachineType:L1Cache); + } else { + coalescer.readCallback(address, MachineType:L1Cache, cache_entry.DataBlk); + } + } else { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:RdBlk; + out_msg.Requestor := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := curCycle(); + } + } + } + } + + action(wt_writeThrough, "wt", desc="Flush dirty data") { + WTcnt := WTcnt + 1; + APPEND_TRANSITION_COMMENT("write++ = "); + APPEND_TRANSITION_COMMENT(WTcnt); + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + assert(is_valid(cache_entry)); + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.writeMask.clear(); + out_msg.writeMask.orMask(cache_entry.writeMask); + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Data; + out_msg.Type := CoherenceRequestType:WriteThrough; + out_msg.InitialRequestTime := curCycle(); + out_msg.Shared := false; + } + } + + action(at_atomicThrough, "at", desc="send Atomic") { + peek(mandatoryQueue_in, RubyRequest) { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + out_msg.writeMask.clear(); + out_msg.writeMask.orMask(in_msg.writeMask); + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Data; + out_msg.Type := CoherenceRequestType:Atomic; + out_msg.InitialRequestTime := curCycle(); + out_msg.Shared := false; + } + } + } + + action(a_allocate, "a", desc="allocate block") { + if (is_invalid(cache_entry)) { + set_cache_entry(L1cache.allocate(address, new Entry)); + } + cache_entry.writeMask.clear(); + } + + action(t_allocateTBE, "t", desc="allocate TBE Entry") { + check_allocate(TBEs); + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + } + + action(d_deallocateTBE, "d", desc="Deallocate TBE") { + TBEs.deallocate(address); + unset_tbe(); + } + + action(sf_setFlush, "sf", desc="set flush") { + inFlush := true; + APPEND_TRANSITION_COMMENT(" inFlush is true"); + } + + action(p_popMandatoryQueue, "pm", desc="Pop Mandatory Queue") { + mandatoryQueue_in.dequeue(clockEdge()); + } + + action(pr_popResponseQueue, "pr", desc="Pop Response Queue") { + responseToTCP_in.dequeue(clockEdge()); + } + + action(l_loadDone, "l", 
desc="local load done") { + assert(is_valid(cache_entry)); + if (use_seq_not_coal) { + sequencer.readCallback(address, cache_entry.DataBlk, false, MachineType:L1Cache); + } else { + coalescer.readCallback(address, MachineType:L1Cache, cache_entry.DataBlk); + } + } + + action(s_storeDone, "s", desc="local store done") { + assert(is_valid(cache_entry)); + + if (use_seq_not_coal) { + sequencer.writeCallback(address, cache_entry.DataBlk, false, MachineType:L1Cache); + } else { + coalescer.writeCallback(address, MachineType:L1Cache, cache_entry.DataBlk); + } + cache_entry.Dirty := true; + } + + action(inv_invDone, "inv", desc="local inv done") { + if (use_seq_not_coal) { + DPRINTF(RubySlicc, "Sequencer does not define invCallback!\n"); + assert(false); + } else { + coalescer.invCallback(address); + } + } + + action(wb_wbDone, "wb", desc="local wb done") { + if (inFlush == true) { + Fcnt := Fcnt + 1; + if (Fcnt > WTcnt) { + if (use_seq_not_coal) { + DPRINTF(RubySlicc, "Sequencer does not define wbCallback!\n"); + assert(false); + } else { + coalescer.wbCallback(address); + } + Fcnt := Fcnt - 1; + } + if (WTcnt == 0 && Fcnt == 0) { + inFlush := false; + APPEND_TRANSITION_COMMENT(" inFlush is false"); + } + } + } + + action(wd_wtDone, "wd", desc="writethrough done") { + WTcnt := WTcnt - 1; + if (inFlush == true) { + Fcnt := Fcnt -1; + } + assert(WTcnt >= 0); + APPEND_TRANSITION_COMMENT("write-- = "); + APPEND_TRANSITION_COMMENT(WTcnt); + } + + action(dw_dirtyWrite, "dw", desc="update write mask"){ + peek(mandatoryQueue_in, RubyRequest) { + cache_entry.DataBlk.copyPartial(in_msg.WTData,in_msg.writeMask); + cache_entry.writeMask.orMask(in_msg.writeMask); + } + } + action(w_writeCache, "w", desc="write data to cache") { + peek(responseToTCP_in, ResponseMsg) { + assert(is_valid(cache_entry)); + DataBlock tmp := in_msg.DataBlk; + tmp.copyPartial(cache_entry.DataBlk,cache_entry.writeMask); + cache_entry.DataBlk := tmp; + } + } + + action(mru_updateMRU, "mru", desc="Touch block for replacement policy") { + L1cache.setMRU(address); + } + +// action(zz_recycleMandatoryQueue, "\z", desc="recycle mandatory queue") { +// mandatoryQueue_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); +// } + + action(z_stall, "z", desc="stall; built-in") { + // built-int action + } + + // Transitions + // ArrayRead/Write assumptions: + // All requests read Tag Array + // TBE allocation write the TagArray to I + // TBE only checked on misses + // Stores will also write dirty bits in the tag + // WriteThroughs still need to use cache entry as staging buffer for wavefront + + // Stalling transitions do NOT check the tag array...and if they do, + // they can cause a resource stall deadlock! 
+ + transition({A}, {Load, Store, Atomic, StoreThrough}) { //TagArrayRead} { + z_stall; + } + + transition({M, V, L}, Load) {TagArrayRead, DataArrayRead} { + l_loadDone; + mru_updateMRU; + p_popMandatoryQueue; + } + + transition(I, Load) {TagArrayRead} { + n_issueRdBlk; + p_popMandatoryQueue; + } + + transition({V, I}, Atomic, A) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + mru_updateMRU; + at_atomicThrough; + p_popMandatoryQueue; + } + + transition({M, W}, Atomic, A) {TagArrayRead, TagArrayWrite} { + wt_writeThrough; + t_allocateTBE; + at_atomicThrough; + ic_invCache; + } + + transition(W, Load, I) {TagArrayRead, DataArrayRead} { + wt_writeThrough; + norl_issueRdBlkOrloadDone; + p_popMandatoryQueue; + } + + transition({I}, StoreLocal, L) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + a_allocate; + dw_dirtyWrite; + s_storeDone; + p_popMandatoryQueue; + } + + transition({L, V}, StoreLocal, L) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + dw_dirtyWrite; + mru_updateMRU; + s_storeDone; + p_popMandatoryQueue; + } + + transition(I, Store, W) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + a_allocate; + dw_dirtyWrite; + s_storeDone; + p_popMandatoryQueue; + } + + transition(V, Store, M) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + dw_dirtyWrite; + mru_updateMRU; + s_storeDone; + p_popMandatoryQueue; + } + + transition({M, W}, Store) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + dw_dirtyWrite; + mru_updateMRU; + s_storeDone; + p_popMandatoryQueue; + } + + //M,W should not see storeThrough + transition(I, StoreThrough) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + a_allocate; + dw_dirtyWrite; + s_storeDone; + wt_writeThrough; + ic_invCache; + p_popMandatoryQueue; + } + + transition({V,L}, StoreThrough, I) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + dw_dirtyWrite; + s_storeDone; + wt_writeThrough; + ic_invCache; + p_popMandatoryQueue; + } + + transition(I, TCC_Ack, V) {TagArrayRead, TagArrayWrite, DataArrayRead, DataArrayWrite} { + a_allocate; + w_writeCache; + l_loadDone; + pr_popResponseQueue; + } + + transition(I, Bypass, I) { + rb_bypassDone; + pr_popResponseQueue; + } + + transition(A, Bypass, I){ + d_deallocateTBE; + wab_bypassDone; + pr_popResponseQueue; + } + + transition(A, TCC_Ack, I) {TagArrayRead, DataArrayRead, DataArrayWrite} { + d_deallocateTBE; + a_allocate; + w_writeCache; + s_storeDone; + pr_popResponseQueue; + ic_invCache; + } + + transition(V, TCC_Ack, V) {TagArrayRead, DataArrayRead, DataArrayWrite} { + w_writeCache; + l_loadDone; + pr_popResponseQueue; + } + + transition({W, M}, TCC_Ack, M) {TagArrayRead, TagArrayWrite, DataArrayRead, DataArrayWrite} { + w_writeCache; + l_loadDone; + pr_popResponseQueue; + } + + transition({I, V}, Repl, I) {TagArrayRead, TagArrayWrite} { + ic_invCache; + } + + transition({A}, Repl) {TagArrayRead, TagArrayWrite} { + ic_invCache; + } + + transition({W, M}, Repl, I) {TagArrayRead, TagArrayWrite, DataArrayRead} { + wt_writeThrough; + ic_invCache; + } + + transition(L, Repl, I) {TagArrayRead, TagArrayWrite, DataArrayRead} { + wt_writeThrough; + ic_invCache; + } + + transition({W, M}, Flush, I) {TagArrayRead, TagArrayWrite, DataArrayRead} { + sf_setFlush; + wt_writeThrough; + ic_invCache; + p_popMandatoryQueue; + } + + transition({V, I, A, L},Flush) {TagArrayFlash} { + sf_setFlush; + wb_wbDone; + p_popMandatoryQueue; + } + + transition({I, V}, Evict, I) {TagArrayFlash} { + inv_invDone; + p_popMandatoryQueue; + ic_invCache; + } + + transition({W, M}, Evict, W) {TagArrayFlash} { + inv_invDone; + p_popMandatoryQueue; + 
} + + transition({A, L}, Evict) {TagArrayFlash} { + inv_invDone; + p_popMandatoryQueue; + } + + // TCC_AckWB only snoops TBE + transition({V, I, A, M, W, L}, TCC_AckWB) { + wd_wtDone; + wb_wbDone; + pr_popResponseQueue; + } +} diff --git a/src/mem/protocol/GPU_VIPER.slicc b/src/mem/protocol/GPU_VIPER.slicc new file mode 100644 index 000000000..45f7f3477 --- /dev/null +++ b/src/mem/protocol/GPU_VIPER.slicc @@ -0,0 +1,9 @@ +protocol "GPU_VIPER"; +include "RubySlicc_interfaces.slicc"; +include "MOESI_AMD_Base-msg.sm"; +include "MOESI_AMD_Base-dir.sm"; +include "MOESI_AMD_Base-CorePair.sm"; +include "GPU_VIPER-TCP.sm"; +include "GPU_VIPER-SQC.sm"; +include "GPU_VIPER-TCC.sm"; +include "MOESI_AMD_Base-L3cache.sm"; diff --git a/src/mem/protocol/GPU_VIPER_Baseline.slicc b/src/mem/protocol/GPU_VIPER_Baseline.slicc new file mode 100644 index 000000000..49bdce38c --- /dev/null +++ b/src/mem/protocol/GPU_VIPER_Baseline.slicc @@ -0,0 +1,9 @@ +protocol "GPU_VIPER"; +include "RubySlicc_interfaces.slicc"; +include "MOESI_AMD_Base-msg.sm"; +include "MOESI_AMD_Base-probeFilter.sm"; +include "MOESI_AMD_Base-CorePair.sm"; +include "GPU_VIPER-TCP.sm"; +include "GPU_VIPER-SQC.sm"; +include "GPU_VIPER-TCC.sm"; +include "MOESI_AMD_Base-L3cache.sm"; diff --git a/src/mem/protocol/GPU_VIPER_Region-TCC.sm b/src/mem/protocol/GPU_VIPER_Region-TCC.sm new file mode 100644 index 000000000..c3aef15a3 --- /dev/null +++ b/src/mem/protocol/GPU_VIPER_Region-TCC.sm @@ -0,0 +1,773 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor, Blake Hechtman + */ + +/* + * This file is inherited from GPU_VIPER-TCC.sm and retains its structure. 
+ * There are very few modifications in this file from the original VIPER TCC
+ */
+
+machine(MachineType:TCC, "TCC Cache")
+  : CacheMemory * L2cache;
+    bool WB; /*is this cache Writeback?*/
+    int regionBufferNum;
+    Cycles l2_request_latency := 50;
+    Cycles l2_response_latency := 20;
+
+    // From the TCPs or SQCs
+    MessageBuffer * requestFromTCP, network="From", virtual_network="1", ordered="true", vnet_type="request";
+    // To the Cores. TCC deals only with TCPs/SQCs. CP cores do not communicate directly with TCC.
+    MessageBuffer * responseToCore, network="To", virtual_network="3", ordered="true", vnet_type="response";
+    // From the NB
+    MessageBuffer * probeFromNB, network="From", virtual_network="0", ordered="false", vnet_type="request";
+    MessageBuffer * responseFromNB, network="From", virtual_network="2", ordered="false", vnet_type="response";
+    // To the NB
+    MessageBuffer * requestToNB, network="To", virtual_network="0", ordered="false", vnet_type="request";
+    MessageBuffer * responseToNB, network="To", virtual_network="2", ordered="false", vnet_type="response";
+    MessageBuffer * unblockToNB, network="To", virtual_network="4", ordered="false", vnet_type="unblock";
+
+    MessageBuffer * triggerQueue, ordered="true", random="false";
+{
+  // EVENTS
+  enumeration(Event, desc="TCC Events") {
+    // Requests coming from the Cores
+    RdBlk, desc="RdBlk event";
+    WrVicBlk, desc="L1 Write Through";
+    WrVicBlkBack, desc="L1 Write Back(dirty cache)";
+    Atomic, desc="Atomic Op";
+    AtomicDone, desc="AtomicOps Complete";
+    AtomicNotDone, desc="AtomicOps not Complete";
+    Data, desc="data message";
+    // Coming from this TCC
+    L2_Repl, desc="L2 Replacement";
+    // Probes
+    PrbInv, desc="Invalidating probe";
+    // Coming from Memory Controller
+    WBAck, desc="writethrough ack from memory";
+  }
+
+  // STATES
+  state_declaration(State, desc="TCC State", default="TCC_State_I") {
+    M, AccessPermission:Read_Write, desc="Modified(dirty cache only)";
+    W, AccessPermission:Read_Write, desc="Written(dirty cache only)";
+    V, AccessPermission:Read_Only, desc="Valid";
+    I, AccessPermission:Invalid, desc="Invalid";
+    IV, AccessPermission:Busy, desc="Waiting for Data";
+    WI, AccessPermission:Busy, desc="Waiting on Writethrough Ack";
+    A, AccessPermission:Busy, desc="Invalid waiting on atomic Data";
+  }
+
+  enumeration(RequestType, desc="To communicate stats from transitions to recordStats") {
+    DataArrayRead, desc="Read the data array";
+    DataArrayWrite, desc="Write the data array";
+    TagArrayRead, desc="Read the tag array";
+    TagArrayWrite, desc="Write the tag array";
+  }
+
+
+  // STRUCTURES
+
+  structure(Entry, desc="...", interface="AbstractCacheEntry") {
+    State CacheState, desc="cache state";
+    bool Dirty, desc="Is the data dirty (diff from memory?)";
+    DataBlock DataBlk, desc="Data for the block";
+    WriteMask writeMask, desc="Dirty byte mask";
+  }
+
+  structure(TBE, desc="...") {
+    State TBEState, desc="Transient state";
+    DataBlock DataBlk, desc="data for the block";
+    bool Dirty, desc="Is the data dirty?";
+    bool Shared, desc="Victim hit by shared probe";
+    MachineID From, desc="Waiting for writeback from...";
+    NetDest Destination, desc="Data destination";
+    int numAtomics, desc="number remaining atomics";
+  }
+
+  structure(TBETable, external="yes") {
+    TBE lookup(Addr);
+    void allocate(Addr);
+    void deallocate(Addr);
+    bool isPresent(Addr);
+  }
+
+  TBETable TBEs, template="<TCC_TBE>", constructor="m_number_of_TBEs";
+
+  void set_cache_entry(AbstractCacheEntry b);
+  void unset_cache_entry();
+  void set_tbe(TBE
b); + void unset_tbe(); + void wakeUpAllBuffers(); + void wakeUpBuffers(Addr a); + + + // FUNCTION DEFINITIONS + + Tick clockEdge(); + Tick cyclesToTicks(Cycles c); + + MachineID getPeer(MachineID mach) { + return createMachineID(MachineType:RegionBuffer, intToID(regionBufferNum)); + } + + Entry getCacheEntry(Addr addr), return_by_pointer="yes" { + return static_cast(Entry, "pointer", L2cache.lookup(addr)); + } + + DataBlock getDataBlock(Addr addr), return_by_ref="yes" { + return getCacheEntry(addr).DataBlk; + } + + bool presentOrAvail(Addr addr) { + return L2cache.isTagPresent(addr) || L2cache.cacheAvail(addr); + } + + State getState(TBE tbe, Entry cache_entry, Addr addr) { + if (is_valid(tbe)) { + return tbe.TBEState; + } else if (is_valid(cache_entry)) { + return cache_entry.CacheState; + } + return State:I; + } + + void setState(TBE tbe, Entry cache_entry, Addr addr, State state) { + if (is_valid(tbe)) { + tbe.TBEState := state; + } + + if (is_valid(cache_entry)) { + cache_entry.CacheState := state; + } + } + + void functionalRead(Addr addr, Packet *pkt) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + testAndRead(addr, tbe.DataBlk, pkt); + } else { + functionalMemoryRead(pkt); + } + } + + int functionalWrite(Addr addr, Packet *pkt) { + int num_functional_writes := 0; + + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + num_functional_writes := num_functional_writes + + testAndWrite(addr, tbe.DataBlk, pkt); + } + + num_functional_writes := num_functional_writes + + functionalMemoryWrite(pkt); + return num_functional_writes; + } + + AccessPermission getAccessPermission(Addr addr) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return TCC_State_to_permission(tbe.TBEState); + } + + Entry cache_entry := getCacheEntry(addr); + if(is_valid(cache_entry)) { + return TCC_State_to_permission(cache_entry.CacheState); + } + + return AccessPermission:NotPresent; + } + + void setAccessPermission(Entry cache_entry, Addr addr, State state) { + if (is_valid(cache_entry)) { + cache_entry.changePermission(TCC_State_to_permission(state)); + } + } + + void recordRequestType(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + L2cache.recordRequestType(CacheRequestType:DataArrayRead,addr); + } else if (request_type == RequestType:DataArrayWrite) { + L2cache.recordRequestType(CacheRequestType:DataArrayWrite,addr); + } else if (request_type == RequestType:TagArrayRead) { + L2cache.recordRequestType(CacheRequestType:TagArrayRead,addr); + } else if (request_type == RequestType:TagArrayWrite) { + L2cache.recordRequestType(CacheRequestType:TagArrayWrite,addr); + } + } + + bool checkResourceAvailable(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:DataArrayWrite) { + return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:TagArrayRead) { + return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:TagArrayWrite) { + return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else { + error("Invalid RequestType type in checkResourceAvailable"); + return true; + } + } + + + // ** OUT_PORTS ** + + // Three classes of ports + // Class 1: downward facing network links to NB + out_port(requestToNB_out, CPURequestMsg, requestToNB); + out_port(responseToNB_out, ResponseMsg, 
responseToNB); + out_port(unblockToNB_out, UnblockMsg, unblockToNB); + + // Class 2: upward facing ports to GPU cores + out_port(responseToCore_out, ResponseMsg, responseToCore); + + out_port(triggerQueue_out, TriggerMsg, triggerQueue); + // + // request queue going to NB + // + + +// ** IN_PORTS ** + in_port(triggerQueue_in, TiggerMsg, triggerQueue) { + if (triggerQueue_in.isReady(clockEdge())) { + peek(triggerQueue_in, TriggerMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + if (tbe.numAtomics == 0) { + trigger(Event:AtomicDone, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:AtomicNotDone, in_msg.addr, cache_entry, tbe); + } + } + } + } + + + + in_port(responseFromNB_in, ResponseMsg, responseFromNB) { + if (responseFromNB_in.isReady(clockEdge())) { + peek(responseFromNB_in, ResponseMsg, block_on="addr") { + TBE tbe := TBEs.lookup(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + if (in_msg.Type == CoherenceResponseType:NBSysResp) { + if(presentOrAvail(in_msg.addr)) { + trigger(Event:Data, in_msg.addr, cache_entry, tbe); + } else { + Addr victim := L2cache.cacheProbe(in_msg.addr); + trigger(Event:L2_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } else if (in_msg.Type == CoherenceResponseType:NBSysWBAck) { + trigger(Event:WBAck, in_msg.addr, cache_entry, tbe); + } else { + error("Unexpected Response Message to Core"); + } + } + } + } + + // Finally handling incoming requests (from TCP) and probes (from NB). + + in_port(probeNetwork_in, NBProbeRequestMsg, probeFromNB) { + if (probeNetwork_in.isReady(clockEdge())) { + peek(probeNetwork_in, NBProbeRequestMsg) { + DPRINTF(RubySlicc, "%s\n", in_msg); + DPRINTF(RubySlicc, "machineID: %s\n", machineID); + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe); + } + } + } + + + in_port(coreRequestNetwork_in, CPURequestMsg, requestFromTCP, rank=0) { + if (coreRequestNetwork_in.isReady(clockEdge())) { + peek(coreRequestNetwork_in, CPURequestMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + if (in_msg.Type == CoherenceRequestType:WriteThrough) { + if(WB) { + if(presentOrAvail(in_msg.addr)) { + trigger(Event:WrVicBlkBack, in_msg.addr, cache_entry, tbe); + } else { + Addr victim := L2cache.cacheProbe(in_msg.addr); + trigger(Event:L2_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } else { + trigger(Event:WrVicBlk, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.Type == CoherenceRequestType:Atomic) { + trigger(Event:Atomic, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:RdBlk) { + trigger(Event:RdBlk, in_msg.addr, cache_entry, tbe); + } else { + DPRINTF(RubySlicc, "%s\n", in_msg); + error("Unexpected Response Message to Core"); + } + } + } + } + // BEGIN ACTIONS + + action(i_invL2, "i", desc="invalidate TCC cache block") { + if (is_valid(cache_entry)) { + L2cache.deallocate(address); + } + unset_cache_entry(); + } + + // Data available at TCC. 
Send the DATA to TCP + action(sd_sendData, "sd", desc="send Shared response") { + peek(coreRequestNetwork_in, CPURequestMsg) { + enqueue(responseToCore_out, ResponseMsg, l2_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysResp; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := false; + out_msg.State := CoherenceState:Shared; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + + // Data was not available at TCC. So, TCC forwarded the request to + // directory and directory responded back with data. Now, forward the + // DATA to TCP and send the unblock ack back to directory. + action(sdr_sendDataResponse, "sdr", desc="send Shared response") { + enqueue(responseToCore_out, ResponseMsg, l2_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysResp; + out_msg.Sender := machineID; + out_msg.Destination := tbe.Destination; + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := false; + out_msg.State := CoherenceState:Shared; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + enqueue(unblockToNB_out, UnblockMsg, 1) { + out_msg.addr := address; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Unblock_Control; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + + action(rd_requestData, "r", desc="Miss in L2, pass on") { + if(tbe.Destination.count()==1){ + peek(coreRequestNetwork_in, CPURequestMsg) { + enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Type := in_msg.Type; + out_msg.Requestor := machineID; + out_msg.Destination.add(getPeer(machineID)); + out_msg.Shared := false; // unneeded for this request + out_msg.MessageSize := in_msg.MessageSize; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + } + + action(w_sendResponseWBAck, "w", desc="send WB Ack") { + peek(responseFromNB_in, ResponseMsg) { + enqueue(responseToCore_out, ResponseMsg, l2_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysWBAck; + out_msg.Destination.clear(); + out_msg.Destination.add(in_msg.WTRequestor); + out_msg.Sender := machineID; + out_msg.MessageSize := MessageSizeType:Writeback_Control; + } + } + } + + action(swb_sendWBAck, "swb", desc="send WB Ack") { + peek(coreRequestNetwork_in, CPURequestMsg) { + enqueue(responseToCore_out, ResponseMsg, l2_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysWBAck; + out_msg.Destination.clear(); + out_msg.Destination.add(in_msg.Requestor); + out_msg.Sender := machineID; + out_msg.MessageSize := MessageSizeType:Writeback_Control; + } + } + } + + action(ar_sendAtomicResponse, "ar", desc="send Atomic Ack") { + peek(responseFromNB_in, ResponseMsg) { + enqueue(responseToCore_out, ResponseMsg, l2_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysResp; + out_msg.Destination.add(in_msg.WTRequestor); + out_msg.Sender := machineID; + out_msg.MessageSize := in_msg.MessageSize; + out_msg.DataBlk := in_msg.DataBlk; + } + } + } + action(sd2rb_sendDone2RegionBuffer, "sd2rb", desc="Request finished, send done ack") { + enqueue(unblockToNB_out, UnblockMsg, 1) { + out_msg.addr := address; + out_msg.Destination.add(getPeer(machineID)); + out_msg.DoneAck := true; + out_msg.MessageSize := 
MessageSizeType:Unblock_Control; + if (is_valid(tbe)) { + out_msg.Dirty := tbe.Dirty; + } else { + out_msg.Dirty := false; + } + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(a_allocateBlock, "a", desc="allocate TCC block") { + if (is_invalid(cache_entry)) { + set_cache_entry(L2cache.allocate(address, new Entry)); + cache_entry.writeMask.clear(); + } + } + + action(t_allocateTBE, "t", desc="allocate TBE Entry") { + if (is_invalid(tbe)) { + check_allocate(TBEs); + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + tbe.Destination.clear(); + tbe.numAtomics := 0; + } + if (coreRequestNetwork_in.isReady(clockEdge())) { + peek(coreRequestNetwork_in, CPURequestMsg) { + if(in_msg.Type == CoherenceRequestType:RdBlk || in_msg.Type == CoherenceRequestType:Atomic){ + tbe.Destination.add(in_msg.Requestor); + } + } + } + } + + action(dt_deallocateTBE, "dt", desc="Deallocate TBE entry") { + tbe.Destination.clear(); + TBEs.deallocate(address); + unset_tbe(); + } + + action(wcb_writeCacheBlock, "wcb", desc="write data to TCC") { + peek(responseFromNB_in, ResponseMsg) { + cache_entry.DataBlk := in_msg.DataBlk; + DPRINTF(RubySlicc, "Writing to TCC: %s\n", in_msg); + } + } + + action(wdb_writeDirtyBytes, "wdb", desc="write data to TCC") { + peek(coreRequestNetwork_in, CPURequestMsg) { + cache_entry.DataBlk.copyPartial(in_msg.DataBlk,in_msg.writeMask); + cache_entry.writeMask.orMask(in_msg.writeMask); + DPRINTF(RubySlicc, "Writing to TCC: %s\n", in_msg); + } + } + + action(wt_writeThrough, "wt", desc="write through data") { + peek(coreRequestNetwork_in, CPURequestMsg) { + enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + out_msg.WTRequestor := in_msg.Requestor; + out_msg.Destination.add(getPeer(machineID)); + out_msg.MessageSize := MessageSizeType:Data; + out_msg.Type := CoherenceRequestType:WriteThrough; + out_msg.Dirty := true; + out_msg.DataBlk := in_msg.DataBlk; + out_msg.writeMask.orMask(in_msg.writeMask); + } + } + } + + action(wb_writeBack, "wb", desc="write back data") { + enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + out_msg.WTRequestor := machineID; + out_msg.Destination.add(getPeer(machineID)); + out_msg.MessageSize := MessageSizeType:Data; + out_msg.Type := CoherenceRequestType:WriteThrough; + out_msg.Dirty := true; + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.writeMask.orMask(cache_entry.writeMask); + } + } + + action(at_atomicThrough, "at", desc="write back data") { + peek(coreRequestNetwork_in, CPURequestMsg) { + enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + out_msg.WTRequestor := in_msg.Requestor; + out_msg.Destination.add(getPeer(machineID)); + out_msg.MessageSize := MessageSizeType:Data; + out_msg.Type := CoherenceRequestType:Atomic; + out_msg.Dirty := true; + out_msg.writeMask.orMask(in_msg.writeMask); + } + } + } + + action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") { + enqueue(responseToNB_out, ResponseMsg, 1) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // TCC, L3 respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.Dirty := false; + out_msg.Hit := false; + out_msg.Ntsl := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + 
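The write-through path above never pushes whole lines blindly: wdb_writeDirtyBytes merges only the bytes covered by the request's writeMask into the cached block via copyPartial and accumulates that mask with orMask, and wt_writeThrough/wb_writeBack forward the same mask so the next level can apply a partial update. Below is a small C++ sketch of that byte-mask merge, using std::array and std::bitset as stand-ins for DataBlock and WriteMask; it is illustrative only and not the gem5 implementation.

    #include <array>
    #include <bitset>
    #include <cstddef>
    #include <cstdint>

    constexpr std::size_t kLineBytes = 64;
    using Line = std::array<std::uint8_t, kLineBytes>;
    using ByteMask = std::bitset<kLineBytes>;

    // copyPartial: overwrite only the bytes selected by 'mask'.
    void
    copyPartial(Line &dst, const Line &src, const ByteMask &mask)
    {
        for (std::size_t i = 0; i < kLineBytes; ++i)
            if (mask.test(i))
                dst[i] = src[i];
    }

    // wdb_writeDirtyBytes in miniature: merge the written bytes and remember
    // which bytes are dirty (orMask) for the later write-through.
    void
    writeDirtyBytes(Line &cached, ByteMask &dirty,
                    const Line &reqData, const ByteMask &reqMask)
    {
        copyPartial(cached, reqData, reqMask);
        dirty |= reqMask;
    }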
action(ut_updateTag, "ut", desc="update Tag (i.e. set MRU)") { + L2cache.setMRU(address); + } + + action(p_popRequestQueue, "p", desc="pop request queue") { + coreRequestNetwork_in.dequeue(clockEdge()); + } + + action(pr_popResponseQueue, "pr", desc="pop response queue") { + responseFromNB_in.dequeue(clockEdge()); + } + + action(pp_popProbeQueue, "pp", desc="pop probe queue") { + probeNetwork_in.dequeue(clockEdge()); + } + action(zz_recycleRequestQueue, "z", desc="stall"){ + coreRequestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + + action(ina_incrementNumAtomics, "ina", desc="inc num atomics") { + tbe.numAtomics := tbe.numAtomics + 1; + } + + + action(dna_decrementNumAtomics, "dna", desc="dec num atomics") { + tbe.numAtomics := tbe.numAtomics - 1; + if (tbe.numAtomics==0) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:AtomicDone; + } + } + } + + action(ptr_popTriggerQueue, "ptr", desc="pop Trigger") { + triggerQueue_in.dequeue(clockEdge()); + } + + // END ACTIONS + + // BEGIN TRANSITIONS + // transitions from base + // Assumptions for ArrayRead/Write + // TBE checked before tags + // Data Read/Write requires Tag Read + + transition(WI, {RdBlk, WrVicBlk, Atomic, WrVicBlkBack}) {TagArrayRead} { + zz_recycleRequestQueue; + } + transition(A, {RdBlk, WrVicBlk, WrVicBlkBack}) {TagArrayRead} { + zz_recycleRequestQueue; + } + transition(IV, {WrVicBlk, Atomic, WrVicBlkBack}) {TagArrayRead} { + zz_recycleRequestQueue; + } + transition({M, V}, RdBlk) {TagArrayRead, DataArrayRead} { + sd_sendData; + ut_updateTag; + p_popRequestQueue; + } + transition(W, RdBlk, WI) {TagArrayRead, DataArrayRead} { + t_allocateTBE; + wb_writeBack; + } + + transition(I, RdBlk, IV) {TagArrayRead} { + t_allocateTBE; + rd_requestData; + p_popRequestQueue; + } + + transition(IV, RdBlk) { + t_allocateTBE; + rd_requestData; + p_popRequestQueue; + } + + transition({V, I},Atomic, A) {TagArrayRead} { + i_invL2; + t_allocateTBE; + at_atomicThrough; + ina_incrementNumAtomics; + p_popRequestQueue; + } + + transition(A, Atomic) { + at_atomicThrough; + ina_incrementNumAtomics; + p_popRequestQueue; + } + + transition({M, W}, Atomic, WI) {TagArrayRead} { + t_allocateTBE; + wb_writeBack; + } + + // Cahceblock stays in I state which implies + // this TCC is a write-no-allocate cache + transition(I, WrVicBlk) {TagArrayRead} { + wt_writeThrough; + p_popRequestQueue; + } + + transition(V, WrVicBlk) {TagArrayRead, DataArrayWrite} { + ut_updateTag; + wdb_writeDirtyBytes; + wt_writeThrough; + p_popRequestQueue; + } + + transition({V, M}, WrVicBlkBack, M) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + ut_updateTag; + swb_sendWBAck; + wdb_writeDirtyBytes; + p_popRequestQueue; + } + + transition(W, WrVicBlkBack) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + ut_updateTag; + swb_sendWBAck; + wdb_writeDirtyBytes; + p_popRequestQueue; + } + + transition(I, WrVicBlkBack, W) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + a_allocateBlock; + ut_updateTag; + swb_sendWBAck; + wdb_writeDirtyBytes; + p_popRequestQueue; + } + + transition({W, M}, L2_Repl, WI) {TagArrayRead, DataArrayRead} { + t_allocateTBE; + wb_writeBack; + i_invL2; + } + + transition({I, V}, L2_Repl, I) {TagArrayRead, TagArrayWrite} { + i_invL2; + } + + transition({A, IV, WI}, L2_Repl) { + i_invL2; + } + + transition({I, V}, PrbInv, I) {TagArrayRead, TagArrayWrite} { + pi_sendProbeResponseInv; + pp_popProbeQueue; + } + + transition(M, PrbInv, W) {TagArrayRead, TagArrayWrite} { + 
pi_sendProbeResponseInv; + pp_popProbeQueue; + } + + transition(W, PrbInv) {TagArrayRead} { + pi_sendProbeResponseInv; + pp_popProbeQueue; + } + + transition({A, IV, WI}, PrbInv) { + pi_sendProbeResponseInv; + pp_popProbeQueue; + } + + transition(IV, Data, V) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + a_allocateBlock; + ut_updateTag; + wcb_writeCacheBlock; + sdr_sendDataResponse; + sd2rb_sendDone2RegionBuffer; + pr_popResponseQueue; + dt_deallocateTBE; + } + + transition(A, Data) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + a_allocateBlock; + ar_sendAtomicResponse; + sd2rb_sendDone2RegionBuffer; + dna_decrementNumAtomics; + pr_popResponseQueue; + } + + transition(A, AtomicDone, I) {TagArrayRead, TagArrayWrite} { + dt_deallocateTBE; + ptr_popTriggerQueue; + } + + transition(A, AtomicNotDone) {TagArrayRead} { + ptr_popTriggerQueue; + } + + //M,W should not see WBAck as the cache is in WB mode + //WBAcks do not need to check tags + transition({I, V, IV, A}, WBAck) { + w_sendResponseWBAck; + sd2rb_sendDone2RegionBuffer; + pr_popResponseQueue; + } + + transition(WI, WBAck,I) { + sd2rb_sendDone2RegionBuffer; + dt_deallocateTBE; + pr_popResponseQueue; + } +} diff --git a/src/mem/protocol/GPU_VIPER_Region.slicc b/src/mem/protocol/GPU_VIPER_Region.slicc new file mode 100644 index 000000000..cbfef9de3 --- /dev/null +++ b/src/mem/protocol/GPU_VIPER_Region.slicc @@ -0,0 +1,11 @@ +protocol "GPU_VIPER_Region"; +include "RubySlicc_interfaces.slicc"; +include "MOESI_AMD_Base-msg.sm"; +include "MOESI_AMD_Base-Region-CorePair.sm"; +include "MOESI_AMD_Base-L3cache.sm"; +include "MOESI_AMD_Base-Region-dir.sm"; +include "GPU_VIPER_Region-TCC.sm"; +include "GPU_VIPER-TCP.sm"; +include "GPU_VIPER-SQC.sm"; +include "MOESI_AMD_Base-RegionDir.sm"; +include "MOESI_AMD_Base-RegionBuffer.sm"; diff --git a/src/mem/protocol/MOESI_AMD_Base-CorePair.sm b/src/mem/protocol/MOESI_AMD_Base-CorePair.sm new file mode 100644 index 000000000..76fe77230 --- /dev/null +++ b/src/mem/protocol/MOESI_AMD_Base-CorePair.sm @@ -0,0 +1,2904 @@ +/* + * Copyright (c) 2010-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Lisa Hsu + */ + +machine(MachineType:CorePair, "CP-like Core Coherence") + : Sequencer * sequencer; + Sequencer * sequencer1; + CacheMemory * L1Icache; + CacheMemory * L1D0cache; + CacheMemory * L1D1cache; + CacheMemory * L2cache; // func mem logic looks in this CacheMemory + bool send_evictions := "False"; + Cycles issue_latency := 5; // time to send data down to NB + Cycles l2_hit_latency := 18; + + // BEGIN Core Buffers + + // To the Network + MessageBuffer * requestFromCore, network="To", virtual_network="0", vnet_type="request"; + MessageBuffer * responseFromCore, network="To", virtual_network="2", vnet_type="response"; + MessageBuffer * unblockFromCore, network="To", virtual_network="4", vnet_type="unblock"; + + // From the Network + MessageBuffer * probeToCore, network="From", virtual_network="0", vnet_type="request"; + MessageBuffer * responseToCore, network="From", virtual_network="2", vnet_type="response"; + + MessageBuffer * mandatoryQueue; + + MessageBuffer * triggerQueue, ordered="true"; + + // END Core Buffers + +{ + // BEGIN STATES + state_declaration(State, desc="Cache states", default="CorePair_State_I") { + + // Base States + I, AccessPermission:Invalid, desc="Invalid"; + S, AccessPermission:Read_Only, desc="Shared"; + E0, AccessPermission:Read_Write, desc="Exclusive with Cluster 0 ownership"; + E1, AccessPermission:Read_Write, desc="Exclusive with Cluster 1 ownership"; + Es, AccessPermission:Read_Write, desc="Exclusive in core"; + O, AccessPermission:Read_Only, desc="Owner state in core, both clusters and other cores may be sharing line"; + Ms, AccessPermission:Read_Write, desc="Modified in core, both clusters may be sharing line"; + M0, AccessPermission:Read_Write, desc="Modified with cluster ownership"; + M1, AccessPermission:Read_Write, desc="Modified with cluster ownership"; + + // Transient States + I_M0, AccessPermission:Busy, desc="Invalid, issued RdBlkM, have not seen response yet"; + I_M1, AccessPermission:Busy, desc="Invalid, issued RdBlkM, have not seen response yet"; + I_M0M1, AccessPermission:Busy, desc="Was in I_M0, got a store request from other cluster as well"; + I_M1M0, AccessPermission:Busy, desc="Was in I_M1, got a store request from other cluster as well"; + I_M0Ms, AccessPermission:Busy, desc="Was in I_M0, got a load request from other cluster as well"; + I_M1Ms, AccessPermission:Busy, desc="Was in I_M1, got a load request from other cluster as well"; + I_E0S, AccessPermission:Busy, desc="Invalid, issued RdBlk, have not seen response yet"; + I_E1S, AccessPermission:Busy, desc="Invalid, issued RdBlk, have not seen response yet"; + I_ES, AccessPermission:Busy, desc="S_F got hit by invalidating probe, RdBlk response needs to go to both clusters"; + + IF_E0S, AccessPermission:Busy, desc="something got hit with Probe Invalidate, now just I_E0S but expecting a L2_to_L1D0 trigger, just drop when receive"; + IF_E1S, AccessPermission:Busy, desc="something got hit with Probe Invalidate, now just I_E1S but 
expecting a L2_to_L1D1 trigger, just drop when receive"; + IF_ES, AccessPermission:Busy, desc="same, but waiting for two fills"; + IF0_ES, AccessPermission:Busy, desc="same, but waiting for two fills, got one"; + IF1_ES, AccessPermission:Busy, desc="same, but waiting for two fills, got one"; + F_S0, AccessPermission:Busy, desc="same, but going to S0 when trigger received"; + F_S1, AccessPermission:Busy, desc="same, but going to S1 when trigger received"; + + ES_I, AccessPermission:Read_Only, desc="L2 replacement, waiting for clean writeback ack"; + MO_I, AccessPermission:Read_Only, desc="L2 replacement, waiting for dirty writeback ack"; + MO_S0, AccessPermission:Read_Only, desc="M/O got Ifetch Miss, must write back first, then send RdBlkS"; + MO_S1, AccessPermission:Read_Only, desc="M/O got Ifetch Miss, must write back first, then send RdBlkS"; + S_F0, AccessPermission:Read_Only, desc="Shared, filling L1"; + S_F1, AccessPermission:Read_Only, desc="Shared, filling L1"; + S_F, AccessPermission:Read_Only, desc="Shared, filling L1"; + O_F0, AccessPermission:Read_Only, desc="Owned, filling L1"; + O_F1, AccessPermission:Read_Only, desc="Owned, filling L1"; + O_F, AccessPermission:Read_Only, desc="Owned, filling L1"; + Si_F0, AccessPermission:Read_Only, desc="Shared, filling icache"; + Si_F1, AccessPermission:Read_Only, desc="Shared, filling icache"; + S_M0, AccessPermission:Read_Only, desc="Shared, issued CtoD, have not seen response yet"; + S_M1, AccessPermission:Read_Only, desc="Shared, issued CtoD, have not seen response yet"; + O_M0, AccessPermission:Read_Only, desc="Owned, issued CtoD, have not seen response yet"; + O_M1, AccessPermission:Read_Only, desc="Owned, issued CtoD, have not seen response yet"; + S0, AccessPermission:Busy, desc="RdBlkS on behalf of cluster 0, waiting for response"; + S1, AccessPermission:Busy, desc="RdBlkS on behalf of cluster 1, waiting for response"; + + Es_F0, AccessPermission:Read_Write, desc="Es, Cluster read, filling"; + Es_F1, AccessPermission:Read_Write, desc="Es, Cluster read, filling"; + Es_F, AccessPermission:Read_Write, desc="Es, other cluster read, filling"; + E0_F, AccessPermission:Read_Write, desc="E0, cluster read, filling"; + E1_F, AccessPermission:Read_Write, desc="..."; + E0_Es, AccessPermission:Read_Write, desc="..."; + E1_Es, AccessPermission:Read_Write, desc="..."; + Ms_F0, AccessPermission:Read_Write, desc="..."; + Ms_F1, AccessPermission:Read_Write, desc="..."; + Ms_F, AccessPermission:Read_Write, desc="..."; + M0_F, AccessPermission:Read_Write, desc="..."; + M0_Ms, AccessPermission:Read_Write, desc="..."; + M1_F, AccessPermission:Read_Write, desc="..."; + M1_Ms, AccessPermission:Read_Write, desc="..."; + + I_C, AccessPermission:Invalid, desc="Invalid, but waiting for WBAck from NB from canceled writeback"; + S0_C, AccessPermission:Busy, desc="MO_S0 hit by invalidating probe, waiting for WBAck from NB for canceled WB"; + S1_C, AccessPermission:Busy, desc="MO_S1 hit by invalidating probe, waiting for WBAck from NB for canceled WB"; + S_C, AccessPermission:Busy, desc="S*_C got NB_AckS, still waiting for WBAck"; + + } // END STATES + + // BEGIN EVENTS + enumeration(Event, desc="CP Events") { + // CP Initiated events + C0_Load_L1miss, desc="Cluster 0 load, L1 missed"; + C0_Load_L1hit, desc="Cluster 0 load, L1 hit"; + C1_Load_L1miss, desc="Cluster 1 load, L1 missed"; + C1_Load_L1hit, desc="Cluster 1 load, L1 hit"; + Ifetch0_L1hit, desc="Instruction fetch, hit in the L1"; + Ifetch1_L1hit, desc="Instruction fetch, hit in the L1"; + Ifetch0_L1miss,
desc="Instruction fetch, missed in the L1"; + Ifetch1_L1miss, desc="Instruction fetch, missed in the L1"; + C0_Store_L1miss, desc="Cluster 0 store missed in L1"; + C0_Store_L1hit, desc="Cluster 0 store hit in L1"; + C1_Store_L1miss, desc="Cluster 1 store missed in L1"; + C1_Store_L1hit, desc="Cluster 1 store hit in L1"; + // NB Initiated events + NB_AckS, desc="NB Ack to Core Request"; + NB_AckM, desc="NB Ack to Core Request"; + NB_AckE, desc="NB Ack to Core Request"; + + NB_AckWB, desc="NB Ack for writeback"; + + // Memory System initiatied events + L1I_Repl, desc="Replace address from L1I"; // Presumed clean + L1D0_Repl, desc="Replace address from L1D0"; // Presumed clean + L1D1_Repl, desc="Replace address from L1D1"; // Presumed clean + L2_Repl, desc="Replace address from L2"; + + L2_to_L1D0, desc="L1 fill from L2"; + L2_to_L1D1, desc="L1 fill from L2"; + L2_to_L1I, desc="L1 fill from L2"; + + // Probe Events + PrbInvData, desc="probe, return O or M data"; + PrbInv, desc="probe, no need for data"; + PrbShrData, desc="probe downgrade, return O or M data"; + + } // END EVENTS + + enumeration(RequestType, desc="To communicate stats from transitions to recordStats") { + L1D0DataArrayRead, desc="Read the data array"; + L1D0DataArrayWrite, desc="Write the data array"; + L1D0TagArrayRead, desc="Read the data array"; + L1D0TagArrayWrite, desc="Write the data array"; + L1D1DataArrayRead, desc="Read the data array"; + L1D1DataArrayWrite, desc="Write the data array"; + L1D1TagArrayRead, desc="Read the data array"; + L1D1TagArrayWrite, desc="Write the data array"; + L1IDataArrayRead, desc="Read the data array"; + L1IDataArrayWrite, desc="Write the data array"; + L1ITagArrayRead, desc="Read the data array"; + L1ITagArrayWrite, desc="Write the data array"; + L2DataArrayRead, desc="Read the data array"; + L2DataArrayWrite, desc="Write the data array"; + L2TagArrayRead, desc="Read the data array"; + L2TagArrayWrite, desc="Write the data array"; + } + + + // BEGIN STRUCTURE DEFINITIONS + + + // Cache Entry + structure(Entry, desc="...", interface="AbstractCacheEntry") { + State CacheState, desc="cache state"; + bool Dirty, desc="Is the data dirty (diff than memory)?"; + DataBlock DataBlk, desc="data for the block"; + bool FromL2, default="false", desc="block just moved from L2"; + } + + structure(TBE, desc="...") { + State TBEState, desc="Transient state"; + DataBlock DataBlk, desc="data for the block, required for concurrent writebacks"; + bool Dirty, desc="Is the data dirty (different than memory)?"; + int NumPendingMsgs, desc="Number of acks/data messages that this processor is waiting for"; + bool Shared, desc="Victim hit by shared probe"; + } + + structure(TBETable, external="yes") { + TBE lookup(Addr); + void allocate(Addr); + void deallocate(Addr); + bool isPresent(Addr); + } + + TBETable TBEs, template="<CorePair_TBE>", constructor="m_number_of_TBEs"; + + void set_cache_entry(AbstractCacheEntry b); + void unset_cache_entry(); + void set_tbe(TBE b); + void unset_tbe(); + void wakeUpAllBuffers(); + void wakeUpBuffers(Addr a); + Cycles curCycle(); + + // END STRUCTURE DEFINITIONS + + // BEGIN INTERNAL FUNCTIONS + + Tick clockEdge(); + Tick cyclesToTicks(Cycles c); + + bool addressInCore(Addr addr) { + return (L2cache.isTagPresent(addr) || L1Icache.isTagPresent(addr) || L1D0cache.isTagPresent(addr) || L1D1cache.isTagPresent(addr)); + } + + Entry getCacheEntry(Addr address), return_by_pointer="yes" { + Entry L2cache_entry := static_cast(Entry, "pointer", L2cache.lookup(address)); + return 
L2cache_entry; + } + + DataBlock getDataBlock(Addr addr), return_by_ref="yes" { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return tbe.DataBlk; + } else { + return getCacheEntry(addr).DataBlk; + } + } + + Entry getL1CacheEntry(Addr addr, int cluster), return_by_pointer="yes" { + if (cluster == 0) { + Entry L1D0_entry := static_cast(Entry, "pointer", L1D0cache.lookup(addr)); + return L1D0_entry; + } else { + Entry L1D1_entry := static_cast(Entry, "pointer", L1D1cache.lookup(addr)); + return L1D1_entry; + } + } + + Entry getICacheEntry(Addr addr), return_by_pointer="yes" { + Entry c_entry := static_cast(Entry, "pointer", L1Icache.lookup(addr)); + return c_entry; + } + + bool presentOrAvail2(Addr addr) { + return L2cache.isTagPresent(addr) || L2cache.cacheAvail(addr); + } + + bool presentOrAvailI(Addr addr) { + return L1Icache.isTagPresent(addr) || L1Icache.cacheAvail(addr); + } + + bool presentOrAvailD0(Addr addr) { + return L1D0cache.isTagPresent(addr) || L1D0cache.cacheAvail(addr); + } + + bool presentOrAvailD1(Addr addr) { + return L1D1cache.isTagPresent(addr) || L1D1cache.cacheAvail(addr); + } + + State getState(TBE tbe, Entry cache_entry, Addr addr) { + if(is_valid(tbe)) { + return tbe.TBEState; + } else if (is_valid(cache_entry)) { + return cache_entry.CacheState; + } + return State:I; + } + + void setState(TBE tbe, Entry cache_entry, Addr addr, State state) { + if (is_valid(tbe)) { + tbe.TBEState := state; + } + + if (is_valid(cache_entry)) { + cache_entry.CacheState := state; + } + } + + AccessPermission getAccessPermission(Addr addr) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return CorePair_State_to_permission(tbe.TBEState); + } + + Entry cache_entry := getCacheEntry(addr); + if(is_valid(cache_entry)) { + return CorePair_State_to_permission(cache_entry.CacheState); + } + + return AccessPermission:NotPresent; + } + + void functionalRead(Addr addr, Packet *pkt) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + testAndRead(addr, tbe.DataBlk, pkt); + } else { + functionalMemoryRead(pkt); + } + } + + int functionalWrite(Addr addr, Packet *pkt) { + int num_functional_writes := 0; + + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + num_functional_writes := num_functional_writes + + testAndWrite(addr, tbe.DataBlk, pkt); + } + + num_functional_writes := num_functional_writes + functionalMemoryWrite(pkt); + return num_functional_writes; + } + + void setAccessPermission(Entry cache_entry, Addr addr, State state) { + if (is_valid(cache_entry)) { + cache_entry.changePermission(CorePair_State_to_permission(state)); + } + } + + MachineType testAndClearLocalHit(Entry cache_entry) { + assert(is_valid(cache_entry)); + if (cache_entry.FromL2) { + cache_entry.FromL2 := false; + return MachineType:L2Cache; + } else { + return MachineType:L1Cache; + } + } + + void recordRequestType(RequestType request_type, Addr addr) { + if (request_type == RequestType:L1D0DataArrayRead) { + L1D0cache.recordRequestType(CacheRequestType:DataArrayRead, addr); + } else if (request_type == RequestType:L1D0DataArrayWrite) { + L1D0cache.recordRequestType(CacheRequestType:DataArrayWrite, addr); + } else if (request_type == RequestType:L1D0TagArrayRead) { + L1D0cache.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:L1D0TagArrayWrite) { + L1D0cache.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } else if (request_type == RequestType:L1D1DataArrayRead) { + L1D1cache.recordRequestType(CacheRequestType:DataArrayRead, addr); + } 
else if (request_type == RequestType:L1D1DataArrayWrite) { + L1D1cache.recordRequestType(CacheRequestType:DataArrayWrite, addr); + } else if (request_type == RequestType:L1D1TagArrayRead) { + L1D1cache.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:L1D1TagArrayWrite) { + L1D1cache.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } else if (request_type == RequestType:L1IDataArrayRead) { + L1Icache.recordRequestType(CacheRequestType:DataArrayRead, addr); + } else if (request_type == RequestType:L1IDataArrayWrite) { + L1Icache.recordRequestType(CacheRequestType:DataArrayWrite, addr); + } else if (request_type == RequestType:L1ITagArrayRead) { + L1Icache.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:L1ITagArrayWrite) { + L1Icache.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } else if (request_type == RequestType:L2DataArrayRead) { + L2cache.recordRequestType(CacheRequestType:DataArrayRead, addr); + } else if (request_type == RequestType:L2DataArrayWrite) { + L2cache.recordRequestType(CacheRequestType:DataArrayWrite, addr); + } else if (request_type == RequestType:L2TagArrayRead) { + L2cache.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:L2TagArrayWrite) { + L2cache.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } + } + + bool checkResourceAvailable(RequestType request_type, Addr addr) { + if (request_type == RequestType:L2DataArrayRead) { + return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L2DataArrayWrite) { + return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L2TagArrayRead) { + return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:L2TagArrayWrite) { + return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:L1D0DataArrayRead) { + return L1D0cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L1D0DataArrayWrite) { + return L1D0cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L1D0TagArrayRead) { + return L1D0cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:L1D0TagArrayWrite) { + return L1D0cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:L1D1DataArrayRead) { + return L1D1cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L1D1DataArrayWrite) { + return L1D1cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L1D1TagArrayRead) { + return L1D1cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:L1D1TagArrayWrite) { + return L1D1cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:L1IDataArrayRead) { + return L1Icache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L1IDataArrayWrite) { + return L1Icache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L1ITagArrayRead) { + return L1Icache.checkResourceAvailable(CacheResourceType:TagArray, 
addr); + } else if (request_type == RequestType:L1ITagArrayWrite) { + return L1Icache.checkResourceAvailable(CacheResourceType:TagArray, addr); + + } else { + return true; + } + } + + // END INTERNAL FUNCTIONS + + // ** OUT_PORTS ** + + out_port(requestNetwork_out, CPURequestMsg, requestFromCore); + out_port(responseNetwork_out, ResponseMsg, responseFromCore); + out_port(triggerQueue_out, TriggerMsg, triggerQueue); + out_port(unblockNetwork_out, UnblockMsg, unblockFromCore); + + // ** IN_PORTS ** + + in_port(triggerQueue_in, TriggerMsg, triggerQueue, block_on="addr") { + if (triggerQueue_in.isReady(clockEdge())) { + peek(triggerQueue_in, TriggerMsg) { + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + + if (in_msg.Type == TriggerType:L2_to_L1) { + if (in_msg.Dest == CacheId:L1I) { + trigger(Event:L2_to_L1I, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Dest == CacheId:L1D0) { + trigger(Event:L2_to_L1D0, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Dest == CacheId:L1D1) { + trigger(Event:L2_to_L1D1, in_msg.addr, cache_entry, tbe); + } else { + error("unexpected trigger dest"); + } + } + } + } + } + + + in_port(probeNetwork_in, NBProbeRequestMsg, probeToCore) { + if (probeNetwork_in.isReady(clockEdge())) { + peek(probeNetwork_in, NBProbeRequestMsg, block_on="addr") { + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + + if (in_msg.Type == ProbeRequestType:PrbInv) { + if (in_msg.ReturnData) { + trigger(Event:PrbInvData, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.Type == ProbeRequestType:PrbDowngrade) { + assert(in_msg.ReturnData); + trigger(Event:PrbShrData, in_msg.addr, cache_entry, tbe); + } + } + } + } + + + // ResponseNetwork + in_port(responseToCore_in, ResponseMsg, responseToCore) { + if (responseToCore_in.isReady(clockEdge())) { + peek(responseToCore_in, ResponseMsg, block_on="addr") { + + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + + if (in_msg.Type == CoherenceResponseType:NBSysResp) { + if (in_msg.State == CoherenceState:Modified) { + trigger(Event:NB_AckM, in_msg.addr, cache_entry, tbe); + } else if (in_msg.State == CoherenceState:Shared) { + trigger(Event:NB_AckS, in_msg.addr, cache_entry, tbe); + } else if (in_msg.State == CoherenceState:Exclusive) { + trigger(Event:NB_AckE, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.Type == CoherenceResponseType:NBSysWBAck) { + trigger(Event:NB_AckWB, in_msg.addr, cache_entry, tbe); + } else { + error("Unexpected Response Message to Core"); + } + } + } + } + + // Nothing from the Unblock Network + + // Mandatory Queue + in_port(mandatoryQueue_in, RubyRequest, mandatoryQueue, desc="...") { + if (mandatoryQueue_in.isReady(clockEdge())) { + peek(mandatoryQueue_in, RubyRequest, block_on="LineAddress") { + + Entry cache_entry := getCacheEntry(in_msg.LineAddress); + TBE tbe := TBEs.lookup(in_msg.LineAddress); + + if (in_msg.Type == RubyRequestType:IFETCH) { + // FETCH ACCESS + + if (L1Icache.isTagPresent(in_msg.LineAddress)) { + if (mod(in_msg.contextId, 2) == 0) { + trigger(Event:Ifetch0_L1hit, in_msg.LineAddress, cache_entry, tbe); + } else { + trigger(Event:Ifetch1_L1hit, in_msg.LineAddress, cache_entry, tbe); + } + } else { + if (presentOrAvail2(in_msg.LineAddress)) { + if (presentOrAvailI(in_msg.LineAddress)) { + if (mod(in_msg.contextId, 2) == 0) { + trigger(Event:Ifetch0_L1miss, in_msg.LineAddress, cache_entry, + 
tbe); + } else { + trigger(Event:Ifetch1_L1miss, in_msg.LineAddress, cache_entry, + tbe); + } + } else { + Addr victim := L1Icache.cacheProbe(in_msg.LineAddress); + trigger(Event:L1I_Repl, victim, + getCacheEntry(victim), TBEs.lookup(victim)); + } + } else { // Not present or avail in L2 + Addr victim := L2cache.cacheProbe(in_msg.LineAddress); + trigger(Event:L2_Repl, victim, getCacheEntry(victim), + TBEs.lookup(victim)); + } + } + } else { + // DATA ACCESS + if (mod(in_msg.contextId, 2) == 1) { + if (L1D1cache.isTagPresent(in_msg.LineAddress)) { + if (in_msg.Type == RubyRequestType:LD) { + trigger(Event:C1_Load_L1hit, in_msg.LineAddress, cache_entry, + tbe); + } else { + // Stores must write through, make sure L2 avail. + if (presentOrAvail2(in_msg.LineAddress)) { + trigger(Event:C1_Store_L1hit, in_msg.LineAddress, cache_entry, + tbe); + } else { + Addr victim := L2cache.cacheProbe(in_msg.LineAddress); + trigger(Event:L2_Repl, victim, getCacheEntry(victim), + TBEs.lookup(victim)); + } + } + } else { + if (presentOrAvail2(in_msg.LineAddress)) { + if (presentOrAvailD1(in_msg.LineAddress)) { + if (in_msg.Type == RubyRequestType:LD) { + trigger(Event:C1_Load_L1miss, in_msg.LineAddress, + cache_entry, tbe); + } else { + trigger(Event:C1_Store_L1miss, in_msg.LineAddress, + cache_entry, tbe); + } + } else { + Addr victim := L1D1cache.cacheProbe(in_msg.LineAddress); + trigger(Event:L1D1_Repl, victim, + getCacheEntry(victim), TBEs.lookup(victim)); + } + } else { // not present or avail in L2 + Addr victim := L2cache.cacheProbe(in_msg.LineAddress); + trigger(Event:L2_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } + } else { + Entry L1D0cache_entry := getL1CacheEntry(in_msg.LineAddress, 0); + if (is_valid(L1D0cache_entry)) { + if (in_msg.Type == RubyRequestType:LD) { + trigger(Event:C0_Load_L1hit, in_msg.LineAddress, cache_entry, + tbe); + } else { + if (presentOrAvail2(in_msg.LineAddress)) { + trigger(Event:C0_Store_L1hit, in_msg.LineAddress, cache_entry, + tbe); + } else { + Addr victim := L2cache.cacheProbe(in_msg.LineAddress); + trigger(Event:L2_Repl, victim, getCacheEntry(victim), + TBEs.lookup(victim)); + } + } + } else { + if (presentOrAvail2(in_msg.LineAddress)) { + if (presentOrAvailD0(in_msg.LineAddress)) { + if (in_msg.Type == RubyRequestType:LD) { + trigger(Event:C0_Load_L1miss, in_msg.LineAddress, + cache_entry, tbe); + } else { + trigger(Event:C0_Store_L1miss, in_msg.LineAddress, + cache_entry, tbe); + } + } else { + Addr victim := L1D0cache.cacheProbe(in_msg.LineAddress); + trigger(Event:L1D0_Repl, victim, getCacheEntry(victim), + TBEs.lookup(victim)); + } + } else { + Addr victim := L2cache.cacheProbe(in_msg.LineAddress); + trigger(Event:L2_Repl, victim, getCacheEntry(victim), + TBEs.lookup(victim)); + } + } + } + } + } + } + } + + + // ACTIONS + action(ii_invIcache, "ii", desc="invalidate iCache") { + if (L1Icache.isTagPresent(address)) { + L1Icache.deallocate(address); + } + } + + action(i0_invCluster, "i0", desc="invalidate cluster 0") { + if (L1D0cache.isTagPresent(address)) { + L1D0cache.deallocate(address); + } + } + + action(i1_invCluster, "i1", desc="invalidate cluster 1") { + if (L1D1cache.isTagPresent(address)) { + L1D1cache.deallocate(address); + } + } + + action(ib_invBothClusters, "ib", desc="invalidate both clusters") { + if (L1D0cache.isTagPresent(address)) { + L1D0cache.deallocate(address); + } + if (L1D1cache.isTagPresent(address)) { + L1D1cache.deallocate(address); + } + } + + action(i2_invL2, "i2", desc="invalidate L2") { + 
if(is_valid(cache_entry)) { + L2cache.deallocate(address); + } + unset_cache_entry(); + } + + action(mru_setMRU, "mru", desc="Update LRU state") { + L2cache.setMRU(address); + } + + action(mruD1_setD1cacheMRU, "mruD1", desc="Update LRU state") { + L1D1cache.setMRU(address); + } + + action(mruD0_setD0cacheMRU, "mruD0", desc="Update LRU state") { + L1D0cache.setMRU(address); + } + + action(mruI_setIcacheMRU, "mruI", desc="Update LRU state") { + L1Icache.setMRU(address); + } + + action(n_issueRdBlk, "n", desc="Issue RdBlk") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:RdBlk; + out_msg.Requestor := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + DPRINTF(RubySlicc,"%s\n",out_msg.Destination); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := curCycle(); + } + } + + action(nM_issueRdBlkM, "nM", desc="Issue RdBlkM") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:RdBlkM; + out_msg.Requestor := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := curCycle(); + } + } + + action(nS_issueRdBlkS, "nS", desc="Issue RdBlkS") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:RdBlkS; + out_msg.Requestor := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := curCycle(); + } + } + + action(vd_victim, "vd", desc="Victimize M/O L2 Data") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + assert(is_valid(cache_entry)); + out_msg.DataBlk := cache_entry.DataBlk; + assert(cache_entry.Dirty); + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.Type := CoherenceRequestType:VicDirty; + out_msg.InitialRequestTime := curCycle(); + if (cache_entry.CacheState == State:O) { + out_msg.Shared := true; + } else { + out_msg.Shared := false; + } + } + } + + action(vc_victim, "vc", desc="Victimize E/S L2 Data") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.Type := CoherenceRequestType:VicClean; + out_msg.InitialRequestTime := curCycle(); + if (cache_entry.CacheState == State:S) { + out_msg.Shared := true; + } else { + out_msg.Shared := false; + } + } + } + + action(a0_allocateL1D, "a0", desc="Allocate L1D0 Block") { + if (L1D0cache.isTagPresent(address) == false) { + L1D0cache.allocateVoid(address, new Entry); + } + } + + action(a1_allocateL1D, "a1", desc="Allocate L1D1 Block") { + if (L1D1cache.isTagPresent(address) == false) { + L1D1cache.allocateVoid(address, new Entry); + } + } + + action(ai_allocateL1I, "ai", desc="Allocate L1I Block") { + if (L1Icache.isTagPresent(address) == false) { + L1Icache.allocateVoid(address, new Entry); + } + } + + action(a2_allocateL2, "a2", desc="Allocate L2 Block") { + if (is_invalid(cache_entry)) { + set_cache_entry(L2cache.allocate(address, new Entry)); + } + } + + action(t_allocateTBE, "t", desc="allocate TBE 
Entry") { + check_allocate(TBEs); + assert(is_valid(cache_entry)); + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + tbe.DataBlk := cache_entry.DataBlk; // Data only used for WBs + tbe.Dirty := cache_entry.Dirty; + tbe.Shared := false; + } + + action(d_deallocateTBE, "d", desc="Deallocate TBE") { + TBEs.deallocate(address); + unset_tbe(); + } + + action(p_popMandatoryQueue, "pm", desc="Pop Mandatory Queue") { + mandatoryQueue_in.dequeue(clockEdge()); + } + + action(pr_popResponseQueue, "pr", desc="Pop Response Queue") { + responseToCore_in.dequeue(clockEdge()); + } + + action(pt_popTriggerQueue, "pt", desc="Pop Trigger Queue") { + triggerQueue_in.dequeue(clockEdge()); + } + + action(pp_popProbeQueue, "pp", desc="pop probe queue") { + probeNetwork_in.dequeue(clockEdge()); + } + + action(il0_loadDone, "il0", desc="Cluster 0 i load done") { + Entry entry := getICacheEntry(address); + Entry l2entry := getCacheEntry(address); // Used for functional accesses + assert(is_valid(entry)); + // L2 supplies data (functional accesses only look in L2, ok because L1 + // writes through to L2) + sequencer.readCallback(address, + l2entry.DataBlk, + true, + testAndClearLocalHit(entry)); + } + + action(il1_loadDone, "il1", desc="Cluster 1 i load done") { + Entry entry := getICacheEntry(address); + Entry l2entry := getCacheEntry(address); // Used for functional accesses + assert(is_valid(entry)); + // L2 supplies data (functional accesses only look in L2, ok because L1 + // writes through to L2) + sequencer1.readCallback(address, + l2entry.DataBlk, + true, + testAndClearLocalHit(entry)); + } + + action(l0_loadDone, "l0", desc="Cluster 0 load done") { + Entry entry := getL1CacheEntry(address, 0); + Entry l2entry := getCacheEntry(address); // Used for functional accesses + assert(is_valid(entry)); + // L2 supplies data (functional accesses only look in L2, ok because L1 + // writes through to L2) + sequencer.readCallback(address, + l2entry.DataBlk, + true, + testAndClearLocalHit(entry)); + } + + action(l1_loadDone, "l1", desc="Cluster 1 load done") { + Entry entry := getL1CacheEntry(address, 1); + Entry l2entry := getCacheEntry(address); // Used for functional accesses + assert(is_valid(entry)); + // L2 supplies data (functional accesses only look in L2, ok because L1 + // writes through to L2) + sequencer1.readCallback(address, + l2entry.DataBlk, + true, + testAndClearLocalHit(entry)); + } + + action(xl0_loadDone, "xl0", desc="Cluster 0 load done") { + peek(responseToCore_in, ResponseMsg) { + assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) || + (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache)); + Entry l2entry := getCacheEntry(address); // Used for functional accesses + DPRINTF(ProtocolTrace, "CP Load Done 0 -- address %s, data: %s\n", address, l2entry.DataBlk); + // L2 supplies data (functional accesses only look in L2, ok because L1 + // writes through to L2) + sequencer.readCallback(address, + l2entry.DataBlk, + false, + machineIDToMachineType(in_msg.Sender), + in_msg.InitialRequestTime, + in_msg.ForwardRequestTime, + in_msg.ProbeRequestStartTime); + } + } + + action(xl1_loadDone, "xl1", desc="Cluster 1 load done") { + peek(responseToCore_in, ResponseMsg) { + assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) || + (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache)); + Entry l2entry := getCacheEntry(address); // Used for functional accesses + // L2 supplies data (functional accesses only look in L2, ok because L1 + // writes 
through to L2) + sequencer1.readCallback(address, + l2entry.DataBlk, + false, + machineIDToMachineType(in_msg.Sender), + in_msg.InitialRequestTime, + in_msg.ForwardRequestTime, + in_msg.ProbeRequestStartTime); + } + } + + action(xi0_loadDone, "xi0", desc="Cluster 0 i-load done") { + peek(responseToCore_in, ResponseMsg) { + assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) || + (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache)); + Entry l2entry := getCacheEntry(address); // Used for functional accesses + // L2 supplies data (functional accesses only look in L2, ok because L1 + // writes through to L2) + sequencer.readCallback(address, + l2entry.DataBlk, + false, + machineIDToMachineType(in_msg.Sender), + in_msg.InitialRequestTime, + in_msg.ForwardRequestTime, + in_msg.ProbeRequestStartTime); + } + } + + action(xi1_loadDone, "xi1", desc="Cluster 1 i-load done") { + peek(responseToCore_in, ResponseMsg) { + assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) || + (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache)); + Entry l2entry := getCacheEntry(address); // Used for functional accesses + // L2 supplies data (functional accesses only look in L2, ok because L1 + // writes through to L2) + sequencer1.readCallback(address, + l2entry.DataBlk, + false, + machineIDToMachineType(in_msg.Sender), + in_msg.InitialRequestTime, + in_msg.ForwardRequestTime, + in_msg.ProbeRequestStartTime); + } + } + + action(s0_storeDone, "s0", desc="Cluster 0 store done") { + Entry entry := getL1CacheEntry(address, 0); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + sequencer.writeCallback(address, + cache_entry.DataBlk, + true, + testAndClearLocalHit(entry)); + cache_entry.Dirty := true; + entry.DataBlk := cache_entry.DataBlk; + entry.Dirty := true; + DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk); + } + + action(s1_storeDone, "s1", desc="Cluster 1 store done") { + Entry entry := getL1CacheEntry(address, 1); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + sequencer1.writeCallback(address, + cache_entry.DataBlk, + true, + testAndClearLocalHit(entry)); + cache_entry.Dirty := true; + entry.Dirty := true; + entry.DataBlk := cache_entry.DataBlk; + DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk); + } + + action(xs0_storeDone, "xs0", desc="Cluster 0 store done") { + peek(responseToCore_in, ResponseMsg) { + Entry entry := getL1CacheEntry(address, 0); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) || + (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache)); + sequencer.writeCallback(address, + cache_entry.DataBlk, + false, + machineIDToMachineType(in_msg.Sender), + in_msg.InitialRequestTime, + in_msg.ForwardRequestTime, + in_msg.ProbeRequestStartTime); + cache_entry.Dirty := true; + entry.Dirty := true; + entry.DataBlk := cache_entry.DataBlk; + DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk); + } + } + + action(xs1_storeDone, "xs1", desc="Cluster 1 store done") { + peek(responseToCore_in, ResponseMsg) { + Entry entry := getL1CacheEntry(address, 1); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) || + (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache)); + sequencer1.writeCallback(address, + cache_entry.DataBlk, + false, + machineIDToMachineType(in_msg.Sender), + in_msg.InitialRequestTime, + in_msg.ForwardRequestTime, + 
in_msg.ProbeRequestStartTime); + cache_entry.Dirty := true; + entry.Dirty := true; + entry.DataBlk := cache_entry.DataBlk; + DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk); + } + } + + action(forward_eviction_to_cpu0, "fec0", desc="sends eviction information to processor0") { + if (send_evictions) { + DPRINTF(RubySlicc, "Sending invalidation for %s to the CPU\n", address); + sequencer.evictionCallback(address); + } + } + + action(forward_eviction_to_cpu1, "fec1", desc="sends eviction information to processor1") { + if (send_evictions) { + DPRINTF(RubySlicc, "Sending invalidation for %s to the CPU\n", address); + sequencer1.evictionCallback(address); + } + } + + action(ci_copyL2ToL1, "ci", desc="copy L2 data to L1") { + Entry entry := getICacheEntry(address); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + entry.Dirty := cache_entry.Dirty; + entry.DataBlk := cache_entry.DataBlk; + entry.FromL2 := true; + } + + action(c0_copyL2ToL1, "c0", desc="copy L2 data to L1") { + Entry entry := getL1CacheEntry(address, 0); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + entry.Dirty := cache_entry.Dirty; + entry.DataBlk := cache_entry.DataBlk; + entry.FromL2 := true; + } + + action(c1_copyL2ToL1, "c1", desc="copy L2 data to L1") { + Entry entry := getL1CacheEntry(address, 1); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + entry.Dirty := cache_entry.Dirty; + entry.DataBlk := cache_entry.DataBlk; + entry.FromL2 := true; + } + + action(fi_L2ToL1, "fi", desc="L2 to L1 inst fill") { + enqueue(triggerQueue_out, TriggerMsg, l2_hit_latency) { + out_msg.addr := address; + out_msg.Type := TriggerType:L2_to_L1; + out_msg.Dest := CacheId:L1I; + } + } + + action(f0_L2ToL1, "f0", desc="L2 to L1 data fill") { + enqueue(triggerQueue_out, TriggerMsg, l2_hit_latency) { + out_msg.addr := address; + out_msg.Type := TriggerType:L2_to_L1; + out_msg.Dest := CacheId:L1D0; + } + } + + action(f1_L2ToL1, "f1", desc="L2 to L1 data fill") { + enqueue(triggerQueue_out, TriggerMsg, l2_hit_latency) { + out_msg.addr := address; + out_msg.Type := TriggerType:L2_to_L1; + out_msg.Dest := CacheId:L1D1; + } + } + + action(wi_writeIcache, "wi", desc="write data to icache (and l2)") { + peek(responseToCore_in, ResponseMsg) { + Entry entry := getICacheEntry(address); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + entry.DataBlk := in_msg.DataBlk; + entry.Dirty := in_msg.Dirty; + cache_entry.DataBlk := in_msg.DataBlk; + cache_entry.Dirty := in_msg.Dirty; + } + } + + action(w0_writeDcache, "w0", desc="write data to dcache 0 (and l2)") { + peek(responseToCore_in, ResponseMsg) { + Entry entry := getL1CacheEntry(address, 0); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + DPRINTF(ProtocolTrace, "CP writeD0: address %s, data: %s\n", address, in_msg.DataBlk); + entry.DataBlk := in_msg.DataBlk; + entry.Dirty := in_msg.Dirty; + cache_entry.DataBlk := in_msg.DataBlk; + cache_entry.Dirty := in_msg.Dirty; + } + } + + action(w1_writeDcache, "w1", desc="write data to dcache 1 (and l2)") { + peek(responseToCore_in, ResponseMsg) { + Entry entry := getL1CacheEntry(address, 1); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + entry.DataBlk := in_msg.DataBlk; + entry.Dirty := in_msg.Dirty; + cache_entry.DataBlk := in_msg.DataBlk; + cache_entry.Dirty := in_msg.Dirty; + } + } + + action(ss_sendStaleNotification, "ss", desc="stale data; nothing to writeback") { + peek(responseToCore_in, ResponseMsg) { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := 
address; + out_msg.Type := CoherenceResponseType:StaleNotif; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Response_Control; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + action(wb_data, "wb", desc="write back data") { + peek(responseToCore_in, ResponseMsg) { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUData; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.DataBlk := tbe.DataBlk; + out_msg.Dirty := tbe.Dirty; + if (tbe.Shared) { + out_msg.NbReqShared := true; + } else { + out_msg.NbReqShared := false; + } + out_msg.State := CoherenceState:Shared; // faux info + out_msg.MessageSize := MessageSizeType:Writeback_Data; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket + out_msg.Dirty := false; + out_msg.Hit := false; + out_msg.Ntsl := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(pim_sendProbeResponseInvMs, "pim", desc="send probe ack inv, no data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket + out_msg.Dirty := false; + out_msg.Ntsl := true; + out_msg.Hit := false; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(ph_sendProbeResponseHit, "ph", desc="send probe ack PrbShrData, no data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket + assert(addressInCore(address) || is_valid(tbe)); + out_msg.Dirty := false; // only true if sending back data i think + out_msg.Hit := true; + out_msg.Ntsl := false; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(pb_sendProbeResponseBackprobe, "pb", desc="send probe ack PrbShrData, no data, check for L1 residence") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket + if (addressInCore(address)) { + out_msg.Hit := true; + } else { + out_msg.Hit := false; + } + out_msg.Dirty := false; // not sending back data, so def. 
not dirty + out_msg.Ntsl := false; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(pd_sendProbeResponseData, "pd", desc="send probe ack, with data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + assert(is_valid(cache_entry)); + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket + out_msg.DataBlk := cache_entry.DataBlk; + assert(cache_entry.Dirty); + out_msg.Dirty := true; + out_msg.Hit := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Data; + } + } + + action(pdm_sendProbeResponseDataMs, "pdm", desc="send probe ack, with data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + assert(is_valid(cache_entry)); + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket + out_msg.DataBlk := cache_entry.DataBlk; + assert(cache_entry.Dirty); + out_msg.Dirty := true; + out_msg.Hit := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Data; + } + } + + action(pdt_sendProbeResponseDataFromTBE, "pdt", desc="send probe ack with data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + assert(is_valid(tbe)); + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.DataBlk := tbe.DataBlk; + assert(tbe.Dirty); + out_msg.Dirty := true; + out_msg.Hit := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Data; + } + } + + action(s_setSharedFlip, "s", desc="hit by shared probe, status may be different") { + assert(is_valid(tbe)); + tbe.Shared := true; + } + + action(uu_sendUnblock, "uu", desc="state changed, unblock") { + enqueue(unblockNetwork_out, UnblockMsg, issue_latency) { + out_msg.addr := address; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Unblock_Control; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(l2m_profileMiss, "l2m", desc="l2m miss profile") { + ++L2cache.demand_misses; + } + + action(l10m_profileMiss, "l10m", desc="l10m miss profile") { + ++L1D0cache.demand_misses; + } + + action(l11m_profileMiss, "l11m", desc="l11m miss profile") { + ++L1D1cache.demand_misses; + } + + action(l1im_profileMiss, "l1lm", desc="l1im miss profile") { + ++L1Icache.demand_misses; + } + + action(yy_recycleProbeQueue, "yy", desc="recycle probe queue") { + probeNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(xx_recycleResponseQueue, "xx", desc="recycle response queue") { + responseToCore_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(zz_recycleMandatoryQueue, "\z", desc="recycle mandatory queue") { + mandatoryQueue_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + // END ACTIONS + + // BEGIN TRANSITIONS + + // transitions from base + transition(I, C0_Load_L1miss, I_E0S) {L1D0TagArrayRead, L2TagArrayRead} { + // track misses, if implemented + // since in I state, L2 miss as well + l2m_profileMiss; + l10m_profileMiss; + a0_allocateL1D; + a2_allocateL2; + i1_invCluster; + 
ii_invIcache; + n_issueRdBlk; + p_popMandatoryQueue; + } + + transition(I, C1_Load_L1miss, I_E1S) {L1D1TagArrayRead, L2TagArrayRead} { + // track misses, if implemented + // since in I state, L2 miss as well + l2m_profileMiss; + l11m_profileMiss; + a1_allocateL1D; + a2_allocateL2; + i0_invCluster; + ii_invIcache; + n_issueRdBlk; + p_popMandatoryQueue; + } + + transition(I, Ifetch0_L1miss, S0) {L1ITagArrayRead,L2TagArrayRead} { + // track misses, if implemented + // L2 miss as well + l2m_profileMiss; + l1im_profileMiss; + ai_allocateL1I; + a2_allocateL2; + ib_invBothClusters; + nS_issueRdBlkS; + p_popMandatoryQueue; + } + + transition(I, Ifetch1_L1miss, S1) {L1ITagArrayRead, L2TagArrayRead} { + // track misses, if implemented + // L2 miss as well + l2m_profileMiss; + l1im_profileMiss; + ai_allocateL1I; + a2_allocateL2; + ib_invBothClusters; + nS_issueRdBlkS; + p_popMandatoryQueue; + } + + transition(I, C0_Store_L1miss, I_M0) {L1D0TagArrayRead, L2TagArrayRead} { + l2m_profileMiss; + l10m_profileMiss; + a0_allocateL1D; + a2_allocateL2; + i1_invCluster; + ii_invIcache; + nM_issueRdBlkM; + p_popMandatoryQueue; + } + + transition(I, C1_Store_L1miss, I_M1) {L1D0TagArrayRead, L2TagArrayRead} { + l2m_profileMiss; + l11m_profileMiss; + a1_allocateL1D; + a2_allocateL2; + i0_invCluster; + ii_invIcache; + nM_issueRdBlkM; + p_popMandatoryQueue; + } + + transition(S, C0_Load_L1miss, S_F0) {L1D0TagArrayRead, L2TagArrayRead, L2DataArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(S, C1_Load_L1miss, S_F1) {L1D1TagArrayRead,L2TagArrayRead, L2DataArrayRead} { + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(S, Ifetch0_L1miss, Si_F0) {L1ITagArrayRead, L2TagArrayRead, L2DataArrayRead} { + l1im_profileMiss; + ai_allocateL1I; + fi_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(S, Ifetch1_L1miss, Si_F1) {L1ITagArrayRead,L2TagArrayRead, L2DataArrayRead} { + l1im_profileMiss; + ai_allocateL1I; + fi_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition({S}, {C0_Store_L1hit, C0_Store_L1miss}, S_M0) {L1D0TagArrayRead, L2TagArrayRead} { + l2m_profileMiss; + l10m_profileMiss; + a0_allocateL1D; + mruD0_setD0cacheMRU; + i1_invCluster; + ii_invIcache; + nM_issueRdBlkM; + p_popMandatoryQueue; + } + + transition({S}, {C1_Store_L1hit, C1_Store_L1miss}, S_M1) {L1D1TagArrayRead, L2TagArrayRead} { + l2m_profileMiss; + l11m_profileMiss; + a1_allocateL1D; + mruD1_setD1cacheMRU; + i0_invCluster; + ii_invIcache; + nM_issueRdBlkM; + p_popMandatoryQueue; + } + + transition(Es, C0_Load_L1miss, Es_F0) {L1D0TagArrayRead, L2TagArrayRead, L2DataArrayRead} { // can this be folded with S_F? + a0_allocateL1D; + l10m_profileMiss; + f0_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(Es, C1_Load_L1miss, Es_F1) {L1D1TagArrayRead, L2TagArrayRead, L2DataArrayRead} { // can this be folded with S_F? 
+ l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(Es, Ifetch0_L1miss, S0) {L1ITagArrayRead, L1ITagArrayWrite, L2TagArrayRead, L2TagArrayWrite} { + l1im_profileMiss; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + ib_invBothClusters; + nS_issueRdBlkS; + p_popMandatoryQueue; + } + + transition(Es, Ifetch1_L1miss, S1) {L1ITagArrayRead, L2TagArrayRead} { + l1im_profileMiss; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + ib_invBothClusters; + nS_issueRdBlkS; + p_popMandatoryQueue; + } + + // THES SHOULD NOT BE INSTANTANEOUS BUT OH WELL FOR NOW + transition(Es, {C0_Store_L1hit, C0_Store_L1miss}, M0) {L1D0TagArrayRead, L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayRead, L2TagArrayWrite, L2DataArrayWrite} { + a0_allocateL1D; + i1_invCluster; + s0_storeDone; // instantaneous L1/L2 dirty - no writethrough delay + mruD0_setD0cacheMRU; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(Es, {C1_Store_L1hit, C1_Store_L1miss}, M1) {L1D1TagArrayRead, L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayRead, L2TagArrayWrite, L2DataArrayWrite} { + a1_allocateL1D; + i0_invCluster; + s1_storeDone; + mruD1_setD1cacheMRU; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(E0, C0_Load_L1miss, E0_F) {L1D0TagArrayRead,L2TagArrayRead, L2DataArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(E0, C1_Load_L1miss, E0_Es) {L1D1TagArrayRead, L2TagArrayRead, L2DataArrayRead} { + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(E0, Ifetch0_L1miss, S0) {L2TagArrayRead, L1ITagArrayRead} { + l2m_profileMiss; // permissions miss, still issue RdBlkS + l1im_profileMiss; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + i0_invCluster; + nS_issueRdBlkS; + p_popMandatoryQueue; + } + + transition(E0, Ifetch1_L1miss, S1) {L2TagArrayRead, L1ITagArrayRead} { + l2m_profileMiss; // permissions miss, still issue RdBlkS + l1im_profileMiss; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + i0_invCluster; + nS_issueRdBlkS; + p_popMandatoryQueue; + } + + transition(E0, {C0_Store_L1hit, C0_Store_L1miss}, M0) {L1D0TagArrayRead, L1D0DataArrayWrite, L1D0TagArrayWrite, L2TagArrayRead, L2DataArrayWrite, L2TagArrayWrite} { + a0_allocateL1D; + s0_storeDone; + mruD0_setD0cacheMRU; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(E0, C1_Store_L1miss, M1) {L1D1TagArrayRead, L1D1TagArrayWrite, L1D1TagArrayWrite, L2TagArrayRead, L2TagArrayWrite, L2DataArrayWrite} { + l11m_profileMiss; + a1_allocateL1D; + i0_invCluster; + s1_storeDone; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(E1, C1_Load_L1miss, E1_F) {L1D1TagArrayRead, L2TagArrayRead, L2DataArrayRead} { + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(E1, C0_Load_L1miss, E1_Es) {L1D0TagArrayRead, L2TagArrayRead, L2DataArrayRead} { + l11m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(E1, Ifetch1_L1miss, S1) {L2TagArrayRead, L1ITagArrayRead} { + l2m_profileMiss; // permissions miss, still issue RdBlkS + l1im_profileMiss; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + i1_invCluster; + nS_issueRdBlkS; + p_popMandatoryQueue; + } + + transition(E1, Ifetch0_L1miss, S0) {L2TagArrayRead, L1ITagArrayRead} { + l2m_profileMiss; // permissions miss, still issue RdBlkS + l1im_profileMiss; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + i1_invCluster; + nS_issueRdBlkS; + p_popMandatoryQueue; + } 
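// Reader's aid (an illustrative sketch only, not an additional transition in this protocol):
// every transition in this file follows the same SLICC shape; the names CurrentState,
// TriggeringEvent, NextState and some_action below are placeholders, not identifiers
// defined in this machine.
//
//   transition(CurrentState, TriggeringEvent, NextState) {ResourceChecks} {
//     some_action;            // actions are the named blocks defined earlier in the file
//     p_popMandatoryQueue;    // the triggering message is typically consumed last
//   }
//
// The first brace group lists RequestType entries (tag/data array accesses) that are
// validated by checkResourceAvailable() and recorded by recordRequestType() above; the
// second brace group is the ordered list of actions executed for that state/event pair.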
+ + transition(E1, {C1_Store_L1hit, C1_Store_L1miss}, M1) {L1D1TagArrayRead, L2TagArrayRead, L2DataArrayWrite, L1D1TagArrayWrite, L2TagArrayWrite} { + a1_allocateL1D; + s1_storeDone; + mruD1_setD1cacheMRU; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(E1, C0_Store_L1miss, M0) {L1D0TagArrayRead, L2TagArrayRead, L2TagArrayWrite, L1D0TagArrayWrite, L1D0DataArrayWrite, L2DataArrayWrite} { + l10m_profileMiss; + a0_allocateL1D; + i1_invCluster; + s0_storeDone; + mru_setMRU; + p_popMandatoryQueue; + } + + transition({O}, {C0_Store_L1hit, C0_Store_L1miss}, O_M0) {L1D0TagArrayRead,L2TagArrayRead} { + l2m_profileMiss; // permissions miss, still issue CtoD + l10m_profileMiss; + a0_allocateL1D; + mruD0_setD0cacheMRU; + i1_invCluster; + ii_invIcache; + nM_issueRdBlkM; + p_popMandatoryQueue; + } + + transition({O}, {C1_Store_L1hit, C1_Store_L1miss}, O_M1) {L1D1TagArrayRead, L2TagArrayRead} { + l2m_profileMiss; // permissions miss, still issue RdBlkS + l11m_profileMiss; + a1_allocateL1D; + mruD1_setD1cacheMRU; + i0_invCluster; + ii_invIcache; + nM_issueRdBlkM; + p_popMandatoryQueue; + } + + transition(O, C0_Load_L1miss, O_F0) {L2TagArrayRead, L2DataArrayRead, L1D0TagArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(O, C1_Load_L1miss, O_F1) {L2TagArrayRead, L2DataArrayRead, L1D1TagArrayRead} { + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(Ms, C0_Load_L1miss, Ms_F0) {L2TagArrayRead, L2DataArrayRead, L1D0TagArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(Ms, C1_Load_L1miss, Ms_F1) {L2TagArrayRead, L2DataArrayRead, L1D1TagArrayRead} { + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition({Ms, M0, M1, O}, Ifetch0_L1miss, MO_S0) {L1ITagArrayRead, L2DataArrayRead, L2TagArrayRead} { + l2m_profileMiss; // permissions miss + l1im_profileMiss; + ai_allocateL1I; + t_allocateTBE; + ib_invBothClusters; + vd_victim; +// i2_invL2; + p_popMandatoryQueue; + } + + transition({Ms, M0, M1, O}, Ifetch1_L1miss, MO_S1) {L1ITagArrayRead, L2TagArrayRead, L2DataArrayRead } { + l2m_profileMiss; // permissions miss + l1im_profileMiss; + ai_allocateL1I; + t_allocateTBE; + ib_invBothClusters; + vd_victim; +// i2_invL2; + p_popMandatoryQueue; + } + + transition(Ms, {C0_Store_L1hit, C0_Store_L1miss}, M0) {L1D0TagArrayRead, L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayRead, L2DataArrayWrite, L2TagArrayWrite} { + a0_allocateL1D; + i1_invCluster; + s0_storeDone; + mruD0_setD0cacheMRU; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(Ms, {C1_Store_L1hit, C1_Store_L1miss}, M1) {L1D1TagArrayRead, L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayRead, L2DataArrayWrite, L2TagArrayWrite} { + a1_allocateL1D; + i0_invCluster; + s1_storeDone; + mruD1_setD1cacheMRU; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(M0, C0_Load_L1miss, M0_F) {L1D0TagArrayRead, L2TagArrayRead, L2DataArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(M0, C1_Load_L1miss, M0_Ms) {L2TagArrayRead, L2DataArrayRead,L1D0TagArrayRead} { + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(M0, {C0_Store_L1hit, C0_Store_L1miss}) {L1D0TagArrayRead,L1D0DataArrayWrite, L2DataArrayWrite, L2TagArrayRead} { + a0_allocateL1D; + s0_storeDone; + mruD0_setD0cacheMRU; + mru_setMRU; + 
p_popMandatoryQueue; + } + + transition(M0, {C1_Store_L1hit, C1_Store_L1miss}, M1) {L1D1TagArrayRead, L1D1TagArrayWrite, L1D0DataArrayWrite, L2DataArrayWrite, L2TagArrayRead, L2TagArrayWrite} { + a1_allocateL1D; + i0_invCluster; + s1_storeDone; + mruD1_setD1cacheMRU; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(M1, C0_Load_L1miss, M1_Ms) {L2TagArrayRead, L2DataArrayRead, L1D0TagArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(M1, C1_Load_L1miss, M1_F) {L1D1TagArrayRead,L2TagArrayRead, L2DataArrayRead} { + a1_allocateL1D; + f1_L2ToL1; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(M1, {C0_Store_L1hit, C0_Store_L1miss}, M0) {L1D0TagArrayRead, L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayRead, L2DataArrayWrite, L2TagArrayWrite} { + a0_allocateL1D; + i1_invCluster; + s0_storeDone; + mruD0_setD0cacheMRU; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(M1, {C1_Store_L1hit, C1_Store_L1miss}) {L1D1TagArrayRead, L2TagArrayRead, L2DataArrayWrite} { + a1_allocateL1D; + s1_storeDone; + mruD1_setD1cacheMRU; + mru_setMRU; + p_popMandatoryQueue; + } + + // end transitions from base + + // Begin simple hit transitions + transition({S, Es, E0, O, Ms, M0, O_F1, S_F1, Si_F0, Si_F1, Es_F1, E0_Es, + Ms_F1, M0_Ms}, C0_Load_L1hit) {L1D0TagArrayRead, L1D0DataArrayRead} { + // track hits, if implemented + l0_loadDone; + mruD0_setD0cacheMRU; + p_popMandatoryQueue; + } + + transition({S, Es, E1, O, Ms, M1, O_F0, S_F0, Si_F0, Si_F1, Es_F0, E1_Es, + Ms_F0, M1_Ms}, C1_Load_L1hit) {L1D1TagArrayRead, L1D1DataArrayRead} { + // track hits, if implemented + l1_loadDone; + mruD1_setD1cacheMRU; + p_popMandatoryQueue; + } + + transition({S, S_C, S_F0, S_F1, S_F}, Ifetch0_L1hit) {L1ITagArrayRead, L1IDataArrayRead} { + // track hits, if implemented + il0_loadDone; + mruI_setIcacheMRU; + p_popMandatoryQueue; + } + + transition({S, S_C, S_F0, S_F1, S_F}, Ifetch1_L1hit) {L1ITagArrayRead, L1IDataArrayWrite} { + // track hits, if implemented + il1_loadDone; + mruI_setIcacheMRU; + p_popMandatoryQueue; + } + + // end simple hit transitions + + // Transitions from transient states + + // recycles + transition({I_M0, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_ES, IF_E0S, IF_ES, + IF0_ES, IF1_ES, S_F0, S_F, O_F0, O_F, S_M0, O_M0, Es_F0, Es_F, E0_F, + E1_Es, Ms_F0, Ms_F, M0_F, M1_Ms}, C0_Load_L1hit) {} { + zz_recycleMandatoryQueue; + } + + transition({IF_E1S, F_S0, F_S1, ES_I, MO_I, MO_S0, MO_S1, Si_F0, Si_F1, S_M1, + O_M1, S0, S1, I_C, S0_C, S1_C, S_C}, C0_Load_L1miss) {} { + zz_recycleMandatoryQueue; + } + + transition({I_M1, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E1S, I_ES, IF_E1S, IF_ES, + IF0_ES, IF1_ES, S_F1, S_F, O_F1, O_F, S_M1, O_M1, Es_F1, Es_F, E1_F, + E0_Es, Ms_F1, Ms_F, M0_Ms, M1_F}, C1_Load_L1hit) {} { + zz_recycleMandatoryQueue; + } + + transition({IF_E0S, F_S0, F_S1, ES_I, MO_I, MO_S0, MO_S1, Si_F0, Si_F1, S_M0, + O_M0, S0, S1, I_C, S0_C, S1_C, S_C}, C1_Load_L1miss) {} { + zz_recycleMandatoryQueue; + } + + transition({F_S0, F_S1, MO_S0, MO_S1, Si_F0, Si_F1, S0, S1, S0_C, S1_C}, {Ifetch0_L1hit, Ifetch1_L1hit}) {} { + zz_recycleMandatoryQueue; + } + + transition({I_M0, I_M1, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_E1S, I_ES, + IF_E0S, IF_E1S, IF_ES, IF0_ES, IF1_ES, ES_I, MO_I, S_F0, S_F1, S_F, + O_F0, O_F1, O_F, S_M0, S_M1, O_M0, O_M1, Es_F0, Es_F1, Es_F, E0_F, + E1_F, E0_Es, E1_Es, Ms_F0, Ms_F1, Ms_F, M0_F, M0_Ms, M1_F, M1_Ms, I_C, + S_C}, {Ifetch0_L1miss, Ifetch1_L1miss}) {} { + zz_recycleMandatoryQueue; + } + + 
transition({I_E1S, IF_E1S, F_S0, F_S1, ES_I, MO_I, MO_S0, MO_S1, S_F1, O_F1, + Si_F0, Si_F1, S_M1, O_M1, S0, S1, Es_F1, E1_F, E0_Es, Ms_F1, M0_Ms, + M1_F, I_C, S0_C, S1_C, S_C}, {C0_Store_L1miss}) {} { + zz_recycleMandatoryQueue; + } + + transition({I_E0S, IF_E0S, F_S0, F_S1, ES_I, MO_I, MO_S0, MO_S1, S_F0, O_F0, + Si_F0, Si_F1, S_M0, O_M0, S0, S1, Es_F0, E0_F, E1_Es, Ms_F0, M0_F, + M1_Ms, I_C, S0_C, S1_C, S_C}, {C1_Store_L1miss}) {} { + zz_recycleMandatoryQueue; + } + + transition({I_M0, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_ES, IF_E0S, IF_ES, + IF0_ES, IF1_ES, S_F0, S_F1, S_F, O_F0, O_F1, O_F, Si_F0, Si_F1, S_M0, O_M0, Es_F0, Es_F1, Es_F, E0_F, E0_Es, E1_Es, Ms_F0, Ms_F1, Ms_F, M0_F, M0_Ms, M1_Ms}, {C0_Store_L1hit}) {} { + zz_recycleMandatoryQueue; + } + + transition({I_M1, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E1S, I_ES, IF_E1S, IF_ES, + IF0_ES, IF1_ES, S_F0, S_F1, S_F, O_F0, O_F1, O_F, Si_F0, Si_F1, S_M1, + O_M1, Es_F0, Es_F1, Es_F, E1_F, E0_Es, E1_Es, Ms_F0, Ms_F1, Ms_F, + M0_Ms, M1_F, M1_Ms}, {C1_Store_L1hit}) {} { + zz_recycleMandatoryQueue; + } + + transition({I_M0, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_ES, IF_E0S, IF_ES, + IF0_ES, IF1_ES, S_F0, S_F, O_F0, O_F, S_M0, O_M0, Es_F0, Es_F, E0_F, + E1_Es, Ms_F0, Ms_F, M0_F, M1_Ms}, L1D0_Repl) {} { + zz_recycleMandatoryQueue; + } + + transition({I_M1, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E1S, I_ES, IF_E1S, IF_ES, + IF0_ES, IF1_ES, S_F1, S_F, O_F1, O_F, S_M1, O_M1, Es_F1, Es_F, E1_F, + E0_Es, Ms_F1, Ms_F, M0_Ms, M1_F}, L1D1_Repl) {} { + zz_recycleMandatoryQueue; + } + + transition({F_S0, F_S1, MO_S0, MO_S1, Si_F0, Si_F1, S0, S1, S0_C, S1_C}, L1I_Repl) {} { + zz_recycleMandatoryQueue; + } + + transition({S_C, S0_C, S1_C, S0, S1, Si_F0, Si_F1, I_M0, I_M1, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_E1S, I_ES, S_F0, S_F1, S_F, O_F0, O_F1, O_F, S_M0, O_M0, S_M1, O_M1, Es_F0, Es_F1, Es_F, E0_F, E1_F, E0_Es, E1_Es, Ms_F0, Ms_F1, Ms_F, M0_F, M0_Ms, M1_F, M1_Ms, MO_S0, MO_S1, IF_E0S, IF_E1S, IF_ES, IF0_ES, IF1_ES, F_S0, F_S1}, L2_Repl) {} { + zz_recycleMandatoryQueue; + } + + transition({IF_E0S, IF_E1S, IF_ES, IF0_ES, IF1_ES, F_S0, F_S1}, {NB_AckS, + PrbInvData, PrbInv, PrbShrData}) {} { + yy_recycleProbeQueue; // These should be resolved soon. They (and the probes) could technically be handled now, but that would require adding more states, which doesn't seem necessary. + } + + transition({IF_E0S, IF_E1S, IF_ES, IF0_ES, IF1_ES}, NB_AckE) {} { + xx_recycleResponseQueue; // These should be resolved soon. They (and the probes) could technically be handled now, but that would require adding more states, which doesn't seem necessary.
+ } + + transition({E0_Es, E1_F, Es_F1}, C0_Load_L1miss, Es_F) {L2DataArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + p_popMandatoryQueue; + } + + transition(S_F1, C0_Load_L1miss, S_F) {L2DataArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + p_popMandatoryQueue; + } + + transition(O_F1, C0_Load_L1miss, O_F) {L2DataArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + p_popMandatoryQueue; + } + + transition({Ms_F1, M0_Ms, M1_F}, C0_Load_L1miss, Ms_F) {L2DataArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + p_popMandatoryQueue; + } + + transition(I_M0, C1_Load_L1miss, I_M0Ms) {} { + l2m_profileMiss; + l11m_profileMiss; + a1_allocateL1D; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(I_M1, C0_Load_L1miss, I_M1Ms) {} { + l2m_profileMiss; + l10m_profileMiss; + a0_allocateL1D; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(I_M0, C1_Store_L1miss, I_M0M1) {} { + l2m_profileMiss; + l11m_profileMiss; + a1_allocateL1D; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(I_M1, C0_Store_L1miss, I_M1M0) {} { + l2m_profileMiss; + l10m_profileMiss; + a0_allocateL1D; + mru_setMRU; + p_popMandatoryQueue; + } + + transition(I_E0S, C1_Load_L1miss, I_ES) {} { + l2m_profileMiss; + l11m_profileMiss; + a1_allocateL1D; + p_popMandatoryQueue; + } + + transition(I_E1S, C0_Load_L1miss, I_ES) {} { + l2m_profileMiss; + l10m_profileMiss; + a0_allocateL1D; + p_popMandatoryQueue; + } + + transition({E1_Es, E0_F, Es_F0}, C1_Load_L1miss, Es_F) {L2DataArrayRead} { + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + p_popMandatoryQueue; + } + + transition(S_F0, C1_Load_L1miss, S_F) {L2DataArrayRead} { + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + p_popMandatoryQueue; + } + + transition(O_F0, C1_Load_L1miss, O_F) {L2DataArrayRead} { + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + p_popMandatoryQueue; + } + + transition({Ms_F0, M1_Ms, M0_F}, C1_Load_L1miss, Ms_F) { L2DataArrayRead} { + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + p_popMandatoryQueue; + } + + transition({S, Es, E0, O, Ms, M0, O_F1, S_F1, Si_F0, Si_F1, Es_F1, E0_Es, Ms_F1, M0_Ms}, L1D0_Repl) {L1D0TagArrayRead} { + i0_invCluster; + } + + transition({S, Es, E1, O, Ms, M1, O_F0, S_F0, Si_F0, Si_F1, Es_F0, E1_Es, Ms_F0, M1_Ms}, L1D1_Repl) {L1D1TagArrayRead} { + i1_invCluster; + } + + transition({S, S_C, S_F0, S_F1}, L1I_Repl) {L1ITagArrayRead} { + ii_invIcache; + } + + transition({S, E0, E1, Es}, L2_Repl, ES_I) {L2TagArrayRead, L2DataArrayRead, L1D0TagArrayRead, L1D1TagArrayRead} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + t_allocateTBE; + vc_victim; + ib_invBothClusters; + i2_invL2; + ii_invIcache; + } + + transition({Ms, M0, M1, O}, L2_Repl, MO_I) {L2TagArrayRead, L2DataArrayRead, L1D0TagArrayRead, L1D1TagArrayRead} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + t_allocateTBE; + vd_victim; + i2_invL2; + ib_invBothClusters; // nothing will happen for D0 on M1, vice versa + } + + transition(S0, NB_AckS, S) {L1D0DataArrayWrite, L1D0TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} { + wi_writeIcache; + xi0_loadDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(S1, NB_AckS, S) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} { + wi_writeIcache; + xi1_loadDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(S0_C, NB_AckS, S_C) {L1D0DataArrayWrite,L2DataArrayWrite} { + wi_writeIcache; + xi0_loadDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(S1_C, NB_AckS, S_C) 
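+ // the I-fetch fill returns while an earlier (now stale) writeback is still awaiting its ack; install the data and stay in the _C variant until NB_AckWB is seen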
{L1D1DataArrayWrite, L2DataArrayWrite} { + wi_writeIcache; + xi1_loadDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(I_M0, NB_AckM, M0) {L1D0DataArrayWrite, L1D0TagArrayWrite,L2DataArrayWrite, L2TagArrayWrite} { + w0_writeDcache; + xs0_storeDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(I_M1, NB_AckM, M1) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} { + w1_writeDcache; + xs1_storeDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + // THESE MO->M1 should not be instantaneous but oh well for now. + transition(I_M0M1, NB_AckM, M1) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} { + w0_writeDcache; + xs0_storeDone; + uu_sendUnblock; + i0_invCluster; + s1_storeDone; + pr_popResponseQueue; + } + + transition(I_M1M0, NB_AckM, M0) {L1D0DataArrayWrite, L1D0TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} { + w1_writeDcache; + xs1_storeDone; + uu_sendUnblock; + i1_invCluster; + s0_storeDone; + pr_popResponseQueue; + } + + // Above shoudl be more like this, which has some latency to xfer to L1 + transition(I_M0Ms, NB_AckM, M0_Ms) {L1D0DataArrayWrite,L2DataArrayWrite} { + w0_writeDcache; + xs0_storeDone; + uu_sendUnblock; + f1_L2ToL1; + pr_popResponseQueue; + } + + transition(I_M1Ms, NB_AckM, M1_Ms) {L1D1DataArrayWrite, L2DataArrayWrite} { + w1_writeDcache; + xs1_storeDone; + uu_sendUnblock; + f0_L2ToL1; + pr_popResponseQueue; + } + + transition(I_E0S, NB_AckE, E0) {L1D0DataArrayWrite, L1D0TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} { + w0_writeDcache; + xl0_loadDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(I_E1S, NB_AckE, E1) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} { + w1_writeDcache; + xl1_loadDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(I_ES, NB_AckE, Es) {L1D1DataArrayWrite, L1D1TagArrayWrite, L1D0DataArrayWrite, L1D0TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite } { + w0_writeDcache; + xl0_loadDone; + w1_writeDcache; + xl1_loadDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(I_E0S, NB_AckS, S) {L1D0DataArrayWrite, L1D0TagArrayWrite,L2DataArrayWrite, L2TagArrayWrite} { + w0_writeDcache; + xl0_loadDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(I_E1S, NB_AckS, S) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayWrite, L2DataArrayWrite} { + w1_writeDcache; + xl1_loadDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(I_ES, NB_AckS, S) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayWrite, L2DataArrayWrite} { + w0_writeDcache; + xl0_loadDone; + w1_writeDcache; + xl1_loadDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(S_F0, L2_to_L1D0, S) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c0_copyL2ToL1; + mru_setMRU; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(S_F1, L2_to_L1D1, S) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c1_copyL2ToL1; + mru_setMRU; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(Si_F0, L2_to_L1I, S) {L1ITagArrayWrite, L1IDataArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + ci_copyL2ToL1; + mru_setMRU; + il0_loadDone; + pt_popTriggerQueue; + } + + transition(Si_F1, L2_to_L1I, S) {L1ITagArrayWrite, L1IDataArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + ci_copyL2ToL1; + mru_setMRU; + il1_loadDone; + pt_popTriggerQueue; + } + + transition(S_F, L2_to_L1D0, S_F1) { L1D0DataArrayWrite, L2DataArrayRead} { + c0_copyL2ToL1; + mru_setMRU; + l0_loadDone; + 
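+ // cluster 0's copy is now filled; the L1D1 fill is still outstanding, hence S_F -> S_F1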
pt_popTriggerQueue; + } + + transition(S_F, L2_to_L1D1, S_F0) { L1D1DataArrayWrite, L2DataArrayRead} { + c1_copyL2ToL1; + mru_setMRU; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(O_F0, L2_to_L1D0, O) { L1D0DataArrayWrite, L1D0TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c0_copyL2ToL1; + mru_setMRU; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(O_F1, L2_to_L1D1, O) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c1_copyL2ToL1; + mru_setMRU; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(O_F, L2_to_L1D0, O_F1) { L1D0DataArrayWrite, L2DataArrayRead} { + c0_copyL2ToL1; + mru_setMRU; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(O_F, L2_to_L1D1, O_F0) { L1D1DataArrayWrite, L2DataArrayRead} { + c1_copyL2ToL1; + mru_setMRU; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(M1_F, L2_to_L1D1, M1) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c1_copyL2ToL1; + mru_setMRU; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(M0_F, L2_to_L1D0, M0) {L1D0DataArrayWrite, L1D0TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c0_copyL2ToL1; + mru_setMRU; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(Ms_F0, L2_to_L1D0, Ms) {L1D0DataArrayWrite, L1D0TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c0_copyL2ToL1; + mru_setMRU; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(Ms_F1, L2_to_L1D1, Ms) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c1_copyL2ToL1; + mru_setMRU; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(Ms_F, L2_to_L1D0, Ms_F1) {L1D0DataArrayWrite, L2DataArrayRead} { + c0_copyL2ToL1; + mru_setMRU; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(Ms_F, L2_to_L1D1, Ms_F0) {L1IDataArrayWrite, L2DataArrayRead} { + c1_copyL2ToL1; + mru_setMRU; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(M1_Ms, L2_to_L1D0, Ms) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c0_copyL2ToL1; + mru_setMRU; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(M0_Ms, L2_to_L1D1, Ms) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c1_copyL2ToL1; + mru_setMRU; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(Es_F0, L2_to_L1D0, Es) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c0_copyL2ToL1; + mru_setMRU; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(Es_F1, L2_to_L1D1, Es) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c1_copyL2ToL1; + mru_setMRU; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(Es_F, L2_to_L1D0, Es_F1) {L2TagArrayRead, L2DataArrayRead} { + c0_copyL2ToL1; + mru_setMRU; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(Es_F, L2_to_L1D1, Es_F0) {L2TagArrayRead, L2DataArrayRead} { + c1_copyL2ToL1; + mru_setMRU; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(E0_F, L2_to_L1D0, E0) {L2TagArrayRead, L2DataArrayRead} { + c0_copyL2ToL1; + mru_setMRU; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(E1_F, L2_to_L1D1, E1) {L2TagArrayRead, L2DataArrayRead} { + c1_copyL2ToL1; + mru_setMRU; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(E1_Es, L2_to_L1D0, Es) {L2TagArrayRead, L2DataArrayRead} { + c0_copyL2ToL1; + mru_setMRU; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(E0_Es, L2_to_L1D1, Es) {L2TagArrayRead, L2DataArrayRead} { + c1_copyL2ToL1; + mru_setMRU; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(IF_E0S, L2_to_L1D0, I_E0S) 
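+ // the line was invalidated by a probe while the fill was in flight; the late L2-to-L1 trigger is simply consumed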
{} { + pt_popTriggerQueue; + } + + transition(IF_E1S, L2_to_L1D1, I_E1S) {} { + pt_popTriggerQueue; + } + + transition(IF_ES, L2_to_L1D0, IF1_ES) {} { + pt_popTriggerQueue; + } + + transition(IF_ES, L2_to_L1D1, IF0_ES) {} { + pt_popTriggerQueue; + } + + transition(IF0_ES, L2_to_L1D0, I_ES) {} { + pt_popTriggerQueue; + } + + transition(IF1_ES, L2_to_L1D1, I_ES) {} { + pt_popTriggerQueue; + } + + transition(F_S0, L2_to_L1I, S0) {} { + pt_popTriggerQueue; + } + + transition(F_S1, L2_to_L1I, S1) {} { + pt_popTriggerQueue; + } + + transition({S_M0, O_M0}, NB_AckM, M0) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2DataArrayWrite, L2TagArrayWrite} { + mru_setMRU; + xs0_storeDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition({S_M1, O_M1}, NB_AckM, M1) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2DataArrayWrite, L2TagArrayWrite} { + mru_setMRU; + xs1_storeDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(MO_I, NB_AckWB, I) {L2TagArrayWrite} { + wb_data; + d_deallocateTBE; + pr_popResponseQueue; + } + + transition(ES_I, NB_AckWB, I) {L2TagArrayWrite} { + wb_data; + d_deallocateTBE; + pr_popResponseQueue; + } + + transition(MO_S0, NB_AckWB, S0) {L2TagArrayWrite} { + wb_data; + i2_invL2; + a2_allocateL2; + d_deallocateTBE; // FOO + nS_issueRdBlkS; + pr_popResponseQueue; + } + + transition(MO_S1, NB_AckWB, S1) {L2TagArrayWrite} { + wb_data; + i2_invL2; + a2_allocateL2; + d_deallocateTBE; // FOO + nS_issueRdBlkS; + pr_popResponseQueue; + } + + // Writeback cancel "ack" + transition(I_C, NB_AckWB, I) {L2TagArrayWrite} { + ss_sendStaleNotification; + d_deallocateTBE; + pr_popResponseQueue; + } + + transition(S0_C, NB_AckWB, S0) {L2TagArrayWrite} { + ss_sendStaleNotification; + pr_popResponseQueue; + } + + transition(S1_C, NB_AckWB, S1) {L2TagArrayWrite} { + ss_sendStaleNotification; + pr_popResponseQueue; + } + + transition(S_C, NB_AckWB, S) {L2TagArrayWrite} { + ss_sendStaleNotification; + pr_popResponseQueue; + } + + // Begin Probe Transitions + + transition({Ms, M0, M1, O}, PrbInvData, I) {L2TagArrayRead, L2TagArrayWrite, L2DataArrayRead} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pd_sendProbeResponseData; + i2_invL2; + ib_invBothClusters; + pp_popProbeQueue; + } + + transition({Es, E0, E1, S, I}, PrbInvData, I) {L2TagArrayRead, L2TagArrayWrite} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + i2_invL2; + ib_invBothClusters; + ii_invIcache; // only relevant for S + pp_popProbeQueue; + } + + transition(S_C, PrbInvData, I_C) {L2TagArrayWrite} { + t_allocateTBE; + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + i2_invL2; + ib_invBothClusters; + ii_invIcache; + pp_popProbeQueue; + } + + transition(I_C, PrbInvData, I_C) {} { + pi_sendProbeResponseInv; + ib_invBothClusters; + pp_popProbeQueue; + } + + transition({Ms, M0, M1, O, Es, E0, E1, S, I}, PrbInv, I) {L2TagArrayRead, L2TagArrayWrite} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + i2_invL2; // nothing will happen in I + ib_invBothClusters; + ii_invIcache; + pp_popProbeQueue; + } + + transition(S_C, PrbInv, I_C) {L2TagArrayWrite} { + t_allocateTBE; + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + i2_invL2; + ib_invBothClusters; + ii_invIcache; + pp_popProbeQueue; + } + + transition(I_C, PrbInv, I_C) {} { + pi_sendProbeResponseInv; + ib_invBothClusters; + ii_invIcache; + pp_popProbeQueue; + } + + transition({Ms, M0, M1, O}, PrbShrData, O) {L2TagArrayRead, 
L2TagArrayWrite, L2DataArrayRead} { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + transition({Es, E0, E1, S}, PrbShrData, S) {L2TagArrayRead, L2TagArrayWrite} { + ph_sendProbeResponseHit; + pp_popProbeQueue; + } + + transition(S_C, PrbShrData) {} { + ph_sendProbeResponseHit; + pp_popProbeQueue; + } + + transition({I, I_C}, PrbShrData) {L2TagArrayRead} { + pb_sendProbeResponseBackprobe; + pp_popProbeQueue; + } + + transition({I_M0, I_E0S}, {PrbInv, PrbInvData}) {} { + pi_sendProbeResponseInv; + ib_invBothClusters; // must invalidate current data (only relevant for I_M0) + a0_allocateL1D; // but make sure there is room for incoming data when it arrives + pp_popProbeQueue; + } + + transition({I_M1, I_E1S}, {PrbInv, PrbInvData}) {} { + pi_sendProbeResponseInv; + ib_invBothClusters; // must invalidate current data (only relevant for I_M1) + a1_allocateL1D; // but make sure there is room for incoming data when it arrives + pp_popProbeQueue; + } + + transition({I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_ES}, {PrbInv, PrbInvData, PrbShrData}) {} { + pi_sendProbeResponseInv; + ib_invBothClusters; + a0_allocateL1D; + a1_allocateL1D; + pp_popProbeQueue; + } + + transition({I_M0, I_E0S, I_M1, I_E1S}, PrbShrData) {} { + pb_sendProbeResponseBackprobe; + pp_popProbeQueue; + } + + transition(ES_I, PrbInvData, I_C) {} { + pi_sendProbeResponseInv; + ib_invBothClusters; + ii_invIcache; + pp_popProbeQueue; + } + + transition(MO_I, PrbInvData, I_C) {} { + pdt_sendProbeResponseDataFromTBE; + ib_invBothClusters; + ii_invIcache; + pp_popProbeQueue; + } + + transition(MO_I, PrbInv, I_C) {} { + pi_sendProbeResponseInv; + ib_invBothClusters; + ii_invIcache; + pp_popProbeQueue; + } + + transition(ES_I, PrbInv, I_C) {} { + pi_sendProbeResponseInv; + ib_invBothClusters; + ii_invIcache; + pp_popProbeQueue; + } + + transition(ES_I, PrbShrData, ES_I) {} { + ph_sendProbeResponseHit; + s_setSharedFlip; + pp_popProbeQueue; + } + + transition(MO_I, PrbShrData, MO_I) {} { + pdt_sendProbeResponseDataFromTBE; + s_setSharedFlip; + pp_popProbeQueue; + } + + transition(MO_S0, PrbInvData, S0_C) {L2TagArrayWrite} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pdt_sendProbeResponseDataFromTBE; + i2_invL2; + a2_allocateL2; + d_deallocateTBE; + nS_issueRdBlkS; + pp_popProbeQueue; + } + + transition(MO_S1, PrbInvData, S1_C) {L2TagArrayWrite} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pdt_sendProbeResponseDataFromTBE; + i2_invL2; + a2_allocateL2; + d_deallocateTBE; + nS_issueRdBlkS; + pp_popProbeQueue; + } + + transition(MO_S0, PrbInv, S0_C) {L2TagArrayWrite} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + i2_invL2; + a2_allocateL2; + d_deallocateTBE; + nS_issueRdBlkS; + pp_popProbeQueue; + } + + transition(MO_S1, PrbInv, S1_C) {L2TagArrayWrite} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + i2_invL2; + a2_allocateL2; + d_deallocateTBE; + nS_issueRdBlkS; + pp_popProbeQueue; + } + + transition({MO_S0, MO_S1}, PrbShrData) {} { + pdt_sendProbeResponseDataFromTBE; + s_setSharedFlip; + pp_popProbeQueue; + } + + transition({S_F0, Es_F0, E0_F, E1_Es}, {PrbInvData, PrbInv}, IF_E0S) {}{ + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + // invalidate everything you've got + ib_invBothClusters; + ii_invIcache; + i2_invL2; + // but make sure you have room for what you need from the fill + a0_allocateL1D; + a2_allocateL2; + n_issueRdBlk; + pp_popProbeQueue; + } + + transition({S_F1, Es_F1, E1_F, E0_Es}, 
{PrbInvData, PrbInv}, IF_E1S) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + // invalidate everything you've got + ib_invBothClusters; + ii_invIcache; + i2_invL2; + // but make sure you have room for what you need from the fill + a1_allocateL1D; + a2_allocateL2; + n_issueRdBlk; + pp_popProbeQueue; + } + + transition({S_F, Es_F}, {PrbInvData, PrbInv}, IF_ES) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + // invalidate everything you've got + ib_invBothClusters; + ii_invIcache; + i2_invL2; + // but make sure you have room for what you need from the fill + a0_allocateL1D; + a1_allocateL1D; + a2_allocateL2; + n_issueRdBlk; + pp_popProbeQueue; + } + + transition(Si_F0, {PrbInvData, PrbInv}, F_S0) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + nS_issueRdBlkS; + pp_popProbeQueue; + } + + transition(Si_F1, {PrbInvData, PrbInv}, F_S1) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + nS_issueRdBlkS; + pp_popProbeQueue; + } + + transition({Es_F0, E0_F, E1_Es}, PrbShrData, S_F0) {} { + ph_sendProbeResponseHit; + pp_popProbeQueue; + } + + transition({Es_F1, E1_F, E0_Es}, PrbShrData, S_F1) {} { + ph_sendProbeResponseHit; + pp_popProbeQueue; + } + + transition(Es_F, PrbShrData, S_F) {} { + ph_sendProbeResponseHit; + pp_popProbeQueue; + } + + transition({S_F0, S_F1, S_F, Si_F0, Si_F1}, PrbShrData) {} { + ph_sendProbeResponseHit; + pp_popProbeQueue; + } + + transition(S_M0, PrbInvData, I_M0) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pim_sendProbeResponseInvMs; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + a0_allocateL1D; + a2_allocateL2; + pp_popProbeQueue; + } + + transition(O_M0, PrbInvData, I_M0) {L2DataArrayRead} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pdm_sendProbeResponseDataMs; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + a0_allocateL1D; + a2_allocateL2; + pp_popProbeQueue; + } + + transition({S_M0, O_M0}, {PrbInv}, I_M0) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pim_sendProbeResponseInvMs; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + a0_allocateL1D; + a2_allocateL2; + pp_popProbeQueue; + } + + transition(S_M1, PrbInvData, I_M1) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pim_sendProbeResponseInvMs; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + a1_allocateL1D; + a2_allocateL2; + pp_popProbeQueue; + } + + transition(O_M1, PrbInvData, I_M1) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pdm_sendProbeResponseDataMs; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + a1_allocateL1D; + a2_allocateL2; + pp_popProbeQueue; + } + + transition({S_M1, O_M1}, {PrbInv}, I_M1) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pim_sendProbeResponseInvMs; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + a1_allocateL1D; + a2_allocateL2; + pp_popProbeQueue; + } + + transition({S0, S0_C}, {PrbInvData, PrbInv}) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + pp_popProbeQueue; + } + + transition({S1, S1_C}, {PrbInvData, PrbInv}) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + 
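+ // throw everything away, then re-allocate the I-cache and L2 entries so the outstanding RdBlkS fill still has room when it returns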
ib_invBothClusters; + ii_invIcache; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + pp_popProbeQueue; + } + + transition({S_M0, S_M1}, PrbShrData) {} { + ph_sendProbeResponseHit; + pp_popProbeQueue; + } + + transition({O_M0, O_M1}, PrbShrData) {L2DataArrayRead} { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + transition({S0, S1, S0_C, S1_C}, PrbShrData) {} { + pb_sendProbeResponseBackprobe; + pp_popProbeQueue; + } + + transition({Ms_F0, M0_F, M1_Ms, O_F0}, PrbInvData, IF_E0S) { L2DataArrayRead} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pd_sendProbeResponseData; + ib_invBothClusters; + i2_invL2; + a0_allocateL1D; + a2_allocateL2; + n_issueRdBlk; + pp_popProbeQueue; + } + + transition({Ms_F1, M1_F, M0_Ms, O_F1}, PrbInvData, IF_E1S) {L2DataArrayRead} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pd_sendProbeResponseData; + ib_invBothClusters; + i2_invL2; + a1_allocateL1D; + a2_allocateL2; + n_issueRdBlk; + pp_popProbeQueue; + } + + transition({Ms_F, O_F}, PrbInvData, IF_ES) {L2DataArrayRead} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pd_sendProbeResponseData; + ib_invBothClusters; + i2_invL2; + a0_allocateL1D; + a1_allocateL1D; + a2_allocateL2; + n_issueRdBlk; + pp_popProbeQueue; + } + + transition({Ms_F0, M0_F, M1_Ms, O_F0}, PrbInv, IF_E0S) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + ib_invBothClusters; + i2_invL2; + a0_allocateL1D; + a2_allocateL2; + n_issueRdBlk; + pp_popProbeQueue; + } + + transition({Ms_F1, M1_F, M0_Ms, O_F1}, PrbInv, IF_E1S) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + ib_invBothClusters; + i2_invL2; + a1_allocateL1D; + a2_allocateL2; + n_issueRdBlk; + pp_popProbeQueue; + } + + transition({Ms_F, O_F}, PrbInv, IF_ES) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + ib_invBothClusters; + i2_invL2; + a0_allocateL1D; + a1_allocateL1D; + a2_allocateL2; + n_issueRdBlk; + pp_popProbeQueue; + } + + transition({Ms_F0, M0_F, M1_Ms}, PrbShrData, O_F0) {L2DataArrayRead} { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + transition({Ms_F1, M1_F, M0_Ms}, PrbShrData, O_F1) {} { + } + + transition({Ms_F}, PrbShrData, O_F) {L2DataArrayRead} { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + transition({O_F0, O_F1, O_F}, PrbShrData) {L2DataArrayRead} { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + // END TRANSITIONS +} + + diff --git a/src/mem/protocol/MOESI_AMD_Base-L3cache.sm b/src/mem/protocol/MOESI_AMD_Base-L3cache.sm new file mode 100644 index 000000000..479cf4e78 --- /dev/null +++ b/src/mem/protocol/MOESI_AMD_Base-L3cache.sm @@ -0,0 +1,1130 @@ +/* + * Copyright (c) 2010-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Lisa Hsu + */ + +machine(MachineType:L3Cache, "L3") + : CacheMemory * L3cache; + WireBuffer * reqToDir; + WireBuffer * respToDir; + WireBuffer * l3UnblockToDir; + WireBuffer * reqToL3; + WireBuffer * probeToL3; + WireBuffer * respToL3; + Cycles l3_request_latency := 1; + Cycles l3_response_latency := 35; + + // To the general response network + MessageBuffer * responseFromL3, network="To", virtual_network="2", ordered="false", vnet_type="response"; + + // From the general response network + MessageBuffer * responseToL3, network="From", virtual_network="2", ordered="false", vnet_type="response"; + +{ + // EVENTS + enumeration(Event, desc="L3 Events") { + // Requests coming from the Cores + RdBlk, desc="CPU RdBlk event"; + RdBlkM, desc="CPU RdBlkM event"; + RdBlkS, desc="CPU RdBlkS event"; + CtoD, desc="Change to Dirty request"; + WrVicBlk, desc="L2 Victim (dirty)"; + WrVicBlkShared, desc="L2 Victim (dirty)"; + ClVicBlk, desc="L2 Victim (clean)"; + ClVicBlkShared, desc="L2 Victim (clean)"; + + CPUData, desc="WB data from CPU"; + CPUDataShared, desc="WB data from CPU, NBReqShared 1"; + StaleWB, desc="WB stale; no data"; + + L3_Repl, desc="L3 Replacement"; + + // Probes + PrbInvData, desc="Invalidating probe, return dirty data"; + PrbInv, desc="Invalidating probe, no need to return data"; + PrbShrData, desc="Downgrading probe, return data"; + + // Coming from Memory Controller + WBAck, desc="ack from memory"; + + CancelWB, desc="Cancel WB from L2"; + } + + // STATES + // Base States: + state_declaration(State, desc="L3 State", default="L3Cache_State_I") { + M, AccessPermission:Read_Write, desc="Modified"; // No other cache has copy, memory stale + O, AccessPermission:Read_Only, desc="Owned"; // Correct most recent copy, others may exist in S + E, AccessPermission:Read_Write, desc="Exclusive"; // Correct, most recent, and only copy (and == Memory) + S, AccessPermission:Read_Only, desc="Shared"; // Correct, most recent. 
If no one in O, then == Memory + I, AccessPermission:Invalid, desc="Invalid"; + + I_M, AccessPermission:Busy, desc="Invalid, received WrVicBlk, sent Ack, waiting for Data"; + I_O, AccessPermission:Busy, desc="Invalid, received WrVicBlk, sent Ack, waiting for Data"; + I_E, AccessPermission:Busy, desc="Invalid, receive ClVicBlk, sent Ack, waiting for Data"; + I_S, AccessPermission:Busy, desc="Invalid, receive ClVicBlk, sent Ack, waiting for Data"; + S_M, AccessPermission:Busy, desc="received WrVicBlk, sent Ack, waiting for Data, then go to M"; + S_O, AccessPermission:Busy, desc="received WrVicBlkShared, sent Ack, waiting for Data, then go to O"; + S_E, AccessPermission:Busy, desc="Shared, received ClVicBlk, sent Ack, waiting for Data, then go to E"; + S_S, AccessPermission:Busy, desc="Shared, received ClVicBlk, sent Ack, waiting for Data, then go to S"; + E_M, AccessPermission:Busy, desc="received WrVicBlk, sent Ack, waiting for Data, then go to O"; + E_O, AccessPermission:Busy, desc="received WrVicBlkShared, sent Ack, waiting for Data, then go to O"; + E_E, AccessPermission:Busy, desc="received WrVicBlk, sent Ack, waiting for Data, then go to O"; + E_S, AccessPermission:Busy, desc="Shared, received WrVicBlk, sent Ack, waiting for Data"; + O_M, AccessPermission:Busy, desc="..."; + O_O, AccessPermission:Busy, desc="..."; + O_E, AccessPermission:Busy, desc="..."; + O_S, AccessPermission:Busy, desc="..."; + M_M, AccessPermission:Busy, desc="..."; + M_O, AccessPermission:Busy, desc="..."; + M_E, AccessPermission:Busy, desc="..."; + M_S, AccessPermission:Busy, desc="..."; + D_I, AccessPermission:Invalid, desc="drop WB data on the floor when receive"; + MOD_I, AccessPermission:Busy, desc="drop WB data on the floor, waiting for WBAck from Mem"; + MO_I, AccessPermission:Busy, desc="M or O, received L3_Repl, waiting for WBAck from Mem"; + I_I, AccessPermission:Busy, desc="I_MO received L3_Repl"; + I_CD, AccessPermission:Busy, desc="I_I received WBAck, now just waiting for CPUData"; + I_C, AccessPermission:Invalid, desc="sent cancel, just waiting to receive mem wb ack so nothing gets confused"; + } + + enumeration(RequestType, desc="To communicate stats from transitions to recordStats") { + DataArrayRead, desc="Read the data array"; + DataArrayWrite, desc="Write the data array"; + TagArrayRead, desc="Read the data array"; + TagArrayWrite, desc="Write the data array"; + } + + // STRUCTURES + + structure(Entry, desc="...", interface="AbstractCacheEntry") { + State CacheState, desc="cache state"; + bool Dirty, desc="Is the data dirty (diff from memory?)"; + DataBlock DataBlk, desc="Data for the block"; + } + + structure(TBE, desc="...") { + State TBEState, desc="Transient state"; + DataBlock DataBlk, desc="data for the block"; + bool Dirty, desc="Is the data dirty?"; + bool Shared, desc="Victim hit by shared probe"; + MachineID From, desc="Waiting for writeback from..."; + } + + structure(TBETable, external="yes") { + TBE lookup(Addr); + void allocate(Addr); + void deallocate(Addr); + bool isPresent(Addr); + } + + TBETable TBEs, template="<L3Cache_TBE>", constructor="m_number_of_TBEs"; + + void set_cache_entry(AbstractCacheEntry b); + void unset_cache_entry(); + void set_tbe(TBE b); + void unset_tbe(); + void wakeUpAllBuffers(); + void wakeUpBuffers(Addr a); + + + // FUNCTION DEFINITIONS + Tick clockEdge(); + Tick cyclesToTicks(Cycles c); + + Entry getCacheEntry(Addr addr), return_by_pointer="yes" { + return static_cast(Entry, "pointer", L3cache.lookup(addr)); + } + + DataBlock getDataBlock(Addr addr), 
return_by_ref="yes" { + return getCacheEntry(addr).DataBlk; + } + + bool presentOrAvail(Addr addr) { + return L3cache.isTagPresent(addr) || L3cache.cacheAvail(addr); + } + + State getState(TBE tbe, Entry cache_entry, Addr addr) { + if (is_valid(tbe)) { + return tbe.TBEState; + } else if (is_valid(cache_entry)) { + return cache_entry.CacheState; + } + return State:I; + } + + void setState(TBE tbe, Entry cache_entry, Addr addr, State state) { + if (is_valid(tbe)) { + tbe.TBEState := state; + } + + if (is_valid(cache_entry)) { + cache_entry.CacheState := state; + } + } + + void functionalRead(Addr addr, Packet *pkt) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + testAndRead(addr, tbe.DataBlk, pkt); + } else { + functionalMemoryRead(pkt); + } + } + + int functionalWrite(Addr addr, Packet *pkt) { + int num_functional_writes := 0; + + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + num_functional_writes := num_functional_writes + + testAndWrite(addr, tbe.DataBlk, pkt); + } + + num_functional_writes := num_functional_writes + + functionalMemoryWrite(pkt); + return num_functional_writes; + } + + AccessPermission getAccessPermission(Addr addr) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return L3Cache_State_to_permission(tbe.TBEState); + } + + Entry cache_entry := getCacheEntry(addr); + if(is_valid(cache_entry)) { + return L3Cache_State_to_permission(cache_entry.CacheState); + } + + return AccessPermission:NotPresent; + } + + void setAccessPermission(Entry cache_entry, Addr addr, State state) { + if (is_valid(cache_entry)) { + cache_entry.changePermission(L3Cache_State_to_permission(state)); + } + } + + void recordRequestType(RequestType request_type, Addr addr) { + + } + + bool checkResourceAvailable(RequestType request_type, Addr addr) { + return true; + } + + + // OUT PORTS + out_port(requestNetwork_out, CPURequestMsg, reqToDir); + out_port(L3Resp_out, ResponseMsg, respToDir); + out_port(responseNetwork_out, ResponseMsg, responseFromL3); + out_port(unblockNetwork_out, UnblockMsg, l3UnblockToDir); + + // IN PORTS + in_port(NBResponse_in, ResponseMsg, respToL3) { + if (NBResponse_in.isReady(clockEdge())) { + peek(NBResponse_in, ResponseMsg) { + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + if (in_msg.Type == CoherenceResponseType:NBSysWBAck) { + trigger(Event:WBAck, in_msg.addr, cache_entry, tbe); + } else { + DPRINTF(RubySlicc, "%s\n", in_msg); + error("Error on NBResponse Type"); + } + } + } + } + + // Response Network + in_port(responseNetwork_in, ResponseMsg, responseToL3) { + if (responseNetwork_in.isReady(clockEdge())) { + peek(responseNetwork_in, ResponseMsg) { + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + if (in_msg.Type == CoherenceResponseType:CPUData) { + if (in_msg.NbReqShared) { + trigger(Event:CPUDataShared, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:CPUData, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.Type == CoherenceResponseType:StaleNotif) { + trigger(Event:StaleWB, in_msg.addr, cache_entry, tbe); + } else { + DPRINTF(RubySlicc, "%s\n", in_msg); + error("Error on NBResponse Type"); + } + } + } + } + + // probe network + in_port(probeNetwork_in, NBProbeRequestMsg, probeToL3) { + if (probeNetwork_in.isReady(clockEdge())) { + peek(probeNetwork_in, NBProbeRequestMsg) { + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + if (in_msg.Type == ProbeRequestType:PrbInv) { + if (in_msg.ReturnData) { + 
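+ // an invalidating probe that also wants the data back maps to PrbInvData; otherwise it is a plain PrbInv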
trigger(Event:PrbInvData, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.Type == ProbeRequestType:PrbDowngrade) { + if (in_msg.ReturnData) { + trigger(Event:PrbShrData, in_msg.addr, cache_entry, tbe); + } else { + error("Don't think I should get any of these"); + } + } + } + } + } + + // Request Network + in_port(requestNetwork_in, CPURequestMsg, reqToL3) { + if (requestNetwork_in.isReady(clockEdge())) { + peek(requestNetwork_in, CPURequestMsg) { + assert(in_msg.Destination.isElement(machineID)); + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + if (in_msg.Type == CoherenceRequestType:RdBlk) { + trigger(Event:RdBlk, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:RdBlkS) { + trigger(Event:RdBlkS, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:RdBlkM) { + trigger(Event:RdBlkM, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:VicClean) { + if (presentOrAvail(in_msg.addr)) { + if (in_msg.Shared) { + trigger(Event:ClVicBlkShared, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:ClVicBlk, in_msg.addr, cache_entry, tbe); + } + } else { + Addr victim := L3cache.cacheProbe(in_msg.addr); + trigger(Event:L3_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } else if (in_msg.Type == CoherenceRequestType:VicDirty) { + if (presentOrAvail(in_msg.addr)) { + if (in_msg.Shared) { + trigger(Event:WrVicBlkShared, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:WrVicBlk, in_msg.addr, cache_entry, tbe); + } + } else { + Addr victim := L3cache.cacheProbe(in_msg.addr); + trigger(Event:L3_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } else if (in_msg.Type == CoherenceRequestType:WrCancel) { + if (is_valid(tbe) && tbe.From == in_msg.Requestor) { + trigger(Event:CancelWB, in_msg.addr, cache_entry, tbe); + } else { + requestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + } + } + } + } + + // BEGIN ACTIONS + + action(i_invL3, "i", desc="invalidate L3 cache block") { + if (is_valid(cache_entry)) { + L3cache.deallocate(address); + } + unset_cache_entry(); + } + + action(rm_sendResponseM, "rm", desc="send Modified response") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, l3_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysResp; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := cache_entry.Dirty; + out_msg.State := CoherenceState:Modified; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + action(rs_sendResponseS, "rs", desc="send Shared response") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, l3_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysResp; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := cache_entry.Dirty; + out_msg.State := CoherenceState:Shared; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + + action(r_requestToMem, "r", desc="Miss in L3, pass on") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(requestNetwork_out, CPURequestMsg, l3_request_latency) 
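+ // L3 miss: pass the original request on to the directory unchanged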
{ + out_msg.addr := address; + out_msg.Type := in_msg.Type; + out_msg.Requestor := in_msg.Requestor; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.Shared := false; // unneeded for this request + out_msg.MessageSize := in_msg.MessageSize; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + action(t_allocateTBE, "t", desc="allocate TBE Entry") { + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + if (is_valid(cache_entry)) { + tbe.DataBlk := cache_entry.DataBlk; // Data only for WBs + tbe.Dirty := cache_entry.Dirty; + } + tbe.From := machineID; + } + + action(dt_deallocateTBE, "dt", desc="deallocate TBE Entry") { + TBEs.deallocate(address); + unset_tbe(); + } + + action(vd_vicDirty, "vd", desc="Victimize dirty L3 data") { + enqueue(requestNetwork_out, CPURequestMsg, l3_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:VicDirty; + out_msg.Requestor := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Request_Control; + } + } + + action(w_sendResponseWBAck, "w", desc="send WB Ack") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, l3_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysWBAck; + out_msg.Destination.add(in_msg.Requestor); + out_msg.Sender := machineID; + out_msg.MessageSize := MessageSizeType:Writeback_Control; + } + } + } + + action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") { + enqueue(L3Resp_out, ResponseMsg, l3_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket + out_msg.Dirty := false; + out_msg.Hit := false; + out_msg.Ntsl := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(ph_sendProbeResponseHit, "ph", desc="send probe ack, no data") { + enqueue(L3Resp_out, ResponseMsg, l3_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket + out_msg.Dirty := false; + out_msg.Hit := true; + out_msg.Ntsl := false; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(pm_sendProbeResponseMiss, "pm", desc="send probe ack, no data") { + enqueue(L3Resp_out, ResponseMsg, l3_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? 
probably not for multisocket + out_msg.Dirty := false; + out_msg.Hit := false; + out_msg.Ntsl := false; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(pd_sendProbeResponseData, "pd", desc="send probe ack, with data") { + enqueue(L3Resp_out, ResponseMsg, l3_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket + out_msg.DataBlk := cache_entry.DataBlk; + assert(cache_entry.Dirty); + out_msg.Dirty := true; + out_msg.Hit := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Data; + } + } + + action(pdt_sendProbeResponseDataFromTBE, "pdt", desc="send probe ack with data") { + enqueue(L3Resp_out, ResponseMsg, l3_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.DataBlk := tbe.DataBlk; + assert(tbe.Dirty); + out_msg.Dirty := true; + out_msg.Hit := true; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.State := CoherenceState:NA; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(mc_cancelMemWriteback, "mc", desc="send writeback cancel to memory") { + enqueue(requestNetwork_out, CPURequestMsg, l3_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:WrCancel; + out_msg.Requestor := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Request_Control; + } + } + + action(a_allocateBlock, "a", desc="allocate L3 block") { + if (is_invalid(cache_entry)) { + set_cache_entry(L3cache.allocate(address, new Entry)); + } + } + + action(d_writeData, "d", desc="write data to L3") { + peek(responseNetwork_in, ResponseMsg) { + if (in_msg.Dirty) { + cache_entry.Dirty := in_msg.Dirty; + } + cache_entry.DataBlk := in_msg.DataBlk; + DPRINTF(RubySlicc, "Writing to L3: %s\n", in_msg); + } + } + + action(rd_copyDataFromRequest, "rd", desc="write data to L3") { + peek(requestNetwork_in, CPURequestMsg) { + cache_entry.DataBlk := in_msg.DataBlk; + cache_entry.Dirty := true; + } + } + + action(f_setFrom, "f", desc="set who WB is expected to come from") { + peek(requestNetwork_in, CPURequestMsg) { + tbe.From := in_msg.Requestor; + } + } + + action(rf_resetFrom, "rf", desc="reset From") { + tbe.From := machineID; + } + + action(wb_data, "wb", desc="write back data") { + enqueue(L3Resp_out, ResponseMsg, l3_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUData; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.DataBlk := tbe.DataBlk; + out_msg.Dirty := tbe.Dirty; + if (tbe.Shared) { + out_msg.NbReqShared := true; + } else { + out_msg.NbReqShared := false; + } + out_msg.State := CoherenceState:Shared; // faux info + out_msg.MessageSize := MessageSizeType:Writeback_Data; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(wt_writeDataToTBE, "wt", desc="write WB data to TBE") { + peek(responseNetwork_in, ResponseMsg) { + tbe.DataBlk := in_msg.DataBlk; + tbe.Dirty := in_msg.Dirty; + } + } + + action(uu_sendUnblock, "uu", desc="state changed, unblock") { + enqueue(unblockNetwork_out, UnblockMsg, l3_request_latency) { + 
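+ // notify the directory that this L3 transition is complete so it can unblock the address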
out_msg.addr := address; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Unblock_Control; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(ut_updateTag, "ut", desc="update Tag (i.e. set MRU)") { + L3cache.setMRU(address); + } + + action(p_popRequestQueue, "p", desc="pop request queue") { + requestNetwork_in.dequeue(clockEdge()); + } + + action(pr_popResponseQueue, "pr", desc="pop response queue") { + responseNetwork_in.dequeue(clockEdge()); + } + + action(pn_popNBResponseQueue, "pn", desc="pop NB response queue") { + NBResponse_in.dequeue(clockEdge()); + } + + action(pp_popProbeQueue, "pp", desc="pop probe queue") { + probeNetwork_in.dequeue(clockEdge()); + } + + action(zz_recycleRequestQueue, "\z", desc="recycle request queue") { + requestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + + // END ACTIONS + + // BEGIN TRANSITIONS + + // transitions from base + + transition({I, I_C}, {RdBlk, RdBlkS, RdBlkM, CtoD}) {TagArrayRead} { + r_requestToMem; + p_popRequestQueue; + } + + transition(O, RdBlk ) {TagArrayRead, DataArrayRead} { + rs_sendResponseS; + ut_updateTag; + p_popRequestQueue; + } + transition(M, RdBlk, O) {TagArrayRead, DataArrayRead, TagArrayWrite} { + rs_sendResponseS; + ut_updateTag; + p_popRequestQueue; + } + + transition(S, RdBlk) {TagArrayRead, DataArrayRead} { + rs_sendResponseS; + ut_updateTag; + p_popRequestQueue; + } + transition(E, RdBlk, S) {TagArrayRead, DataArrayRead, TagArrayWrite} { + rs_sendResponseS; + ut_updateTag; + p_popRequestQueue; + } + + transition({M, O}, RdBlkS, O) {TagArrayRead, DataArrayRead, TagArrayWrite} { + rs_sendResponseS; + ut_updateTag; + p_popRequestQueue; + } + + transition({E, S}, RdBlkS, S) {TagArrayRead, DataArrayRead, TagArrayWrite} { + rs_sendResponseS; + ut_updateTag; + p_popRequestQueue; + } + + transition(M, RdBlkM, I) {TagArrayRead, TagArrayWrite, DataArrayRead} { + rm_sendResponseM; + i_invL3; + p_popRequestQueue; + } + + transition({O, S}, {RdBlkM, CtoD}) {TagArrayRead} { + r_requestToMem; // can't handle this, just forward + p_popRequestQueue; + } + + transition(E, RdBlkM, I) {TagArrayRead, TagArrayWrite, DataArrayRead} { + rm_sendResponseM; + i_invL3; + p_popRequestQueue; + } + + transition({I}, WrVicBlk, I_M) {TagArrayRead, TagArrayWrite} { + a_allocateBlock; + t_allocateTBE; + f_setFrom; +// rd_copyDataFromRequest; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(I_C, {WrVicBlk, WrVicBlkShared, ClVicBlk, ClVicBlkShared}) {} { + zz_recycleRequestQueue; + } + + transition({I}, WrVicBlkShared, I_O) {TagArrayRead, TagArrayWrite} { + a_allocateBlock; + t_allocateTBE; + f_setFrom; +// rd_copyDataFromRequest; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(S, WrVicBlkShared, S_O) {TagArrayRead, TagArrayWrite} { +// rd_copyDataFromRequest; + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(S, WrVicBlk, S_M) {TagArrayRead, TagArrayWrite} { // should be technically not possible, but assume the data comes back with shared bit flipped +// rd_copyDataFromRequest; + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(E, WrVicBlk, E_M) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(E, WrVicBlkShared, E_O) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(O, WrVicBlk, O_M) {TagArrayRead, 
TagArrayWrite} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(O, WrVicBlkShared, O_O) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(M, WrVicBlk, M_M) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(M, WrVicBlkShared, M_O) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition({I}, ClVicBlk, I_E) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + f_setFrom; + a_allocateBlock; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition({I}, ClVicBlkShared, I_S) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + f_setFrom; + a_allocateBlock; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(S, ClVicBlk, S_E) {TagArrayRead, TagArrayWrite} { // technically impossible, assume data comes back with shared bit flipped + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(S, ClVicBlkShared, S_S) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(E, ClVicBlk, E_E) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(E, ClVicBlkShared, E_S) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(O, ClVicBlk, O_E) {TagArrayRead, TagArrayWrite} { // technically impossible, but assume data comes back with shared bit flipped + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(O, ClVicBlkShared, O_S) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(M, ClVicBlk, M_E) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(M, ClVicBlkShared, M_S) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition({MO_I}, {RdBlk, RdBlkS, RdBlkM, CtoD}) {} { + r_requestToMem; + p_popRequestQueue; + } + + transition(MO_I, {WrVicBlkShared, WrVicBlk, ClVicBlk, ClVicBlkShared}, MOD_I) {TagArrayWrite} { + f_setFrom; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(I_M, CPUData, M) {DataArrayWrite, TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + d_writeData; + pr_popResponseQueue; + } + + transition(I_M, CPUDataShared, O) {DataArrayWrite, TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + d_writeData; + pr_popResponseQueue; + } + + transition(I_O, {CPUData, CPUDataShared}, O) {DataArrayWrite, TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + d_writeData; + pr_popResponseQueue; + } + + transition(I_E, CPUData, E) {DataArrayWrite, TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + d_writeData; + pr_popResponseQueue; + } + + transition(I_E, CPUDataShared, S) {DataArrayWrite, TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + d_writeData; + pr_popResponseQueue; + } + + transition(I_S, {CPUData, CPUDataShared}, S) {DataArrayWrite, TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + d_writeData; + pr_popResponseQueue; + } + + transition(S_M, CPUDataShared, O) {DataArrayWrite, TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + d_writeData; + ut_updateTag; // update tag on writeback hits. 
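+ // the returning victim data was marked shared, so the line settles in O instead of M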
+ pr_popResponseQueue; + } + + transition(S_O, {CPUData, CPUDataShared}, O) {DataArrayWrite, TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + d_writeData; + ut_updateTag; // update tag on writeback hits. + pr_popResponseQueue; + } + + transition(S_E, CPUDataShared, S) {DataArrayWrite, TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + d_writeData; + ut_updateTag; // update tag on writeback hits. + pr_popResponseQueue; + } + + transition(S_S, {CPUData, CPUDataShared}, S) {DataArrayWrite, TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + d_writeData; + ut_updateTag; // update tag on writeback hits. + pr_popResponseQueue; + } + + transition(O_E, CPUDataShared, O) {DataArrayWrite, TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + d_writeData; + ut_updateTag; // update tag on writeback hits. + pr_popResponseQueue; + } + + transition(O_S, {CPUData, CPUDataShared}, O) {DataArrayWrite, TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + d_writeData; + ut_updateTag; // update tag on writeback hits. + pr_popResponseQueue; + } + + transition({D_I}, {CPUData, CPUDataShared}, I) {TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + pr_popResponseQueue; + } + + transition(MOD_I, {CPUData, CPUDataShared}, MO_I) {TagArrayWrite} { + uu_sendUnblock; + rf_resetFrom; + pr_popResponseQueue; + } + + transition(I_I, {CPUData, CPUDataShared}, MO_I) {TagArrayWrite, DataArrayRead} { + uu_sendUnblock; + wt_writeDataToTBE; + rf_resetFrom; + pr_popResponseQueue; + } + + transition(I_CD, {CPUData, CPUDataShared}, I) {DataArrayRead, TagArrayWrite} { + uu_sendUnblock; + wt_writeDataToTBE; + wb_data; + dt_deallocateTBE; + pr_popResponseQueue; + } + + transition({M, O}, L3_Repl, MO_I) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + vd_vicDirty; + i_invL3; + } + + transition({E, S,}, L3_Repl, I) {TagArrayRead, TagArrayWrite} { + i_invL3; + } + + transition({I_M, I_O, S_M, S_O, E_M, E_O}, L3_Repl) {} { + zz_recycleRequestQueue; + } + + transition({O_M, O_O, O_E, O_S, M_M, M_O, M_E, M_S}, L3_Repl) {} { + zz_recycleRequestQueue; + } + + transition({I_E, I_S, S_E, S_S, E_E, E_S}, L3_Repl) {} { + zz_recycleRequestQueue; + } + + transition({M, O}, PrbInvData, I) {TagArrayRead, TagArrayWrite, DataArrayRead} { + pd_sendProbeResponseData; + i_invL3; + pp_popProbeQueue; + } + + transition({E, S, I}, PrbInvData, I) {TagArrayRead, TagArrayWrite} { + pi_sendProbeResponseInv; + i_invL3; // nothing will happen in I + pp_popProbeQueue; + } + + transition({M, O, E, S, I}, PrbInv, I) {TagArrayRead, TagArrayWrite} { + pi_sendProbeResponseInv; + i_invL3; // nothing will happen in I + pp_popProbeQueue; + } + + transition({M, O}, PrbShrData, O) {TagArrayRead, DataArrayRead, TagArrayWrite} { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + transition({E, S}, PrbShrData, S) {TagArrayRead, TagArrayWrite} { + ph_sendProbeResponseHit; + pp_popProbeQueue; + } + + transition(I, PrbShrData) {TagArrayRead} { + pm_sendProbeResponseMiss; + pp_popProbeQueue; + } + + transition(MO_I, PrbInvData, I_C) {TagArrayWrite, DataArrayRead} { + pdt_sendProbeResponseDataFromTBE; + mc_cancelMemWriteback; + pp_popProbeQueue; + } + + transition(MO_I, PrbInv, I_C) {TagArrayWrite} { + pi_sendProbeResponseInv; + mc_cancelMemWriteback; + pp_popProbeQueue; + } + + transition(MO_I, PrbShrData) {DataArrayRead} { + pdt_sendProbeResponseDataFromTBE; + pp_popProbeQueue; + } + + transition(I_C, {PrbInvData, PrbInv}) {} { + pi_sendProbeResponseInv; + pp_popProbeQueue; + } + + transition(I_C, PrbShrData) {} { + pm_sendProbeResponseMiss; + 
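+ // the data was already handed over by an earlier probe and the writeback cancel is still outstanding, so this probe simply misses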
pp_popProbeQueue; + } + + transition(I_I, {WBAck}, I_CD) {TagArrayWrite} { + pn_popNBResponseQueue; + } + + transition(MOD_I, WBAck, D_I) {DataArrayRead} { + wb_data; + pn_popNBResponseQueue; + } + + transition(MO_I, WBAck, I) {DataArrayRead, TagArrayWrite} { + wb_data; + dt_deallocateTBE; + pn_popNBResponseQueue; + } + + transition(I_C, {WBAck}, I) {TagArrayWrite} { + dt_deallocateTBE; + pn_popNBResponseQueue; + } + + transition({I_M, I_O, I_E, I_S}, CancelWB, I) {TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + i_invL3; + p_popRequestQueue; + } + + transition({S_S, S_O, S_M, S_E}, CancelWB, S) {TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + p_popRequestQueue; + } + + transition({E_M, E_O, E_E, E_S}, CancelWB, E) {TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + p_popRequestQueue; + } + + transition({O_M, O_O, O_E, O_S}, CancelWB, O) {TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + p_popRequestQueue; + } + + transition({M_M, M_O, M_E, M_S}, CancelWB, M) {TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + p_popRequestQueue; + } + + transition(D_I, CancelWB, I) {TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + p_popRequestQueue; + } + + transition(MOD_I, CancelWB, MO_I) {TagArrayWrite} { + uu_sendUnblock; + rf_resetFrom; + p_popRequestQueue; + } + + transition(I_I, CancelWB, I_C) {TagArrayWrite} { + uu_sendUnblock; + rf_resetFrom; + mc_cancelMemWriteback; + p_popRequestQueue; + } + + transition(I_CD, CancelWB, I) {TagArrayWrite} { + uu_sendUnblock; + dt_deallocateTBE; + mc_cancelMemWriteback; + p_popRequestQueue; + } + +} diff --git a/src/mem/protocol/MOESI_AMD_Base-Region-CorePair.sm b/src/mem/protocol/MOESI_AMD_Base-Region-CorePair.sm new file mode 100644 index 000000000..fd84447a2 --- /dev/null +++ b/src/mem/protocol/MOESI_AMD_Base-Region-CorePair.sm @@ -0,0 +1,3009 @@ +/* + * Copyright (c) 2010-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Lisa Hsu + */ + +machine(MachineType:CorePair, "CP-like Core Coherence") + : Sequencer * sequencer; + Sequencer * sequencer1; + CacheMemory * L1Icache; + CacheMemory * L1D0cache; + CacheMemory * L1D1cache; + CacheMemory * L2cache; + int regionBufferNum; + bool send_evictions := "False"; + Cycles issue_latency := 5; + Cycles l2_hit_latency := 18; + + // BEGIN Core Buffers + + // To the Network + MessageBuffer * requestFromCore, network="To", virtual_network="0", ordered="true", vnet_type="request"; + MessageBuffer * responseFromCore, network="To", virtual_network="2", ordered="false", vnet_type="response"; + MessageBuffer * unblockFromCore, network="To", virtual_network="4", ordered="false", vnet_type="unblock"; + + // From the Network + MessageBuffer * probeToCore, network="From", virtual_network="0", ordered="false", vnet_type="request"; + MessageBuffer * responseToCore, network="From", virtual_network="2", ordered="false", vnet_type="response"; + + MessageBuffer * mandatoryQueue, ordered="false"; + MessageBuffer * triggerQueue, ordered="true"; + + // END Core Buffers + +{ + // BEGIN STATES + state_declaration(State, desc="Cache states", default="CorePair_State_I") { + + I, AccessPermission:Invalid, desc="Invalid"; + S, AccessPermission:Read_Only, desc="Shared"; + E0, AccessPermission:Read_Write, desc="Exclusive with Cluster 0 ownership"; + E1, AccessPermission:Read_Write, desc="Exclusive with Cluster 1 ownership"; + Es, AccessPermission:Read_Write, desc="Exclusive in core"; + O, AccessPermission:Read_Only, desc="Owner state in core, both clusters and other cores may be sharing line"; + Ms, AccessPermission:Read_Write, desc="Modified in core, both clusters may be sharing line"; + M0, AccessPermission:Read_Write, desc="Modified with cluster ownership"; + M1, AccessPermission:Read_Write, desc="Modified with cluster ownership"; + + // Transient States + I_M0, AccessPermission:Busy, desc="Invalid, issued RdBlkM, have not seen response yet"; + I_M1, AccessPermission:Busy, desc="Invalid, issued RdBlkM, have not seen response yet"; + I_M0M1, AccessPermission:Busy, desc="Was in I_M0, got a store request from other cluster as well"; + I_M1M0, AccessPermission:Busy, desc="Was in I_M1, got a store request from other cluster as well"; + I_M0Ms, AccessPermission:Busy, desc="Was in I_M0, got a load request from other cluster as well"; + I_M1Ms, AccessPermission:Busy, desc="Was in I_M1, got a load request from other cluster as well"; + I_E0S, AccessPermission:Busy, desc="Invalid, issued RdBlk, have not seen response yet"; + I_E1S, AccessPermission:Busy, desc="Invalid, issued RdBlk, have not seen response yet"; + I_ES, AccessPermission:Busy, desc="S_F got hit by invalidating probe, RdBlk response needs to go to both clusters"; + + IF_E0S, AccessPermission:Busy, desc="something got hit with Probe Invalidate, now just I_E0S but expecting a L2_to_L1D0 trigger, just drop when receive"; + IF_E1S, AccessPermission:Busy, desc="something got hit with Probe Invalidate, now just I_E1S but expecting a L2_to_L1D1 trigger, just drop when receive"; + IF_ES, AccessPermission:Busy, desc="same, but waiting for two fills"; + IF0_ES, AccessPermission:Busy, desc="same, but waiting for two fills, got one"; + IF1_ES, AccessPermission:Busy, desc="same, but waiting for two fills, got one"; + F_S0, AccessPermission:Busy, desc="same, but going to S0 when trigger received"; + F_S1, AccessPermission:Busy, desc="same, but going to S1 when trigger received"; + + ES_I, AccessPermission:Read_Only, desc="L2 replacement, 
waiting for clean writeback ack"; + MO_I, AccessPermission:Read_Only, desc="L2 replacement, waiting for dirty writeback ack"; + MO_S0, AccessPermission:Read_Only, desc="M/O got Ifetch Miss, must write back first, then send RdBlkS"; + MO_S1, AccessPermission:Read_Only, desc="M/O got Ifetch Miss, must write back first, then send RdBlkS"; + S_F0, AccessPermission:Read_Only, desc="Shared, filling L1"; + S_F1, AccessPermission:Read_Only, desc="Shared, filling L1"; + S_F, AccessPermission:Read_Only, desc="Shared, filling L1"; + O_F0, AccessPermission:Read_Only, desc="Owned, filling L1"; + O_F1, AccessPermission:Read_Only, desc="Owned, filling L1"; + O_F, AccessPermission:Read_Only, desc="Owned, filling L1"; + Si_F0, AccessPermission:Read_Only, desc="Shared, filling icache"; + Si_F1, AccessPermission:Read_Only, desc="Shared, filling icache"; + S_M0, AccessPermission:Read_Only, desc="Shared, issued CtoD, have not seen response yet"; + S_M1, AccessPermission:Read_Only, desc="Shared, issued CtoD, have not seen response yet"; + O_M0, AccessPermission:Read_Only, desc="Shared, issued CtoD, have not seen response yet"; + O_M1, AccessPermission:Read_Only, desc="Shared, issued CtoD, have not seen response yet"; + S0, AccessPermission:Busy, desc="RdBlkS on behalf of cluster 0, waiting for response"; + S1, AccessPermission:Busy, desc="RdBlkS on behalf of cluster 1, waiting for response"; + + Es_F0, AccessPermission:Read_Write, desc="Es, Cluster read, filling"; + Es_F1, AccessPermission:Read_Write, desc="Es, Cluster read, filling"; + Es_F, AccessPermission:Read_Write, desc="Es, other cluster read, filling"; + E0_F, AccessPermission:Read_Write, desc="E0, cluster read, filling"; + E1_F, AccessPermission:Read_Write, desc="..."; + E0_Es, AccessPermission:Read_Write, desc="..."; + E1_Es, AccessPermission:Read_Write, desc="..."; + Ms_F0, AccessPermission:Read_Write, desc="..."; + Ms_F1, AccessPermission:Read_Write, desc="..."; + Ms_F, AccessPermission:Read_Write, desc="..."; + M0_F, AccessPermission:Read_Write, desc="..."; + M0_Ms, AccessPermission:Read_Write, desc="..."; + M1_F, AccessPermission:Read_Write, desc="..."; + M1_Ms, AccessPermission:Read_Write, desc="..."; + + I_C, AccessPermission:Invalid, desc="Invalid, but waiting for WBAck from NB from canceled writeback"; + S0_C, AccessPermission:Busy, desc="MO_S0 hit by invalidating probe, waiting for WBAck form NB for canceled WB"; + S1_C, AccessPermission:Busy, desc="MO_S1 hit by invalidating probe, waiting for WBAck form NB for canceled WB"; + S_C, AccessPermission:Busy, desc="S*_C got NB_AckS, still waiting for WBAck"; + + } // END STATES + + // BEGIN EVENTS + enumeration(Event, desc="CP Events") { + // CP Initiated events + C0_Load_L1miss, desc="Cluster 0 load, L1 missed"; + C0_Load_L1hit, desc="Cluster 0 load, L1 hit"; + C1_Load_L1miss, desc="Cluster 1 load L1 missed"; + C1_Load_L1hit, desc="Cluster 1 load L1 hit"; + Ifetch0_L1hit, desc="Instruction fetch, hit in the L1"; + Ifetch1_L1hit, desc="Instruction fetch, hit in the L1"; + Ifetch0_L1miss, desc="Instruction fetch, missed in the L1"; + Ifetch1_L1miss, desc="Instruction fetch, missed in the L1"; + C0_Store_L1miss, desc="Cluster 0 store missed in L1"; + C0_Store_L1hit, desc="Cluster 0 store hit in L1"; + C1_Store_L1miss, desc="Cluster 1 store missed in L1"; + C1_Store_L1hit, desc="Cluster 1 store hit in L1"; + // NB Initiated events + NB_AckS, desc="NB Ack to Core Request"; + NB_AckM, desc="NB Ack to Core Request"; + NB_AckE, desc="NB Ack to Core Request"; + + NB_AckWB, desc="NB Ack for writeback"; + + 
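+    // The NB_Ack* events above are the grants decoded from NBSysResp in
+    // the response in_port below: NB_AckS, NB_AckE and NB_AckM deliver the
+    // block with Shared, Exclusive and Modified permission respectively.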
// Memory System initiatied events + L1I_Repl, desc="Replace address from L1I"; // Presumed clean + L1D0_Repl, desc="Replace address from L1D0"; // Presumed clean + L1D1_Repl, desc="Replace address from L1D1"; // Presumed clean + L2_Repl, desc="Replace address from L2"; + + L2_to_L1D0, desc="L1 fill from L2"; + L2_to_L1D1, desc="L1 fill from L2"; + L2_to_L1I, desc="L1 fill from L2"; + + // Probe Events + PrbInvData, desc="probe, return O or M data"; + PrbInvDataDemand, desc="probe, return O or M data. Demand request"; + PrbInv, desc="probe, no need for data"; + PrbShrData, desc="probe downgrade, return O or M data"; + PrbShrDataDemand, desc="probe downgrade, return O or M data. Demand request"; + ForceRepl, desc="probe from r-buf. Act as though a repl"; + ForceDowngrade, desc="probe from r-buf. Act as though a repl"; + + } // END EVENTS + + enumeration(RequestType, desc="To communicate stats from transitions to recordStats") { + L1D0DataArrayRead, desc="Read the data array"; + L1D0DataArrayWrite, desc="Write the data array"; + L1D0TagArrayRead, desc="Read the data array"; + L1D0TagArrayWrite, desc="Write the data array"; + L1D1DataArrayRead, desc="Read the data array"; + L1D1DataArrayWrite, desc="Write the data array"; + L1D1TagArrayRead, desc="Read the data array"; + L1D1TagArrayWrite, desc="Write the data array"; + L1IDataArrayRead, desc="Read the data array"; + L1IDataArrayWrite, desc="Write the data array"; + L1ITagArrayRead, desc="Read the data array"; + L1ITagArrayWrite, desc="Write the data array"; + L2DataArrayRead, desc="Read the data array"; + L2DataArrayWrite, desc="Write the data array"; + L2TagArrayRead, desc="Read the data array"; + L2TagArrayWrite, desc="Write the data array"; + } + + + // BEGIN STRUCTURE DEFINITIONS + + + // Cache Entry + structure(Entry, desc="...", interface="AbstractCacheEntry") { + State CacheState, desc="cache state"; + bool Dirty, desc="Is the data dirty (diff than memory)?"; + DataBlock DataBlk, desc="data for the block"; + bool FromL2, default="false", desc="block just moved from L2"; + } + + structure(TBE, desc="...") { + State TBEState, desc="Transient state"; + DataBlock DataBlk, desc="data for the block, required for concurrent writebacks"; + bool Dirty, desc="Is the data dirty (different than memory)?"; + int NumPendingMsgs, desc="Number of acks/data messages that this processor is waiting for"; + bool Shared, desc="Victim hit by shared probe"; + bool AckNeeded, desc="True if need to ack r-dir"; + } + + structure(TBETable, external="yes") { + TBE lookup(Addr); + void allocate(Addr); + void deallocate(Addr); + bool isPresent(Addr); + } + + TBETable TBEs, template="<CorePair_TBE>", constructor="m_number_of_TBEs"; + + Tick clockEdge(); + Tick cyclesToTicks(Cycles c); + + void set_cache_entry(AbstractCacheEntry b); + void unset_cache_entry(); + void set_tbe(TBE b); + void unset_tbe(); + void wakeUpAllBuffers(); + void wakeUpBuffers(Addr a); + Cycles curCycle(); + + // END STRUCTURE DEFINITIONS + + // BEGIN INTERNAL FUNCTIONS + + MachineID getPeer(MachineID mach) { + return createMachineID(MachineType:RegionBuffer, intToID(regionBufferNum)); + } + + bool addressInCore(Addr addr) { + return (L2cache.isTagPresent(addr) || L1Icache.isTagPresent(addr) || L1D0cache.isTagPresent(addr) || L1D1cache.isTagPresent(addr)); + } + + Entry getCacheEntry(Addr address), return_by_pointer="yes" { + Entry L2cache_entry := static_cast(Entry, "pointer", L2cache.lookup(address)); + return L2cache_entry; + } + + DataBlock getDataBlock(Addr addr), return_by_ref="yes" { + 
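+    // Prefer the TBE copy: while a writeback is in flight the TBE holds
+    // the only up-to-date data; otherwise the L2 entry is authoritative,
+    // since the L1s write through to the L2.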
TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return tbe.DataBlk; + } else { + return getCacheEntry(addr).DataBlk; + } + } + + Entry getL1CacheEntry(Addr addr, int cluster), return_by_pointer="yes" { + if (cluster == 0) { + Entry L1D0_entry := static_cast(Entry, "pointer", L1D0cache.lookup(addr)); + return L1D0_entry; + } else { + Entry L1D1_entry := static_cast(Entry, "pointer", L1D1cache.lookup(addr)); + return L1D1_entry; + } + } + + Entry getICacheEntry(Addr addr), return_by_pointer="yes" { + Entry c_entry := static_cast(Entry, "pointer", L1Icache.lookup(addr)); + return c_entry; + } + + bool presentOrAvail2(Addr addr) { + return L2cache.isTagPresent(addr) || L2cache.cacheAvail(addr); + } + + bool presentOrAvailI(Addr addr) { + return L1Icache.isTagPresent(addr) || L1Icache.cacheAvail(addr); + } + + bool presentOrAvailD0(Addr addr) { + return L1D0cache.isTagPresent(addr) || L1D0cache.cacheAvail(addr); + } + + bool presentOrAvailD1(Addr addr) { + return L1D1cache.isTagPresent(addr) || L1D1cache.cacheAvail(addr); + } + + State getState(TBE tbe, Entry cache_entry, Addr addr) { + if(is_valid(tbe)) { + return tbe.TBEState; + } else if (is_valid(cache_entry)) { + return cache_entry.CacheState; + } + return State:I; + } + + void setState(TBE tbe, Entry cache_entry, Addr addr, State state) { + if (is_valid(tbe)) { + tbe.TBEState := state; + } + + if (is_valid(cache_entry)) { + cache_entry.CacheState := state; + } + } + + AccessPermission getAccessPermission(Addr addr) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + return CorePair_State_to_permission(tbe.TBEState); + } + + Entry cache_entry := getCacheEntry(addr); + if(is_valid(cache_entry)) { + return CorePair_State_to_permission(cache_entry.CacheState); + } + + return AccessPermission:NotPresent; + } + + void functionalRead(Addr addr, Packet *pkt) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + testAndRead(addr, tbe.DataBlk, pkt); + } else { + functionalMemoryRead(pkt); + } + } + + int functionalWrite(Addr addr, Packet *pkt) { + int num_functional_writes := 0; + + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + num_functional_writes := num_functional_writes + + testAndWrite(addr, tbe.DataBlk, pkt); + } + + num_functional_writes := num_functional_writes + functionalMemoryWrite(pkt); + return num_functional_writes; + } + + bool isValid(Addr addr) { + AccessPermission perm := getAccessPermission(addr); + if (perm == AccessPermission:NotPresent || + perm == AccessPermission:Invalid || + perm == AccessPermission:Busy) { + return false; + } else { + return true; + } + } + + void setAccessPermission(Entry cache_entry, Addr addr, State state) { + if (is_valid(cache_entry)) { + cache_entry.changePermission(CorePair_State_to_permission(state)); + } + } + + MachineType testAndClearLocalHit(Entry cache_entry) { + assert(is_valid(cache_entry)); + if (cache_entry.FromL2) { + cache_entry.FromL2 := false; + return MachineType:L2Cache; + } else { + return MachineType:L1Cache; + } + } + + void recordRequestType(RequestType request_type, Addr addr) { + if (request_type == RequestType:L1D0DataArrayRead) { + L1D0cache.recordRequestType(CacheRequestType:DataArrayRead, addr); + } else if (request_type == RequestType:L1D0DataArrayWrite) { + L1D0cache.recordRequestType(CacheRequestType:DataArrayWrite, addr); + } else if (request_type == RequestType:L1D0TagArrayRead) { + L1D0cache.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:L1D0TagArrayWrite) { + 
L1D0cache.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } else if (request_type == RequestType:L1D1DataArrayRead) { + L1D1cache.recordRequestType(CacheRequestType:DataArrayRead, addr); + } else if (request_type == RequestType:L1D1DataArrayWrite) { + L1D1cache.recordRequestType(CacheRequestType:DataArrayWrite, addr); + } else if (request_type == RequestType:L1D1TagArrayRead) { + L1D1cache.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:L1D1TagArrayWrite) { + L1D1cache.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } else if (request_type == RequestType:L1IDataArrayRead) { + L1Icache.recordRequestType(CacheRequestType:DataArrayRead, addr); + } else if (request_type == RequestType:L1IDataArrayWrite) { + L1Icache.recordRequestType(CacheRequestType:DataArrayWrite, addr); + } else if (request_type == RequestType:L1ITagArrayRead) { + L1Icache.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:L1ITagArrayWrite) { + L1Icache.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } else if (request_type == RequestType:L2DataArrayRead) { + L2cache.recordRequestType(CacheRequestType:DataArrayRead, addr); + } else if (request_type == RequestType:L2DataArrayWrite) { + L2cache.recordRequestType(CacheRequestType:DataArrayWrite, addr); + } else if (request_type == RequestType:L2TagArrayRead) { + L2cache.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:L2TagArrayWrite) { + L2cache.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } + } + + bool checkResourceAvailable(RequestType request_type, Addr addr) { + if (request_type == RequestType:L2DataArrayRead) { + return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L2DataArrayWrite) { + return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L2TagArrayRead) { + return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:L2TagArrayWrite) { + return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:L1D0DataArrayRead) { + return L1D0cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L1D0DataArrayWrite) { + return L1D0cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L1D0TagArrayRead) { + return L1D0cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:L1D0TagArrayWrite) { + return L1D0cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:L1D1DataArrayRead) { + return L1D1cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L1D1DataArrayWrite) { + return L1D1cache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L1D1TagArrayRead) { + return L1D1cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:L1D1TagArrayWrite) { + return L1D1cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:L1IDataArrayRead) { + return L1Icache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L1IDataArrayWrite) { + return 
L1Icache.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L1ITagArrayRead) { + return L1Icache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:L1ITagArrayWrite) { + return L1Icache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else { + return true; + } + } + + // END INTERNAL FUNCTIONS + + // ** OUT_PORTS ** + + out_port(requestNetwork_out, CPURequestMsg, requestFromCore); + out_port(responseNetwork_out, ResponseMsg, responseFromCore); + out_port(triggerQueue_out, TriggerMsg, triggerQueue); + out_port(unblockNetwork_out, UnblockMsg, unblockFromCore); + + // ** IN_PORTS ** + + in_port(triggerQueue_in, TriggerMsg, triggerQueue, block_on="addr") { + if (triggerQueue_in.isReady(clockEdge())) { + peek(triggerQueue_in, TriggerMsg) { + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + + if (in_msg.Type == TriggerType:L2_to_L1) { + if (in_msg.Dest == CacheId:L1I) { + trigger(Event:L2_to_L1I, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Dest == CacheId:L1D0) { + trigger(Event:L2_to_L1D0, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Dest == CacheId:L1D1) { + trigger(Event:L2_to_L1D1, in_msg.addr, cache_entry, tbe); + } else { + error("unexpected trigger dest"); + } + } + } + } + } + + + in_port(probeNetwork_in, NBProbeRequestMsg, probeToCore) { + if (probeNetwork_in.isReady(clockEdge())) { + peek(probeNetwork_in, NBProbeRequestMsg, block_on="addr") { + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + + if (in_msg.Type == ProbeRequestType:PrbInv) { + if (in_msg.DemandRequest) { + trigger(Event:PrbInvDataDemand, in_msg.addr, cache_entry, tbe); + } else if (in_msg.ReturnData) { + trigger(Event:PrbInvData, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.Type == ProbeRequestType:PrbDowngrade) { + if (in_msg.DemandRequest) { + trigger(Event:PrbShrDataDemand, in_msg.addr, cache_entry, tbe); + } else { + assert(in_msg.ReturnData); + trigger(Event:PrbShrData, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.Type == ProbeRequestType:PrbRepl) { + trigger(Event:ForceRepl, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == ProbeRequestType:PrbRegDowngrade) { + trigger(Event:ForceDowngrade, in_msg.addr, cache_entry, tbe); + } else { + error("Unknown probe request"); + } + } + } + } + + + // ResponseNetwork + in_port(responseToCore_in, ResponseMsg, responseToCore) { + if (responseToCore_in.isReady(clockEdge())) { + peek(responseToCore_in, ResponseMsg, block_on="addr") { + + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := TBEs.lookup(in_msg.addr); + + if (in_msg.Type == CoherenceResponseType:NBSysResp) { + if (in_msg.State == CoherenceState:Modified) { + trigger(Event:NB_AckM, in_msg.addr, cache_entry, tbe); + } else if (in_msg.State == CoherenceState:Shared) { + trigger(Event:NB_AckS, in_msg.addr, cache_entry, tbe); + } else if (in_msg.State == CoherenceState:Exclusive) { + trigger(Event:NB_AckE, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.Type == CoherenceResponseType:NBSysWBAck) { + trigger(Event:NB_AckWB, in_msg.addr, cache_entry, tbe); + } else { + error("Unexpected Response Message to Core"); + } + } + } + } + + // Nothing from the Unblock Network + + // Mandatory Queue + in_port(mandatoryQueue_in, RubyRequest, mandatoryQueue, desc="...") { + if (mandatoryQueue_in.isReady(clockEdge())) 
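+  // Demand requests from both CPU clusters arrive on this queue; the
+  // parity of contextId selects the cluster, and each request is mapped
+  // to an ifetch/load/store hit or miss event, or to an L1/L2 replacement
+  // event when no way is available for the fill.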
{ + peek(mandatoryQueue_in, RubyRequest, block_on="LineAddress") { + + Entry cache_entry := getCacheEntry(in_msg.LineAddress); + TBE tbe := TBEs.lookup(in_msg.LineAddress); + + if (in_msg.Type == RubyRequestType:IFETCH) { + // FETCH ACCESS + + if (L1Icache.isTagPresent(in_msg.LineAddress)) { + if (mod(in_msg.contextId, 2) == 0) { + trigger(Event:Ifetch0_L1hit, in_msg.LineAddress, cache_entry, tbe); + } else { + trigger(Event:Ifetch1_L1hit, in_msg.LineAddress, cache_entry, tbe); + } + } else { + if (presentOrAvail2(in_msg.LineAddress)) { + if (presentOrAvailI(in_msg.LineAddress)) { + if (mod(in_msg.contextId, 2) == 0) { + trigger(Event:Ifetch0_L1miss, in_msg.LineAddress, cache_entry, + tbe); + } else { + trigger(Event:Ifetch1_L1miss, in_msg.LineAddress, cache_entry, + tbe); + } + } else { + Addr victim := L1Icache.cacheProbe(in_msg.LineAddress); + trigger(Event:L1I_Repl, victim, + getCacheEntry(victim), TBEs.lookup(victim)); + } + } else { // Not present or avail in L2 + Addr victim := L2cache.cacheProbe(in_msg.LineAddress); + DPRINTF(RubySlicc, "Victim for %s L2_Repl(0) is %s\n", in_msg.LineAddress, victim); + trigger(Event:L2_Repl, victim, getCacheEntry(victim), + TBEs.lookup(victim)); + } + } + } else { + // DATA ACCESS + if (mod(in_msg.contextId, 2) == 1) { + if (L1D1cache.isTagPresent(in_msg.LineAddress)) { + if (in_msg.Type == RubyRequestType:LD) { + trigger(Event:C1_Load_L1hit, in_msg.LineAddress, cache_entry, + tbe); + } else { + // Stores must write through, make sure L2 avail. + if (presentOrAvail2(in_msg.LineAddress)) { + trigger(Event:C1_Store_L1hit, in_msg.LineAddress, cache_entry, + tbe); + } else { + Addr victim := L2cache.cacheProbe(in_msg.LineAddress); + DPRINTF(RubySlicc, "Victim for %s L2_Repl(1) is %s\n", in_msg.LineAddress, victim); + trigger(Event:L2_Repl, victim, getCacheEntry(victim), + TBEs.lookup(victim)); + } + } + } else { + if (presentOrAvail2(in_msg.LineAddress)) { + if (presentOrAvailD1(in_msg.LineAddress)) { + if (in_msg.Type == RubyRequestType:LD) { + trigger(Event:C1_Load_L1miss, in_msg.LineAddress, + cache_entry, tbe); + } else { + trigger(Event:C1_Store_L1miss, in_msg.LineAddress, + cache_entry, tbe); + } + } else { + Addr victim := L1D1cache.cacheProbe(in_msg.LineAddress); + DPRINTF(RubySlicc, "Victim for %s L1D1_Repl is %s\n", in_msg.LineAddress, victim); + trigger(Event:L1D1_Repl, victim, + getCacheEntry(victim), TBEs.lookup(victim)); + } + } else { // not present or avail in L2 + Addr victim := L2cache.cacheProbe(in_msg.LineAddress); + DPRINTF(RubySlicc, "Victim for %s L2_Repl(2) is %s\n", in_msg.LineAddress, victim); + trigger(Event:L2_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); + } + } + } else { + Entry L1D0cache_entry := getL1CacheEntry(in_msg.LineAddress, 0); + if (is_valid(L1D0cache_entry)) { + if (in_msg.Type == RubyRequestType:LD) { + trigger(Event:C0_Load_L1hit, in_msg.LineAddress, cache_entry, + tbe); + } else { + if (presentOrAvail2(in_msg.LineAddress)) { + trigger(Event:C0_Store_L1hit, in_msg.LineAddress, cache_entry, + tbe); + } else { + Addr victim := L2cache.cacheProbe(in_msg.LineAddress); + DPRINTF(RubySlicc, "Victim for %s L2_Repl(3) is %s\n", in_msg.LineAddress, victim); + trigger(Event:L2_Repl, victim, getCacheEntry(victim), + TBEs.lookup(victim)); + } + } + } else { + if (presentOrAvail2(in_msg.LineAddress)) { + if (presentOrAvailD0(in_msg.LineAddress)) { + if (in_msg.Type == RubyRequestType:LD) { + trigger(Event:C0_Load_L1miss, in_msg.LineAddress, + cache_entry, tbe); + } else { + trigger(Event:C0_Store_L1miss, 
in_msg.LineAddress, + cache_entry, tbe); + } + } else { + Addr victim := L1D0cache.cacheProbe(in_msg.LineAddress); + DPRINTF(RubySlicc, "Victim for %s L1D0_Repl is %s\n", in_msg.LineAddress, victim); + trigger(Event:L1D0_Repl, victim, getCacheEntry(victim), + TBEs.lookup(victim)); + } + } else { + Addr victim := L2cache.cacheProbe(in_msg.LineAddress); + DPRINTF(RubySlicc, "Victim for %s L2_Repl(4) is %s\n", in_msg.LineAddress, victim); + trigger(Event:L2_Repl, victim, getCacheEntry(victim), + TBEs.lookup(victim)); + } + } + } + } + } + } + } + + + // ACTIONS + action(ii_invIcache, "ii", desc="invalidate iCache") { + if (L1Icache.isTagPresent(address)) { + L1Icache.deallocate(address); + } + } + + action(i0_invCluster, "i0", desc="invalidate cluster 0") { + if (L1D0cache.isTagPresent(address)) { + L1D0cache.deallocate(address); + } + } + + action(i1_invCluster, "i1", desc="invalidate cluster 1") { + if (L1D1cache.isTagPresent(address)) { + L1D1cache.deallocate(address); + } + } + + action(ib_invBothClusters, "ib", desc="invalidate both clusters") { + if (L1D0cache.isTagPresent(address)) { + L1D0cache.deallocate(address); + } + if (L1D1cache.isTagPresent(address)) { + L1D1cache.deallocate(address); + } + } + + action(i2_invL2, "i2", desc="invalidate L2") { + if(is_valid(cache_entry)) { + L2cache.deallocate(address); + } + unset_cache_entry(); + } + + action(n_issueRdBlk, "n", desc="Issue RdBlk") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:RdBlk; + out_msg.Requestor := machineID; + out_msg.Destination.add(getPeer(machineID)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := curCycle(); + } + } + + action(nM_issueRdBlkM, "nM", desc="Issue RdBlkM") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:RdBlkM; + out_msg.Requestor := machineID; + out_msg.Destination.add(getPeer(machineID)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := curCycle(); + } + } + + action(nMs_issueRdBlkMSinked, "nMs", desc="Issue RdBlkM with CtoDSinked") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:RdBlkM; + out_msg.Requestor := machineID; + out_msg.Destination.add(getPeer(machineID)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.CtoDSinked := true; + } + } + + action(nS_issueRdBlkS, "nS", desc="Issue RdBlkS") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:RdBlkS; + out_msg.Requestor := machineID; + out_msg.Destination.add(getPeer(machineID)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := curCycle(); + } + } + + action(nSs_issueRdBlkSSinked, "nSs", desc="Issue RdBlkS with CtoDSinked") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:RdBlkS; + out_msg.Requestor := machineID; + out_msg.Destination.add(getPeer(machineID)); + out_msg.CtoDSinked := true; + out_msg.MessageSize := MessageSizeType:Request_Control; + } + } + + action(vd_victim, "vd", desc="Victimize M/O L2 Data") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + assert(is_valid(cache_entry)); + out_msg.DataBlk := cache_entry.DataBlk; 
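+      // VicDirty carries the dirty L2 data back through the region-buffer
+      // peer (getPeer); Shared is set below when the victim was in O,
+      // since other sharers may still hold the line.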
+ assert(cache_entry.Dirty); + out_msg.Destination.add(getPeer(machineID)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.Type := CoherenceRequestType:VicDirty; + out_msg.InitialRequestTime := curCycle(); + if (cache_entry.CacheState == State:O) { + out_msg.Shared := true; + } else { + out_msg.Shared := false; + } + } + } + + action(vc_victim, "vc", desc="Victimize E/S L2 Data") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + out_msg.Destination.add(getPeer(machineID)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.Type := CoherenceRequestType:VicClean; + out_msg.InitialRequestTime := curCycle(); + if (cache_entry.CacheState == State:S) { + out_msg.Shared := true; + } else { + out_msg.Shared := false; + } + } + } + + // Could send these two directly to dir if we made a new out network on channel 0 + action(vdf_victimForce, "vdf", desc="Victimize M/O L2 Data") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + assert(is_valid(cache_entry)); + out_msg.DataBlk := cache_entry.DataBlk; + assert(cache_entry.Dirty); + out_msg.Destination.add(getPeer(machineID)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.Type := CoherenceRequestType:VicDirty; + out_msg.InitialRequestTime := curCycle(); + if (cache_entry.CacheState == State:O) { + out_msg.Shared := true; + } else { + out_msg.Shared := false; + } + out_msg.Private := true; + } + } + + action(vcf_victimForce, "vcf", desc="Victimize E/S L2 Data") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + out_msg.Destination.add(getPeer(machineID)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.Type := CoherenceRequestType:VicClean; + out_msg.InitialRequestTime := curCycle(); + if (cache_entry.CacheState == State:S) { + out_msg.Shared := true; + } else { + out_msg.Shared := false; + } + out_msg.Private := true; + } + } + + action(a0_allocateL1D, "a0", desc="Allocate L1D0 Block") { + if (L1D0cache.isTagPresent(address) == false) { + L1D0cache.allocateVoid(address, new Entry); + } + } + + action(a1_allocateL1D, "a1", desc="Allocate L1D1 Block") { + if (L1D1cache.isTagPresent(address) == false) { + L1D1cache.allocateVoid(address, new Entry); + } + } + + action(ai_allocateL1I, "ai", desc="Allocate L1I Block") { + if (L1Icache.isTagPresent(address) == false) { + L1Icache.allocateVoid(address, new Entry); + } + } + + action(a2_allocateL2, "a2", desc="Allocate L2 Block") { + if (is_invalid(cache_entry)) { + set_cache_entry(L2cache.allocate(address, new Entry)); + } + } + + action(t_allocateTBE, "t", desc="allocate TBE Entry") { + check_allocate(TBEs); + assert(is_valid(cache_entry)); + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + tbe.DataBlk := cache_entry.DataBlk; // Data only used for WBs + tbe.Dirty := cache_entry.Dirty; + tbe.Shared := false; + } + + action(d_deallocateTBE, "d", desc="Deallocate TBE") { + TBEs.deallocate(address); + unset_tbe(); + } + + action(p_popMandatoryQueue, "pm", desc="Pop Mandatory Queue") { + mandatoryQueue_in.dequeue(clockEdge()); + } + + action(pr_popResponseQueue, "pr", desc="Pop Response Queue") { + responseToCore_in.dequeue(clockEdge()); + } + + action(pt_popTriggerQueue, "pt", desc="Pop Trigger Queue") { + triggerQueue_in.dequeue(clockEdge()); + } + + action(pp_popProbeQueue, "pp", desc="pop probe 
queue") { + probeNetwork_in.dequeue(clockEdge()); + } + + action(il0_loadDone, "il0", desc="Cluster 0 i load done") { + Entry entry := getICacheEntry(address); + Entry l2entry := getCacheEntry(address); // Used for functional accesses + assert(is_valid(entry)); + // L2 supplies data (functional accesses only look in L2, ok because L1 + // writes through to L2) + sequencer.readCallback(address, + l2entry.DataBlk, + true, + testAndClearLocalHit(entry)); + } + + action(il1_loadDone, "il1", desc="Cluster 1 i load done") { + Entry entry := getICacheEntry(address); + Entry l2entry := getCacheEntry(address); // Used for functional accesses + assert(is_valid(entry)); + // L2 supplies data (functional accesses only look in L2, ok because L1 + // writes through to L2) + sequencer1.readCallback(address, + l2entry.DataBlk, + true, + testAndClearLocalHit(entry)); + } + + action(l0_loadDone, "l0", desc="Cluster 0 load done") { + Entry entry := getL1CacheEntry(address, 0); + Entry l2entry := getCacheEntry(address); // Used for functional accesses + assert(is_valid(entry)); + // L2 supplies data (functional accesses only look in L2, ok because L1 + // writes through to L2) + sequencer.readCallback(address, + l2entry.DataBlk, + true, + testAndClearLocalHit(entry)); + } + + action(l1_loadDone, "l1", desc="Cluster 1 load done") { + Entry entry := getL1CacheEntry(address, 1); + Entry l2entry := getCacheEntry(address); // Used for functional accesses + assert(is_valid(entry)); + // L2 supplies data (functional accesses only look in L2, ok because L1 + // writes through to L2) + sequencer1.readCallback(address, + l2entry.DataBlk, + true, + testAndClearLocalHit(entry)); + } + + action(xl0_loadDone, "xl0", desc="Cluster 0 load done") { + peek(responseToCore_in, ResponseMsg) { + assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) || + (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache)); + Entry l2entry := getCacheEntry(address); // Used for functional accesses + DPRINTF(ProtocolTrace, "CP Load Done 0 -- address %s, data: %s\n", + address, l2entry.DataBlk); + // L2 supplies data (functional accesses only look in L2, ok because L1 + // writes through to L2) + assert(is_valid(l2entry)); + sequencer.readCallback(address, + l2entry.DataBlk, + false, + machineIDToMachineType(in_msg.Sender), + in_msg.InitialRequestTime, + in_msg.ForwardRequestTime, + in_msg.ProbeRequestStartTime); + } + } + + action(xl1_loadDone, "xl1", desc="Cluster 1 load done") { + peek(responseToCore_in, ResponseMsg) { + assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) || + (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache)); + Entry l2entry := getCacheEntry(address); // Used for functional accesses + // L2 supplies data (functional accesses only look in L2, ok because L1 + // writes through to L2) + assert(is_valid(l2entry)); + sequencer1.readCallback(address, + l2entry.DataBlk, + false, + machineIDToMachineType(in_msg.Sender), + in_msg.InitialRequestTime, + in_msg.ForwardRequestTime, + in_msg.ProbeRequestStartTime); + } + } + + action(xi0_loadDone, "xi0", desc="Cluster 0 i-load done") { + peek(responseToCore_in, ResponseMsg) { + assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) || + (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache)); + Entry l2entry := getCacheEntry(address); // Used for functional accesses + // L2 supplies data (functional accesses only look in L2, ok because L1 + // writes through to L2) + assert(is_valid(l2entry)); + 
sequencer.readCallback(address, + l2entry.DataBlk, + false, + machineIDToMachineType(in_msg.Sender), + in_msg.InitialRequestTime, + in_msg.ForwardRequestTime, + in_msg.ProbeRequestStartTime); + } + } + + action(xi1_loadDone, "xi1", desc="Cluster 1 i-load done") { + peek(responseToCore_in, ResponseMsg) { + assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) || + (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache)); + Entry l2entry := getCacheEntry(address); // Used for functional accesses + // L2 supplies data (functional accesses only look in L2, ok because L1 + // writes through to L2) + assert(is_valid(l2entry)); + sequencer1.readCallback(address, + l2entry.DataBlk, + false, + machineIDToMachineType(in_msg.Sender), + in_msg.InitialRequestTime, + in_msg.ForwardRequestTime, + in_msg.ProbeRequestStartTime); + } + } + + action(s0_storeDone, "s0", desc="Cluster 0 store done") { + Entry entry := getL1CacheEntry(address, 0); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + sequencer.writeCallback(address, + cache_entry.DataBlk, + true, + testAndClearLocalHit(entry)); + cache_entry.Dirty := true; + entry.DataBlk := cache_entry.DataBlk; + entry.Dirty := true; + DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk); + } + + action(s1_storeDone, "s1", desc="Cluster 1 store done") { + Entry entry := getL1CacheEntry(address, 1); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + sequencer1.writeCallback(address, + cache_entry.DataBlk, + true, + testAndClearLocalHit(entry)); + cache_entry.Dirty := true; + entry.Dirty := true; + entry.DataBlk := cache_entry.DataBlk; + DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk); + } + + action(xs0_storeDone, "xs0", desc="Cluster 0 store done") { + peek(responseToCore_in, ResponseMsg) { + Entry entry := getL1CacheEntry(address, 0); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) || + (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache)); + sequencer.writeCallback(address, + cache_entry.DataBlk, + false, + machineIDToMachineType(in_msg.Sender), + in_msg.InitialRequestTime, + in_msg.ForwardRequestTime, + in_msg.ProbeRequestStartTime); + cache_entry.Dirty := true; + entry.Dirty := true; + entry.DataBlk := cache_entry.DataBlk; + DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk); + } + } + + action(xs1_storeDone, "xs1", desc="Cluster 1 store done") { + peek(responseToCore_in, ResponseMsg) { + Entry entry := getL1CacheEntry(address, 1); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) || + (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache)); + sequencer1.writeCallback(address, + cache_entry.DataBlk, + false, + machineIDToMachineType(in_msg.Sender), + in_msg.InitialRequestTime, + in_msg.ForwardRequestTime, + in_msg.ProbeRequestStartTime); + cache_entry.Dirty := true; + entry.Dirty := true; + entry.DataBlk := cache_entry.DataBlk; + DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk); + } + } + + action(forward_eviction_to_cpu0, "fec0", desc="sends eviction information to processor0") { + if (send_evictions) { + DPRINTF(RubySlicc, "Sending invalidation for %s to the CPU\n", address); + sequencer.evictionCallback(address); + } + } + + action(forward_eviction_to_cpu1, "fec1", desc="sends eviction information to processor1") { + if (send_evictions) { + DPRINTF(RubySlicc, "Sending invalidation for %s to the CPU\n", address); + 
sequencer1.evictionCallback(address); + } + } + + action(ci_copyL2ToL1, "ci", desc="copy L2 data to L1") { + Entry entry := getICacheEntry(address); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + entry.Dirty := cache_entry.Dirty; + entry.DataBlk := cache_entry.DataBlk; + entry.FromL2 := true; + } + + action(c0_copyL2ToL1, "c0", desc="copy L2 data to L1") { + Entry entry := getL1CacheEntry(address, 0); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + entry.Dirty := cache_entry.Dirty; + entry.DataBlk := cache_entry.DataBlk; + entry.FromL2 := true; + } + + action(ss_sendStaleNotification, "ss", desc="stale data; nothing to writeback") { + peek(responseToCore_in, ResponseMsg) { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:StaleNotif; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Response_Control; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + action(c1_copyL2ToL1, "c1", desc="copy L2 data to L1") { + Entry entry := getL1CacheEntry(address, 1); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + entry.Dirty := cache_entry.Dirty; + entry.DataBlk := cache_entry.DataBlk; + entry.FromL2 := true; + } + + action(fi_L2ToL1, "fi", desc="L2 to L1 inst fill") { + enqueue(triggerQueue_out, TriggerMsg, l2_hit_latency) { + out_msg.addr := address; + out_msg.Type := TriggerType:L2_to_L1; + out_msg.Dest := CacheId:L1I; + } + } + + action(f0_L2ToL1, "f0", desc="L2 to L1 data fill") { + enqueue(triggerQueue_out, TriggerMsg, l2_hit_latency) { + out_msg.addr := address; + out_msg.Type := TriggerType:L2_to_L1; + out_msg.Dest := CacheId:L1D0; + } + } + + action(f1_L2ToL1, "f1", desc="L2 to L1 data fill") { + enqueue(triggerQueue_out, TriggerMsg, l2_hit_latency) { + out_msg.addr := address; + out_msg.Type := TriggerType:L2_to_L1; + out_msg.Dest := CacheId:L1D1; + } + } + + action(wi_writeIcache, "wi", desc="write data to icache (and l2)") { + peek(responseToCore_in, ResponseMsg) { + Entry entry := getICacheEntry(address); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + entry.DataBlk := in_msg.DataBlk; + entry.Dirty := in_msg.Dirty; + cache_entry.DataBlk := in_msg.DataBlk; + cache_entry.Dirty := in_msg.Dirty; + } + } + + action(w0_writeDcache, "w0", desc="write data to dcache 0 (and l2)") { + peek(responseToCore_in, ResponseMsg) { + Entry entry := getL1CacheEntry(address, 0); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + entry.DataBlk := in_msg.DataBlk; + entry.Dirty := in_msg.Dirty; + cache_entry.DataBlk := in_msg.DataBlk; + cache_entry.Dirty := in_msg.Dirty; + } + } + + action(w1_writeDcache, "w1", desc="write data to dcache 1 (and l2)") { + peek(responseToCore_in, ResponseMsg) { + Entry entry := getL1CacheEntry(address, 1); + assert(is_valid(entry)); + assert(is_valid(cache_entry)); + entry.DataBlk := in_msg.DataBlk; + entry.Dirty := in_msg.Dirty; + cache_entry.DataBlk := in_msg.DataBlk; + cache_entry.Dirty := in_msg.Dirty; + } + } + + action(wb_data, "wb", desc="write back data") { + peek(responseToCore_in, ResponseMsg) { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUData; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.DataBlk := tbe.DataBlk; + out_msg.Dirty := tbe.Dirty; + if (tbe.Shared) { + out_msg.NbReqShared := true; + } else { 
+ out_msg.NbReqShared := false; + } + out_msg.State := CoherenceState:Shared; // faux info + out_msg.MessageSize := MessageSizeType:Writeback_Data; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket + out_msg.Dirty := false; + out_msg.Hit := false; + out_msg.Ntsl := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + out_msg.isValid := isValid(address); + } + } + + action(pim_sendProbeResponseInvMs, "pim", desc="send probe ack inv, no data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket + out_msg.Dirty := false; + out_msg.Ntsl := true; + out_msg.Hit := false; + APPEND_TRANSITION_COMMENT("Setting Ms"); + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + out_msg.isValid := isValid(address); + } + } + + action(ph_sendProbeResponseHit, "ph", desc="send probe ack PrbShrData, no data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket + assert(addressInCore(address) || is_valid(tbe)); + out_msg.Dirty := false; // only true if sending back data i think + out_msg.Hit := true; + out_msg.Ntsl := false; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + out_msg.isValid := isValid(address); + } + } + + action(pb_sendProbeResponseBackprobe, "pb", desc="send probe ack PrbShrData, no data, check for L1 residence") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket + if (addressInCore(address)) { + out_msg.Hit := true; + } else { + out_msg.Hit := false; + } + out_msg.Dirty := false; // not sending back data, so def. not dirty + out_msg.Ntsl := false; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Control; + out_msg.isValid := isValid(address); + } + } + + action(pd_sendProbeResponseData, "pd", desc="send probe ack, with data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + assert(is_valid(cache_entry)); + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? 
probably not for multisocket + out_msg.DataBlk := cache_entry.DataBlk; + assert(cache_entry.Dirty); + out_msg.Dirty := true; + out_msg.Hit := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.isValid := isValid(address); + } + } + + action(pdm_sendProbeResponseDataMs, "pdm", desc="send probe ack, with data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + assert(is_valid(cache_entry)); + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket + out_msg.DataBlk := cache_entry.DataBlk; + assert(cache_entry.Dirty); + out_msg.Dirty := true; + out_msg.Hit := true; + APPEND_TRANSITION_COMMENT("Setting Ms"); + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.isValid := isValid(address); + } + } + + action(pdt_sendProbeResponseDataFromTBE, "pdt", desc="send probe ack with data") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + assert(is_valid(tbe)); + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.DataBlk := tbe.DataBlk; + assert(tbe.Dirty); + out_msg.Dirty := true; + out_msg.Hit := true; + out_msg.State := CoherenceState:NA; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.isValid := isValid(address); + } + } + + action(ra_sendReplAck, "ra", desc="Send ack to r-buf that line is replaced if needed") { + if (is_invalid(tbe) || tbe.AckNeeded) { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:InvAck; + out_msg.Requestor := machineID; + out_msg.Destination.add(getPeer(machineID)); + out_msg.MessageSize := MessageSizeType:Request_Control; + } + APPEND_TRANSITION_COMMENT(" Sending ack to r-buf "); + } else { + APPEND_TRANSITION_COMMENT(" NOT Sending ack to r-buf "); + } + } + + action(m_markAckNeeded, "m", desc="Mark TBE to send ack when deallocated") { + assert(is_valid(tbe)); + tbe.AckNeeded := true; + } + + action(mc_cancelWB, "mc", desc="send writeback cancel to L3") { + enqueue(responseNetwork_out, ResponseMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:CPUCancelWB; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.Sender := machineID; + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(s_setSharedFlip, "s", desc="hit by shared probe, status may be different") { + assert(is_valid(tbe)); + tbe.Shared := true; + } + + action(uu_sendUnblock, "uu", desc="state changed, unblock") { + enqueue(unblockNetwork_out, UnblockMsg, issue_latency) { + out_msg.addr := address; + out_msg.Destination.add(map_Address_to_Directory(address)); + out_msg.MessageSize := MessageSizeType:Unblock_Control; + out_msg.wasValid := isValid(address); + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(sdv_sendDoneValid, "sdv", desc="Request finished, send done ack") { + enqueue(unblockNetwork_out, UnblockMsg, 1) { + out_msg.addr := address; + out_msg.Destination.add(getPeer(machineID)); + out_msg.DoneAck := true; + out_msg.MessageSize := MessageSizeType:Unblock_Control; + if (is_valid(tbe)) { + out_msg.Dirty := tbe.Dirty; + } else if (is_valid(cache_entry)) { + out_msg.Dirty := cache_entry.Dirty; + } 
else { + out_msg.Dirty := false; + } + out_msg.validToInvalid := false; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(sdi_sendDoneInvalid, "sdi", desc="Request finished, send done ack") { + enqueue(unblockNetwork_out, UnblockMsg, 1) { + out_msg.addr := address; + out_msg.Destination.add(getPeer(machineID)); + out_msg.DoneAck := true; + out_msg.MessageSize := MessageSizeType:Unblock_Control; + if (is_valid(tbe)) { + out_msg.Dirty := tbe.Dirty; + } else if (is_valid(cache_entry)) { + out_msg.Dirty := cache_entry.Dirty; + } else { + out_msg.Dirty := false; + } + out_msg.validToInvalid := true; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(l10m_profileMiss, "l10m", desc="l10m miss profile") { + ++L1D0cache.demand_misses; + } + + action(l11m_profileMiss, "l11m", desc="l11m miss profile") { + ++L1D1cache.demand_misses; + } + + action(l1im_profileMiss, "l1lm", desc="l1im miss profile") { + ++L1Icache.demand_misses; + } + + action(l2m_profileMiss, "l2m", desc="l2m miss profile") { + ++L2cache.demand_misses; + } + + action(yy_recycleProbeQueue, "yy", desc="recycle probe queue") { + probeNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(zz_recycleMandatoryQueue, "\z", desc="recycle mandatory queue") { + mandatoryQueue_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + // END ACTIONS + + // BEGIN TRANSITIONS + + // transitions from base + transition(I, C0_Load_L1miss, I_E0S) {L1D0TagArrayRead, L2TagArrayRead} { + // track misses, if implemented + // since in I state, L2 miss as well + l2m_profileMiss; + l10m_profileMiss; + a0_allocateL1D; + l1im_profileMiss; + a2_allocateL2; + i1_invCluster; + ii_invIcache; + n_issueRdBlk; + p_popMandatoryQueue; + } + + transition(I, C1_Load_L1miss, I_E1S) {L1D1TagArrayRead, L2TagArrayRead} { + // track misses, if implemented + // since in I state, L2 miss as well + l2m_profileMiss; + l11m_profileMiss; + a1_allocateL1D; + a2_allocateL2; + i0_invCluster; + ii_invIcache; + n_issueRdBlk; + p_popMandatoryQueue; + } + + transition(I, Ifetch0_L1miss, S0) {L1ITagArrayRead, L2TagArrayRead} { + // track misses, if implemented + // L2 miss as well + l10m_profileMiss; + l2m_profileMiss; + l1im_profileMiss; + ai_allocateL1I; + a2_allocateL2; + ib_invBothClusters; + nS_issueRdBlkS; + p_popMandatoryQueue; + } + + transition(I, Ifetch1_L1miss, S1) {L1ITagArrayRead, L2TagArrayRead} { + l11m_profileMiss; + // track misses, if implemented + // L2 miss as well + l2m_profileMiss; + l1im_profileMiss; + ai_allocateL1I; + a2_allocateL2; + ib_invBothClusters; + nS_issueRdBlkS; + p_popMandatoryQueue; + } + + transition(I, C0_Store_L1miss, I_M0) {L1D0TagArrayRead,L2TagArrayRead} { + l2m_profileMiss; + l10m_profileMiss; + a0_allocateL1D; + a2_allocateL2; + i1_invCluster; + ii_invIcache; + nM_issueRdBlkM; + p_popMandatoryQueue; + } + + transition(I, C1_Store_L1miss, I_M1) {L1D0TagArrayRead, L2TagArrayRead} { + l2m_profileMiss; + l11m_profileMiss; + a1_allocateL1D; + a2_allocateL2; + i0_invCluster; + ii_invIcache; + nM_issueRdBlkM; + p_popMandatoryQueue; + } + + transition(S, C0_Load_L1miss, S_F0) {L1D0TagArrayRead, L2TagArrayRead, L2DataArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + p_popMandatoryQueue; + } + + transition(S, C1_Load_L1miss, S_F1) {L1D1TagArrayRead, L2TagArrayRead, L2DataArrayRead} { + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + p_popMandatoryQueue; + } + + transition(S, Ifetch0_L1miss, Si_F0) {L1ITagArrayRead,L2TagArrayRead, L2DataArrayRead} { + l1im_profileMiss; + ai_allocateL1I; + 
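+    // fi_L2ToL1 schedules the L1I fill through the trigger queue after
+    // l2_hit_latency; the resulting L2_to_L1I trigger later moves the
+    // line out of Si_F0.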
fi_L2ToL1; + p_popMandatoryQueue; + } + + transition(S, Ifetch1_L1miss, Si_F1) {L1ITagArrayRead, L2TagArrayRead, L2DataArrayRead} { + l1im_profileMiss; + ai_allocateL1I; + fi_L2ToL1; + p_popMandatoryQueue; + } + + transition({S}, {C0_Store_L1hit, C0_Store_L1miss}, S_M0) {L1D0TagArrayRead, L2TagArrayRead}{ + l2m_profileMiss; + l10m_profileMiss; + a0_allocateL1D; + i1_invCluster; + ii_invIcache; + nM_issueRdBlkM; + p_popMandatoryQueue; + } + + transition({S}, {C1_Store_L1hit, C1_Store_L1miss}, S_M1) {L1D1TagArrayRead,L2TagArrayRead} { + l2m_profileMiss; + l11m_profileMiss; + a1_allocateL1D; + i0_invCluster; + ii_invIcache; + nM_issueRdBlkM; + p_popMandatoryQueue; + } + transition(Es, C0_Load_L1miss, Es_F0) {L1D0TagArrayRead, L2TagArrayRead, L2DataArrayRead} { // can this be folded with S_F? + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + p_popMandatoryQueue; + } + + transition(Es, C1_Load_L1miss, Es_F1) {L1D1TagArrayRead, L2TagArrayRead, L2DataArrayRead} { // can this be folded with S_F? + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + p_popMandatoryQueue; + } + + transition(Es, Ifetch0_L1miss, S0) {L1ITagArrayRead, L2TagArrayRead} { + l1im_profileMiss; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + ib_invBothClusters; + nS_issueRdBlkS; + p_popMandatoryQueue; + } + + transition(Es, Ifetch1_L1miss, S1) {L1ITagArrayRead, L2TagArrayRead} { + l1im_profileMiss; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + ib_invBothClusters; + nS_issueRdBlkS; + p_popMandatoryQueue; + } + + // THES SHOULD NOT BE INSTANTANEOUS BUT OH WELL FOR NOW + transition(Es, {C0_Store_L1hit, C0_Store_L1miss}, M0) {L1D0TagArrayWrite,L1D0TagArrayRead, L2TagArrayRead, L1D0DataArrayWrite, L2TagArrayWrite, L2DataArrayWrite} { + a0_allocateL1D; + i1_invCluster; + s0_storeDone; // instantaneous L1/L2 dirty - no writethrough delay + p_popMandatoryQueue; + } + + transition(Es, {C1_Store_L1hit, C1_Store_L1miss}, M1) {L1D1TagArrayRead, L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayWrite, L2DataArrayWrite} { + a1_allocateL1D; + i0_invCluster; + s1_storeDone; + p_popMandatoryQueue; + } + + transition(E0, C0_Load_L1miss, E0_F) {L1D0TagArrayRead, L2TagArrayRead, L2DataArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + p_popMandatoryQueue; + } + + transition(E0, C1_Load_L1miss, E0_Es) {L1D0TagArrayRead, L2TagArrayRead, L2DataArrayRead} { + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + p_popMandatoryQueue; + } + + transition(E0, Ifetch0_L1miss, S0) {L2TagArrayRead, L1ITagArrayRead} { + l2m_profileMiss; // permissions miss, still issue RdBlkS + l1im_profileMiss; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + i0_invCluster; + nS_issueRdBlkS; + p_popMandatoryQueue; + } + + transition(E0, Ifetch1_L1miss, S1) {L2TagArrayRead, L1ITagArrayRead } { + l2m_profileMiss; // permissions miss, still issue RdBlkS + l1im_profileMiss; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + i0_invCluster; + nS_issueRdBlkS; + p_popMandatoryQueue; + } + + transition(E0, {C0_Store_L1hit, C0_Store_L1miss}, M0) {L1D0TagArrayRead, L1D0DataArrayWrite, L1D0TagArrayWrite, L2TagArrayRead, L2DataArrayWrite, L2TagArrayWrite} { + a0_allocateL1D; + s0_storeDone; + p_popMandatoryQueue; + } + + transition(E0, C1_Store_L1miss, M1) {L1D0TagArrayRead, L1D0TagArrayWrite, L2TagArrayRead, L2TagArrayWrite, L2DataArrayWrite} { + a1_allocateL1D; + l11m_profileMiss; + i0_invCluster; + s1_storeDone; + p_popMandatoryQueue; + } + + transition(E1, C1_Load_L1miss, E1_F) {L1D1TagArrayRead, L2TagArrayRead, L2DataArrayRead} { + a1_allocateL1D; + 
l11m_profileMiss; + f1_L2ToL1; + p_popMandatoryQueue; + } + + transition(E1, C0_Load_L1miss, E1_Es) {L1D0TagArrayRead, L2TagArrayRead, L2DataArrayRead} { + a0_allocateL1D; + l10m_profileMiss; + f0_L2ToL1; + p_popMandatoryQueue; + } + + transition(E1, Ifetch1_L1miss, S1) {L2TagArrayRead, L1ITagArrayRead} { + l2m_profileMiss; // permissions miss, still issue RdBlkS + l1im_profileMiss; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + i1_invCluster; + nS_issueRdBlkS; + p_popMandatoryQueue; + } + + transition(E1, Ifetch0_L1miss, S0) {L2TagArrayRead,L1ITagArrayRead} { + l2m_profileMiss; // permissions miss, still issue RdBlkS + l1im_profileMiss; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + i1_invCluster; + nS_issueRdBlkS; + p_popMandatoryQueue; + } + + transition(E1, {C1_Store_L1hit, C1_Store_L1miss}, M1) {L1D1TagArrayRead, L1D1TagArrayWrite, L2TagArrayRead, L2DataArrayWrite, L2TagArrayWrite} { + a1_allocateL1D; + s1_storeDone; + p_popMandatoryQueue; + } + + transition(E1, C0_Store_L1miss, M0) {L1D0TagArrayRead, L1D0TagArrayWrite, L2TagArrayRead, L2TagArrayWrite, L2DataArrayWrite} { + l10m_profileMiss; + a0_allocateL1D; + i1_invCluster; + s0_storeDone; + p_popMandatoryQueue; + } + + transition({O}, {C0_Store_L1hit, C0_Store_L1miss}, O_M0) {L1D0TagArrayRead, L2TagArrayRead} { + l2m_profileMiss; // permissions miss, still issue CtoD + l10m_profileMiss; + a0_allocateL1D; + i1_invCluster; + ii_invIcache; + nM_issueRdBlkM; + p_popMandatoryQueue; + } + + transition({O}, {C1_Store_L1hit, C1_Store_L1miss}, O_M1) {L1D1TagArrayRead, L2TagArrayRead} { + l2m_profileMiss; // permissions miss, still issue RdBlkS + l11m_profileMiss; + a1_allocateL1D; + i0_invCluster; + ii_invIcache; + nM_issueRdBlkM; + p_popMandatoryQueue; + } + + transition(O, C0_Load_L1miss, O_F0) {L2TagArrayRead, L2DataArrayRead, L1D0TagArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + p_popMandatoryQueue; + } + + transition(O, C1_Load_L1miss, O_F1) {L2TagArrayRead, L2DataArrayRead, L1D1TagArrayRead} { + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + p_popMandatoryQueue; + } + + transition(Ms, C0_Load_L1miss, Ms_F0) {L2TagArrayRead, L2DataArrayRead, L1D0TagArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + p_popMandatoryQueue; + } + + transition(Ms, C1_Load_L1miss, Ms_F1) {L2TagArrayRead, L2DataArrayRead, L1D1TagArrayRead} { + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + p_popMandatoryQueue; + } + + transition({Ms, M0, M1, O}, Ifetch0_L1miss, MO_S0) {L1ITagArrayRead, L2TagArrayRead} { + l2m_profileMiss; // permissions miss + l1im_profileMiss; + ai_allocateL1I; + t_allocateTBE; + ib_invBothClusters; + vd_victim; +// i2_invL2; + p_popMandatoryQueue; + } + + transition({Ms, M0, M1, O}, Ifetch1_L1miss, MO_S1) {L1ITagArrayRead L2TagArrayRead } { + l2m_profileMiss; // permissions miss + l10m_profileMiss; + ai_allocateL1I; + t_allocateTBE; + ib_invBothClusters; + vd_victim; +// i2_invL2; + p_popMandatoryQueue; + } + + transition(Ms, {C0_Store_L1hit, C0_Store_L1miss}, M0) {L1D0TagArrayRead, L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayRead, L2DataArrayWrite, L2TagArrayWrite} { + a0_allocateL1D; + i1_invCluster; + s0_storeDone; + p_popMandatoryQueue; + } + + transition(Ms, {C1_Store_L1hit, C1_Store_L1miss}, M1) {L1D1TagArrayRead, L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayRead, L2DataArrayWrite, L2TagArrayWrite} { + a1_allocateL1D; + i0_invCluster; + s1_storeDone; + p_popMandatoryQueue; + } + + transition(M0, C0_Load_L1miss, M0_F) {L1D0TagArrayRead, L2TagArrayRead, L2DataArrayRead} { + 
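+    // The brace list after the transition header names the request types
+    // checked via checkResourceAvailable and recorded via recordRequestType
+    // for this transition; here a cluster-0 load miss in M0 is filled from
+    // the local L2 (f0_L2ToL1) with no request sent to the NB.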
l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + p_popMandatoryQueue; + } + + transition(M0, C1_Load_L1miss, M0_Ms) {L2TagArrayRead, L2DataArrayRead,L1D1TagArrayRead} { + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + p_popMandatoryQueue; + } + + transition(M0, {C0_Store_L1hit, C0_Store_L1miss}) {L1D0TagArrayRead, L1D0DataArrayWrite, L2DataArrayWrite, L2TagArrayRead} { + a0_allocateL1D; + s0_storeDone; + p_popMandatoryQueue; + } + + transition(M0, {C1_Store_L1hit, C1_Store_L1miss}, M1) {L1D0TagArrayRead, L1D0TagArrayWrite, L1D0DataArrayWrite, L2DataArrayWrite, L2TagArrayRead, L2TagArrayWrite} { + a1_allocateL1D; + i0_invCluster; + s1_storeDone; + p_popMandatoryQueue; + } + + transition(M1, C0_Load_L1miss, M1_Ms) {L2TagArrayRead, L2DataArrayRead, L1D0TagArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + p_popMandatoryQueue; + } + + transition(M1, C1_Load_L1miss, M1_F) {L1D1TagArrayRead L2TagArrayRead, L2DataArrayRead} { + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + p_popMandatoryQueue; + } + + transition(M1, {C0_Store_L1hit, C0_Store_L1miss}, M0) {L1D0TagArrayRead, L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayRead, L2DataArrayWrite, L2TagArrayWrite} { + a0_allocateL1D; + i1_invCluster; + s0_storeDone; + p_popMandatoryQueue; + } + + transition(M1, {C1_Store_L1hit, C1_Store_L1miss}) {L1D1TagArrayRead, L1D1DataArrayWrite, L2TagArrayRead, L2DataArrayWrite} { + a1_allocateL1D; + s1_storeDone; + p_popMandatoryQueue; + } + + // end transitions from base + + // Begin simple hit transitions + transition({S, Es, E0, O, Ms, M0, O_F1, S_F1, Si_F0, Si_F1, Es_F1, E0_Es, + Ms_F1, M0_Ms}, C0_Load_L1hit) {L1D0TagArrayRead, L1D0DataArrayRead} { + // track hits, if implemented + l0_loadDone; + p_popMandatoryQueue; + } + + transition({S, Es, E1, O, Ms, M1, O_F0, S_F0, Si_F0, Si_F1, Es_F0, E1_Es, + Ms_F0, M1_Ms}, C1_Load_L1hit) {L1D1TagArrayRead, L1D1DataArrayRead} { + // track hits, if implemented + l1_loadDone; + p_popMandatoryQueue; + } + + transition({S, S_C, S_F0, S_F1, S_F}, Ifetch0_L1hit) {L1ITagArrayRead, L1IDataArrayRead} { + // track hits, if implemented + il0_loadDone; + p_popMandatoryQueue; + } + + transition({S, S_C, S_F0, S_F1, S_F}, Ifetch1_L1hit) {L1ITagArrayRead, L1IDataArrayWrite} { + // track hits, if implemented + il1_loadDone; + p_popMandatoryQueue; + } + + // end simple hit transitions + + // Transitions from transient states + + // recycles + transition({I_M0, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_ES, IF_E0S, IF_ES, + IF0_ES, IF1_ES, S_F0, S_F, O_F0, O_F, S_M0, O_M0, Es_F0, Es_F, E0_F, + E1_Es, Ms_F0, Ms_F, M0_F, M1_Ms}, C0_Load_L1hit) {} { + zz_recycleMandatoryQueue; + } + + transition({IF_E1S, F_S0, F_S1, ES_I, MO_I, MO_S0, MO_S1, Si_F0, Si_F1, S_M1, + O_M1, S0, S1, I_C, S0_C, S1_C, S_C}, C0_Load_L1miss) {} { + zz_recycleMandatoryQueue; + } + + transition({I_M1, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E1S, I_ES, IF_E1S, IF_ES, + IF0_ES, IF1_ES, S_F1, S_F, O_F1, O_F, S_M1, O_M1, Es_F1, Es_F, E1_F, + E0_Es, Ms_F1, Ms_F, M0_Ms, M1_F}, C1_Load_L1hit) {} { + zz_recycleMandatoryQueue; + } + + transition({IF_E0S, F_S0, F_S1, ES_I, MO_I, MO_S0, MO_S1, Si_F0, Si_F1, S_M0, + O_M0, S0, S1, I_C, S0_C, S1_C, S_C}, C1_Load_L1miss) {} { + zz_recycleMandatoryQueue; + } + + transition({F_S0, F_S1, MO_S0, MO_S1, Si_F0, Si_F1, S0, S1, S0_C, S1_C}, {Ifetch0_L1hit, Ifetch1_L1hit}) {} { + zz_recycleMandatoryQueue; + } + + transition({I_M0, I_M1, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_E1S, I_ES, + IF_E0S, IF_E1S, IF_ES, IF0_ES, IF1_ES, ES_I, MO_I, S_F0, S_F1, S_F, + O_F0, O_F1, 
O_F, S_M0, S_M1, O_M0, O_M1, Es_F0, Es_F1, Es_F, E0_F, + E1_F, E0_Es, E1_Es, Ms_F0, Ms_F1, Ms_F, M0_F, M0_Ms, M1_F, M1_Ms, I_C, + S_C}, {Ifetch0_L1miss, Ifetch1_L1miss}) {} { + zz_recycleMandatoryQueue; + } + + transition({I_E1S, IF_E1S, F_S0, F_S1, ES_I, MO_I, MO_S0, MO_S1, S_F1, O_F1, + Si_F0, Si_F1, S_M1, O_M1, S0, S1, Es_F1, E1_F, E0_Es, Ms_F1, M0_Ms, + M1_F, I_C, S0_C, S1_C, S_C}, {C0_Store_L1miss}) {} { + zz_recycleMandatoryQueue; + } + + transition({I_E0S, IF_E0S, F_S0, F_S1, ES_I, MO_I, MO_S0, MO_S1 S_F0, O_F0, + Si_F0, Si_F1, S_M0, O_M0, S0, S1, Es_F0, E0_F, E1_Es, Ms_F0, M0_F, + M1_Ms, I_C, S0_C, S1_C, S_C}, {C1_Store_L1miss}) {} { + zz_recycleMandatoryQueue; + } + + transition({I_M0, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_ES, IF_E0S, IF_ES, + IF0_ES, IF1_ES, S_F0, S_F1, S_F, O_F0, O_F1, O_F, Si_F0, Si_F1, S_M0, O_M0, Es_F0, Es_F1, Es_F, E0_F, E0_Es, E1_Es, Ms_F0, Ms_F1, Ms_F, M0_F, M0_Ms, M1_Ms}, {C0_Store_L1hit}) {} { + zz_recycleMandatoryQueue; + } + + transition({I_M1, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E1S, I_ES, IF_E1S, IF_ES, + IF0_ES, IF1_ES, S_F0, S_F1, S_F, O_F0, O_F1, O_F, Si_F0, Si_F1, S_M1, + O_M1, Es_F0, Es_F1, Es_F, E1_F, E0_Es, E1_Es, Ms_F0, Ms_F1, Ms_F, + M0_Ms, M1_F, M1_Ms}, {C1_Store_L1hit}) {} { + zz_recycleMandatoryQueue; + } + + transition({I_M0, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_ES, IF_E0S, IF_ES, + IF0_ES, IF1_ES, S_F0, S_F, O_F0, O_F, S_M0, O_M0, Es_F0, Es_F, E0_F, + E1_Es, Ms_F0, Ms_F, M0_F, M1_Ms}, L1D0_Repl) {} { + zz_recycleMandatoryQueue; + } + + transition({I_M1, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E1S, I_ES, IF_E1S, IF_ES, + IF0_ES, IF1_ES, S_F1, S_F, O_F1, O_F, S_M1, O_M1, Es_F1, Es_F, E1_F, + E0_Es, Ms_F1, Ms_F, M0_Ms, M1_F}, L1D1_Repl) {} { + zz_recycleMandatoryQueue; + } + + transition({F_S0, F_S1, MO_S0, MO_S1, Si_F0, Si_F1, S0, S1, S0_C, S1_C}, L1I_Repl) {} { + zz_recycleMandatoryQueue; + } + + transition({S_C, S0_C, S1_C, S0, S1, Si_F0, Si_F1, I_M0, I_M1, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_E1S, I_ES, S_F0, S_F1, S_F, O_F0, O_F1, O_F, S_M0, O_M0, S_M1, O_M1, Es_F0, Es_F1, Es_F, E0_F, E1_F, E0_Es, E1_Es, Ms_F0, Ms_F1, Ms_F, M0_F, M0_Ms, M1_F, M1_Ms, MO_S0, MO_S1, IF_E0S, IF_E1S, IF_ES, IF0_ES, IF1_ES, F_S0, F_S1}, L2_Repl) {} { + zz_recycleMandatoryQueue; + } + + transition({IF_E0S, IF_E1S, IF_ES, IF0_ES, IF1_ES, F_S0, F_S1}, {NB_AckS, + PrbInvData, PrbInvDataDemand, PrbInv, PrbShrData, PrbShrDataDemand}) {} { + zz_recycleMandatoryQueue; // these should be resolved soon, but I didn't want to add more states, though technically they could be solved now, and probes really could be solved but i don't think it's really necessary. + } + + transition({IF_E0S, IF_E1S, IF_ES, IF0_ES, IF1_ES}, NB_AckE) {} { + zz_recycleMandatoryQueue; // these should be resolved soon, but I didn't want to add more states, though technically they could be solved now, and probes really could be solved but i don't think it's really necessary. 
+ } + + transition({E0_Es, E1_F, Es_F1}, C0_Load_L1miss, Es_F) {L2DataArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + p_popMandatoryQueue; + } + + transition(S_F1, C0_Load_L1miss, S_F) {L2DataArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + p_popMandatoryQueue; + } + + transition(O_F1, C0_Load_L1miss, O_F) {L2DataArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + p_popMandatoryQueue; + } + + transition({Ms_F1, M0_Ms, M1_F}, C0_Load_L1miss, Ms_F) {L2DataArrayRead} { + l10m_profileMiss; + a0_allocateL1D; + f0_L2ToL1; + p_popMandatoryQueue; + } + + transition(I_M0, C1_Load_L1miss, I_M0Ms){ + l11m_profileMiss; + l2m_profileMiss; + a1_allocateL1D; + p_popMandatoryQueue; + } + + transition(I_M1, C0_Load_L1miss, I_M1Ms){ + l10m_profileMiss; + l2m_profileMiss; + a0_allocateL1D; + p_popMandatoryQueue; + } + + transition(I_M0, C1_Store_L1miss, I_M0M1) { + l11m_profileMiss; + l2m_profileMiss; + a1_allocateL1D; + p_popMandatoryQueue; + } + + transition(I_M1, C0_Store_L1miss, I_M1M0) {L1D0TagArrayRead, L1D0TagArrayWrite, L2TagArrayRead, L2TagArrayWrite} { + l2m_profileMiss; + a0_allocateL1D; + p_popMandatoryQueue; + } + + transition(I_E0S, C1_Load_L1miss, I_ES) {} { + l2m_profileMiss; + l11m_profileMiss; + a1_allocateL1D; + p_popMandatoryQueue; + } + + transition(I_E1S, C0_Load_L1miss, I_ES) {} { + l2m_profileMiss; + l10m_profileMiss; + l2m_profileMiss; + a0_allocateL1D; + p_popMandatoryQueue; + } + + transition({E1_Es, E0_F, Es_F0}, C1_Load_L1miss, Es_F) {L2DataArrayRead} { + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + p_popMandatoryQueue; + } + + transition(S_F0, C1_Load_L1miss, S_F) { L2DataArrayRead} { + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + p_popMandatoryQueue; + } + + transition(O_F0, C1_Load_L1miss, O_F) {L2DataArrayRead} { + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + p_popMandatoryQueue; + } + + transition({Ms_F0, M1_Ms, M0_F}, C1_Load_L1miss, Ms_F) {L2DataArrayRead} { + l11m_profileMiss; + a1_allocateL1D; + f1_L2ToL1; + p_popMandatoryQueue; + } + + transition({S, Es, E0, O, Ms, M0, O_F1, S_F1, Si_F0, Si_F1, Es_F1, E0_Es, Ms_F1, M0_Ms}, L1D0_Repl) {L1D0TagArrayRead} { + i0_invCluster; + } + + transition({S, Es, E1, O, Ms, M1, O_F0, S_F0, Si_F0, Si_F1, Es_F0, E1_Es, Ms_F0, M1_Ms}, L1D1_Repl) {L1D1TagArrayRead} { + i1_invCluster; + } + + transition({S, S_C, S_F0, S_F1}, L1I_Repl) {L1ITagArrayRead} { + ii_invIcache; + } + + transition({S, E0, E1, Es}, L2_Repl, ES_I) {L2TagArrayRead,L1D0TagArrayRead, L1D1TagArrayRead, L1ITagArrayRead} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + t_allocateTBE; + vc_victim; + ib_invBothClusters; + i2_invL2; + ii_invIcache; + } + + transition({Ms, M0, M1, O}, L2_Repl, MO_I) {L2TagArrayRead, L2TagArrayWrite, L1D0TagArrayRead, L1D1TagArrayRead} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + t_allocateTBE; + vd_victim; + i2_invL2; + ib_invBothClusters; // nothing will happen for D0 on M1, vice versa + } + + transition(S0, NB_AckS, S) {L1D0DataArrayWrite, L1D0TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} { + wi_writeIcache; + xi0_loadDone; + uu_sendUnblock; + sdv_sendDoneValid; + pr_popResponseQueue; + } + + transition(S1, NB_AckS, S) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} { + wi_writeIcache; + xi1_loadDone; + sdv_sendDoneValid; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(S0_C, NB_AckS, S_C) { L1IDataArrayWrite,L2DataArrayWrite} { + // does not need send done since the rdblks was "sinked" + wi_writeIcache; + 
xi0_loadDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(S1_C, NB_AckS, S_C) { L1D1DataArrayWrite,L2DataArrayWrite} { + wi_writeIcache; + xi1_loadDone; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(I_M0, NB_AckM, M0) { L1D0DataArrayWrite, L1D0TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} { + w0_writeDcache; + xs0_storeDone; + sdv_sendDoneValid; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(I_M1, NB_AckM, M1) {L1D1DataArrayWrite, L1D1TagArrayWrite,L2DataArrayWrite, L2TagArrayWrite} { + w1_writeDcache; + xs1_storeDone; + sdv_sendDoneValid; + uu_sendUnblock; + pr_popResponseQueue; + } + + // THESE MO->M1 should not be instantaneous but oh well for now. + transition(I_M0M1, NB_AckM, M1) {L1D1DataArrayWrite, L1D1TagArrayWrite,L2DataArrayWrite, L2TagArrayWrite} { + w0_writeDcache; + xs0_storeDone; + sdv_sendDoneValid; + uu_sendUnblock; + i0_invCluster; + s1_storeDone; + pr_popResponseQueue; + } + + transition(I_M1M0, NB_AckM, M0) {L1D0DataArrayWrite, L1D0TagArrayWrite,L2DataArrayWrite, L2TagArrayWrite} { + w1_writeDcache; + xs1_storeDone; + sdv_sendDoneValid; + uu_sendUnblock; + i1_invCluster; + s0_storeDone; + pr_popResponseQueue; + } + + // Above shoudl be more like this, which has some latency to xfer to L1 + transition(I_M0Ms, NB_AckM, M0_Ms) {L1D0DataArrayWrite,L2DataArrayWrite} { + w0_writeDcache; + xs0_storeDone; + sdv_sendDoneValid; + uu_sendUnblock; + f1_L2ToL1; + pr_popResponseQueue; + } + + transition(I_M1Ms, NB_AckM, M1_Ms) {L1D1DataArrayWrite,L2DataArrayWrite} { + w1_writeDcache; + xs1_storeDone; + sdv_sendDoneValid; + uu_sendUnblock; + f0_L2ToL1; + pr_popResponseQueue; + } + + transition(I_E0S, NB_AckE, E0) {L1D0DataArrayWrite, L2DataArrayWrite, L2TagArrayWrite} { + w0_writeDcache; + xl0_loadDone; + sdv_sendDoneValid; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(I_E1S, NB_AckE, E1) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} { + w1_writeDcache; + xl1_loadDone; + sdv_sendDoneValid; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(I_ES, NB_AckE, Es) {L1D1DataArrayWrite, L1D1TagArrayWrite, L1D0DataArrayWrite, L1D0TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite } { + w0_writeDcache; + xl0_loadDone; + w1_writeDcache; + xl1_loadDone; + sdv_sendDoneValid; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(I_E0S, NB_AckS, S) {L1D0DataArrayWrite, L1D0TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} { + w0_writeDcache; + xl0_loadDone; + sdv_sendDoneValid; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(I_E1S, NB_AckS, S) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayWrite, L2DataArrayWrite} { + w1_writeDcache; + xl1_loadDone; + sdv_sendDoneValid; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(I_ES, NB_AckS, S) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayWrite, L2DataArrayWrite} { + w0_writeDcache; + xl0_loadDone; + w1_writeDcache; + xl1_loadDone; + sdv_sendDoneValid; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(S_F0, L2_to_L1D0, S) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c0_copyL2ToL1; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(S_F1, L2_to_L1D1, S) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c1_copyL2ToL1; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(Si_F0, L2_to_L1I, S) {L1ITagArrayWrite, L1IDataArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + ci_copyL2ToL1; + il0_loadDone; + pt_popTriggerQueue; + } + + transition(Si_F1, L2_to_L1I, 
S) {L1ITagArrayWrite, L1IDataArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + ci_copyL2ToL1; + il1_loadDone; + pt_popTriggerQueue; + } + + transition(S_F, L2_to_L1D0, S_F1) { L1D0DataArrayWrite, L2DataArrayRead} { + c0_copyL2ToL1; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(S_F, L2_to_L1D1, S_F0) { L1D1DataArrayWrite, L2DataArrayRead} { + c1_copyL2ToL1; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(O_F0, L2_to_L1D0, O) { L1D0DataArrayWrite, L1D0TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c0_copyL2ToL1; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(O_F1, L2_to_L1D1, O) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c1_copyL2ToL1; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(O_F, L2_to_L1D0, O_F1) { L1D0DataArrayWrite, L2DataArrayRead} { + c0_copyL2ToL1; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(O_F, L2_to_L1D1, O_F0) { L1D1DataArrayWrite, L2DataArrayRead} { + c1_copyL2ToL1; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(M1_F, L2_to_L1D1, M1) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c1_copyL2ToL1; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(M0_F, L2_to_L1D0, M0) {L1D0DataArrayWrite, L1D0TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c0_copyL2ToL1; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(Ms_F0, L2_to_L1D0, Ms) {L1D0DataArrayWrite, L1D0TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c0_copyL2ToL1; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(Ms_F1, L2_to_L1D1, Ms) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c1_copyL2ToL1; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(Ms_F, L2_to_L1D0, Ms_F1) {L1D0DataArrayWrite, L2DataArrayRead} { + c0_copyL2ToL1; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(Ms_F, L2_to_L1D1, Ms_F0) {L1IDataArrayWrite, L2DataArrayRead} { + c1_copyL2ToL1; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(M1_Ms, L2_to_L1D0, Ms) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c0_copyL2ToL1; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(M0_Ms, L2_to_L1D1, Ms) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c1_copyL2ToL1; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(Es_F0, L2_to_L1D0, Es) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c0_copyL2ToL1; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(Es_F1, L2_to_L1D1, Es) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} { + c1_copyL2ToL1; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(Es_F, L2_to_L1D0, Es_F1) {L2TagArrayRead, L2DataArrayRead} { + c0_copyL2ToL1; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(Es_F, L2_to_L1D1, Es_F0) {L2TagArrayRead, L2DataArrayRead} { + c1_copyL2ToL1; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(E0_F, L2_to_L1D0, E0) {L2TagArrayRead, L2DataArrayRead} { + c0_copyL2ToL1; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(E1_F, L2_to_L1D1, E1) {L2TagArrayRead, L2DataArrayRead} { + c1_copyL2ToL1; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(E1_Es, L2_to_L1D0, Es) {L2TagArrayRead, L2DataArrayRead} { + c0_copyL2ToL1; + l0_loadDone; + pt_popTriggerQueue; + } + + transition(E0_Es, L2_to_L1D1, Es) {L2TagArrayRead, L2DataArrayRead} { + c1_copyL2ToL1; + l1_loadDone; + pt_popTriggerQueue; + } + + transition(IF_E0S, L2_to_L1D0, I_E0S) {} { + pt_popTriggerQueue; + } + + 
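[Editor's aside, not part of the diff: the L2_to_L1* transitions above all follow one pattern: copy the block from L2 into the requesting L1 array, mark the pending load done, and pop the trigger queue. The following is a minimal, standalone C++ sketch of that pattern; the names FillDest, Block, and CorePairSketch are illustrative inventions, not gem5 code.]

// Illustrative only: a standalone C++ sketch (not gem5/SLICC) of the
// fill-completion pattern used by the L2_to_L1* transitions above.
#include <array>
#include <cstdint>
#include <deque>
#include <iostream>

enum class FillDest { L1D0, L1D1, L1I };            // which L1 array receives the fill
struct Block { std::array<uint8_t, 64> bytes{}; };  // one 64-byte cache line

struct CorePairSketch {
    Block l2, l1d0, l1d1, l1i;                      // one line per array, for illustration
    std::deque<FillDest> triggerQueue;              // pending L2_to_L1* trigger events

    // Analogous to c*_copyL2ToL1 + *_loadDone + pt_popTriggerQueue.
    void completeFill() {
        FillDest dest = triggerQueue.front();
        switch (dest) {
        case FillDest::L1D0: l1d0 = l2; break;      // c0_copyL2ToL1; l0_loadDone
        case FillDest::L1D1: l1d1 = l2; break;      // c1_copyL2ToL1; l1_loadDone
        case FillDest::L1I:  l1i  = l2; break;      // ci_copyL2ToL1; il*_loadDone
        }
        triggerQueue.pop_front();                   // pt_popTriggerQueue
    }
};

int main() {
    CorePairSketch cp;
    cp.l2.bytes[0] = 0xab;
    cp.triggerQueue.push_back(FillDest::L1D0);
    cp.completeFill();
    std::cout << std::hex << int(cp.l1d0.bytes[0]) << "\n";  // prints "ab"
    return 0;
}

[End of aside; the diff continues below.]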
transition(IF_E1S, L2_to_L1D1, I_E1S) {} { + pt_popTriggerQueue; + } + + transition(IF_ES, L2_to_L1D0, IF1_ES) {} { + pt_popTriggerQueue; + } + + transition(IF_ES, L2_to_L1D1, IF0_ES) {} { + pt_popTriggerQueue; + } + + transition(IF0_ES, L2_to_L1D0, I_ES) {} { + pt_popTriggerQueue; + } + + transition(IF1_ES, L2_to_L1D1, I_ES) {} { + pt_popTriggerQueue; + } + + transition(F_S0, L2_to_L1I, S0) {} { + pt_popTriggerQueue; + } + + transition(F_S1, L2_to_L1I, S1) {} { + pt_popTriggerQueue; + } + + transition({S_M0, O_M0}, NB_AckM, M0) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2DataArrayWrite, L2TagArrayWrite} { + xs0_storeDone; + sdv_sendDoneValid; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition({S_M1, O_M1}, NB_AckM, M1) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2DataArrayWrite, L2TagArrayWrite} { + xs1_storeDone; + sdv_sendDoneValid; + uu_sendUnblock; + pr_popResponseQueue; + } + + transition(MO_I, NB_AckWB, I) {L2TagArrayWrite} { + wb_data; + ra_sendReplAck; + sdi_sendDoneInvalid; + d_deallocateTBE; + pr_popResponseQueue; + } + + transition(ES_I, NB_AckWB, I) {L2TagArrayWrite} { + wb_data; + ra_sendReplAck; + sdi_sendDoneInvalid; + d_deallocateTBE; + pr_popResponseQueue; + } + + transition(MO_S0, NB_AckWB, S0) {L2TagArrayWrite} { + wb_data; + i2_invL2; + a2_allocateL2; + sdv_sendDoneValid; + nS_issueRdBlkS; + d_deallocateTBE; // FOO + pr_popResponseQueue; + } + + transition(MO_S1, NB_AckWB, S1) {L2TagArrayWrite} { + wb_data; + i2_invL2; + a2_allocateL2; + sdv_sendDoneValid; + nS_issueRdBlkS; + d_deallocateTBE; // FOO + pr_popResponseQueue; + } + + // Writeback cancel "ack" + transition(I_C, NB_AckWB, I) {L2TagArrayWrite} { + ss_sendStaleNotification; + sdi_sendDoneInvalid; + d_deallocateTBE; + pr_popResponseQueue; + } + + transition(S0_C, NB_AckWB, S0) {L2TagArrayWrite} { + ss_sendStaleNotification; + sdv_sendDoneValid; + pr_popResponseQueue; + } + + transition(S1_C, NB_AckWB, S1) {L2TagArrayWrite} { + ss_sendStaleNotification; + sdv_sendDoneValid; + pr_popResponseQueue; + } + + transition(S_C, NB_AckWB, S) {L2TagArrayWrite} { + ss_sendStaleNotification; + sdv_sendDoneValid; + pr_popResponseQueue; + } + + // Begin Probe Transitions + + transition({Ms, M0, M1, O}, {PrbInvData, PrbInvDataDemand}, I) {L2TagArrayRead, L2TagArrayWrite, L2DataArrayRead} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pd_sendProbeResponseData; + i2_invL2; + ib_invBothClusters; + pp_popProbeQueue; + } + + transition({Es, E0, E1, S, I}, {PrbInvData, PrbInvDataDemand}, I) {L2TagArrayRead, L2TagArrayWrite} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + i2_invL2; + ib_invBothClusters; + ii_invIcache; // only relevant for S + pp_popProbeQueue; + } + + transition(S_C, {PrbInvData, PrbInvDataDemand}, I_C) {L2TagArrayWrite} { + t_allocateTBE; + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + i2_invL2; + ib_invBothClusters; + ii_invIcache; + pp_popProbeQueue; + } + + transition(I_C, {PrbInvData, PrbInvDataDemand}, I_C) {} { + pi_sendProbeResponseInv; + ib_invBothClusters; + pp_popProbeQueue; + } + + transition({Ms, M0, M1, O, Es, E0, E1, S, I}, PrbInv, I) {L2TagArrayRead, L2TagArrayWrite} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + i2_invL2; // nothing will happen in I + ib_invBothClusters; + ii_invIcache; + pp_popProbeQueue; + } + + transition(S_C, PrbInv, I_C) {L2TagArrayWrite} { + t_allocateTBE; + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + 
i2_invL2; + ib_invBothClusters; + ii_invIcache; + pp_popProbeQueue; + } + + transition(I_C, PrbInv, I_C) {} { + pi_sendProbeResponseInv; + ib_invBothClusters; + ii_invIcache; + pp_popProbeQueue; + } + + transition({Ms, M0, M1, O}, {PrbShrData, PrbShrDataDemand}, O) {L2TagArrayRead, L2TagArrayWrite, L2DataArrayRead} { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + transition({Es, E0, E1, S}, {PrbShrData, PrbShrDataDemand}, S) {L2TagArrayRead, L2TagArrayWrite} { + ph_sendProbeResponseHit; + pp_popProbeQueue; + } + + transition(S_C, {PrbShrData, PrbShrDataDemand}) {} { + ph_sendProbeResponseHit; + pp_popProbeQueue; + } + + transition({I, I_C}, {PrbShrData, PrbShrDataDemand}) {L2TagArrayRead} { + pb_sendProbeResponseBackprobe; + pp_popProbeQueue; + } + + transition({I_M0, I_E0S}, {PrbInv, PrbInvData, PrbInvDataDemand}) {} { + pi_sendProbeResponseInv; + ib_invBothClusters; // must invalidate current data (only relevant for I_M0) + a0_allocateL1D; // but make sure there is room for incoming data when it arrives + pp_popProbeQueue; + } + + transition({I_M1, I_E1S}, {PrbInv, PrbInvData, PrbInvDataDemand}) {} { + pi_sendProbeResponseInv; + ib_invBothClusters; // must invalidate current data (only relevant for I_M1) + a1_allocateL1D; // but make sure there is room for incoming data when it arrives + pp_popProbeQueue; + } + + transition({I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_ES}, {PrbInv, PrbInvData, PrbInvDataDemand, PrbShrData, PrbShrDataDemand}) {} { + pi_sendProbeResponseInv; + ib_invBothClusters; + a0_allocateL1D; + a1_allocateL1D; + pp_popProbeQueue; + } + + transition({I_M0, I_E0S, I_M1, I_E1S}, {PrbShrData, PrbShrDataDemand}) {} { + pb_sendProbeResponseBackprobe; + pp_popProbeQueue; + } + + transition(ES_I, {PrbInvData, PrbInvDataDemand}, I_C) {} { + pi_sendProbeResponseInv; + ib_invBothClusters; + ii_invIcache; + pp_popProbeQueue; + } + + transition(MO_I, {PrbInvData, PrbInvDataDemand}, I_C) {} { + pdt_sendProbeResponseDataFromTBE; + ib_invBothClusters; + ii_invIcache; + pp_popProbeQueue; + } + + transition(MO_I, PrbInv, I_C) {} { + pi_sendProbeResponseInv; + ib_invBothClusters; + ii_invIcache; + pp_popProbeQueue; + } + + transition(ES_I, PrbInv, I_C) {} { + pi_sendProbeResponseInv; + ib_invBothClusters; + ii_invIcache; + pp_popProbeQueue; + } + + transition(ES_I, {PrbShrData, PrbShrDataDemand}, ES_I) {} { + ph_sendProbeResponseHit; + s_setSharedFlip; + pp_popProbeQueue; + } + + transition(MO_I, {PrbShrData, PrbShrDataDemand}, MO_I) {} { + pdt_sendProbeResponseDataFromTBE; + s_setSharedFlip; + pp_popProbeQueue; + } + + transition(MO_S0, {PrbInvData, PrbInvDataDemand}, S0_C) {L2TagArrayWrite} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pdt_sendProbeResponseDataFromTBE; + i2_invL2; + a2_allocateL2; + nS_issueRdBlkS; + d_deallocateTBE; + pp_popProbeQueue; + } + + transition(MO_S1, {PrbInvData, PrbInvDataDemand}, S1_C) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pdt_sendProbeResponseDataFromTBE; + i2_invL2; + a2_allocateL2; + nS_issueRdBlkS; + d_deallocateTBE; + pp_popProbeQueue; + } + + transition(MO_S0, PrbInv, S0_C) {L2TagArrayWrite} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + i2_invL2; + a2_allocateL2; + nS_issueRdBlkS; + d_deallocateTBE; + pp_popProbeQueue; + } + + transition(MO_S1, PrbInv, S1_C) {L2TagArrayWrite} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + i2_invL2; + a2_allocateL2; + nS_issueRdBlkS; + d_deallocateTBE; + pp_popProbeQueue; + } + + transition({MO_S0, 
MO_S1}, {PrbShrData, PrbShrDataDemand}) {} { + pdt_sendProbeResponseDataFromTBE; + s_setSharedFlip; + pp_popProbeQueue; + } + + transition({S_F0, Es_F0, E0_F, E1_Es}, {PrbInvData, PrbInvDataDemand, PrbInv}, IF_E0S) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + // invalidate everything you've got + ib_invBothClusters; + ii_invIcache; + i2_invL2; + // but make sure you have room for what you need from the fill + a0_allocateL1D; + a2_allocateL2; + n_issueRdBlk; + pp_popProbeQueue; + } + + transition({S_F1, Es_F1, E1_F, E0_Es}, {PrbInvData, PrbInvDataDemand, PrbInv}, IF_E1S) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + // invalidate everything you've got + ib_invBothClusters; + ii_invIcache; + i2_invL2; + // but make sure you have room for what you need from the fill + a1_allocateL1D; + a2_allocateL2; + n_issueRdBlk; + pp_popProbeQueue; + } + + transition({S_F, Es_F}, {PrbInvData, PrbInvDataDemand, PrbInv}, IF_ES) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + // invalidate everything you've got + ib_invBothClusters; + ii_invIcache; + i2_invL2; + // but make sure you have room for what you need from the fill + a0_allocateL1D; + a1_allocateL1D; + a2_allocateL2; + n_issueRdBlk; + pp_popProbeQueue; + } + + transition(Si_F0, {PrbInvData, PrbInvDataDemand, PrbInv}, F_S0) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + nS_issueRdBlkS; + pp_popProbeQueue; + } + + transition(Si_F1, {PrbInvData, PrbInvDataDemand, PrbInv}, F_S1) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + nS_issueRdBlkS; + pp_popProbeQueue; + } + + transition({Es_F0, E0_F, E1_Es}, {PrbShrData, PrbShrDataDemand}, S_F0) {} { + ph_sendProbeResponseHit; + pp_popProbeQueue; + } + + transition({Es_F1, E1_F, E0_Es}, {PrbShrData, PrbShrDataDemand}, S_F1) {} { + ph_sendProbeResponseHit; + pp_popProbeQueue; + } + + transition(Es_F, {PrbShrData, PrbShrDataDemand}, S_F) {} { + ph_sendProbeResponseHit; + pp_popProbeQueue; + } + + transition({S_F0, S_F1, S_F, Si_F0, Si_F1}, {PrbShrData, PrbShrDataDemand}) {} { + ph_sendProbeResponseHit; + pp_popProbeQueue; + } + + transition(S_M0, {PrbInvData, PrbInvDataDemand}, I_M0) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pim_sendProbeResponseInvMs; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + a0_allocateL1D; + a2_allocateL2; + pp_popProbeQueue; + } + + transition(O_M0, {PrbInvData, PrbInvDataDemand}, I_M0) {L2DataArrayRead} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pdm_sendProbeResponseDataMs; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + a0_allocateL1D; + a2_allocateL2; + pp_popProbeQueue; + } + + transition({S_M0, O_M0}, {PrbInv}, I_M0) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pim_sendProbeResponseInvMs; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + a0_allocateL1D; + a2_allocateL2; + pp_popProbeQueue; + } + + transition(S_M1, {PrbInvData, PrbInvDataDemand}, I_M1) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pim_sendProbeResponseInvMs; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + a1_allocateL1D; + a2_allocateL2; + pp_popProbeQueue; + } + + transition(O_M1, {PrbInvData, PrbInvDataDemand}, I_M1) {} { + forward_eviction_to_cpu0; 
+ forward_eviction_to_cpu1; + pdm_sendProbeResponseDataMs; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + a1_allocateL1D; + a2_allocateL2; + pp_popProbeQueue; + } + + transition({S_M1, O_M1}, {PrbInv}, I_M1) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pim_sendProbeResponseInvMs; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + a1_allocateL1D; + a2_allocateL2; + pp_popProbeQueue; + } + + transition({S0, S0_C}, {PrbInvData, PrbInvDataDemand, PrbInv}) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + pp_popProbeQueue; + } + + transition({S1, S1_C}, {PrbInvData, PrbInvDataDemand, PrbInv}) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + ib_invBothClusters; + ii_invIcache; + i2_invL2; + ai_allocateL1I; + a2_allocateL2; + pp_popProbeQueue; + } + + transition({S_M0, S_M1}, {PrbShrData, PrbShrDataDemand}) {} { + ph_sendProbeResponseHit; + pp_popProbeQueue; + } + + transition({O_M0, O_M1}, {PrbShrData, PrbShrDataDemand}) {L2DataArrayRead} { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + transition({S0, S1, S0_C, S1_C}, {PrbShrData, PrbShrDataDemand}) {} { + pb_sendProbeResponseBackprobe; + pp_popProbeQueue; + } + + transition({Ms_F0, M0_F, M1_Ms, O_F0}, {PrbInvData, PrbInvDataDemand}, IF_E0S) {L2DataArrayRead} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pd_sendProbeResponseData; + ib_invBothClusters; + i2_invL2; + a0_allocateL1D; + a2_allocateL2; + n_issueRdBlk; + pp_popProbeQueue; + } + + transition({Ms_F1, M1_F, M0_Ms, O_F1}, {PrbInvData, PrbInvDataDemand}, IF_E1S) {L2DataArrayRead} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pd_sendProbeResponseData; + ib_invBothClusters; + i2_invL2; + a1_allocateL1D; + a2_allocateL2; + n_issueRdBlk; + pp_popProbeQueue; + } + + transition({Ms_F, O_F}, {PrbInvData, PrbInvDataDemand}, IF_ES) {L2DataArrayRead} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pd_sendProbeResponseData; + ib_invBothClusters; + i2_invL2; + a0_allocateL1D; + a1_allocateL1D; + a2_allocateL2; + n_issueRdBlk; + pp_popProbeQueue; + } + + transition({Ms_F0, M0_F, M1_Ms, O_F0}, PrbInv, IF_E0S) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + ib_invBothClusters; + i2_invL2; + a0_allocateL1D; + a2_allocateL2; + n_issueRdBlk; + pp_popProbeQueue; + } + + transition({Ms_F1, M1_F, M0_Ms, O_F1}, PrbInv, IF_E1S) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + ib_invBothClusters; + i2_invL2; + a1_allocateL1D; + a2_allocateL2; + n_issueRdBlk; + pp_popProbeQueue; + } + + transition({Ms_F, O_F}, PrbInv, IF_ES) {} { + forward_eviction_to_cpu0; + forward_eviction_to_cpu1; + pi_sendProbeResponseInv; + ib_invBothClusters; + i2_invL2; + a0_allocateL1D; + a1_allocateL1D; + a2_allocateL2; + n_issueRdBlk; + pp_popProbeQueue; + } + + transition({Ms_F0, M0_F, M1_Ms}, {PrbShrData, PrbShrDataDemand}, O_F0) {L2DataArrayRead} { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + transition({Ms_F1, M1_F, M0_Ms}, {PrbShrData, PrbShrDataDemand}, O_F1) {} { + } + + transition({Ms_F}, {PrbShrData, PrbShrDataDemand}, O_F) {L2DataArrayRead} { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + transition({O_F0, O_F1, O_F}, {PrbShrData, PrbShrDataDemand}) {L2DataArrayRead} { + pd_sendProbeResponseData; + pp_popProbeQueue; + } + + // END TRANSITIONS +} + + diff --git 
a/src/mem/protocol/MOESI_AMD_Base-Region-dir.sm b/src/mem/protocol/MOESI_AMD_Base-Region-dir.sm new file mode 100644 index 000000000..52d87fb8b --- /dev/null +++ b/src/mem/protocol/MOESI_AMD_Base-Region-dir.sm @@ -0,0 +1,2038 @@ +/* + * Copyright (c) 2010-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Lisa Hsu + */ + +machine(MachineType:Directory, "AMD_Base-like protocol") +: DirectoryMemory * directory; + CacheMemory * L3CacheMemory; + Cycles response_latency := 5; + Cycles response_latency_regionDir := 1; + Cycles l3_hit_latency := 30; + bool useL3OnWT := "False"; + Cycles to_memory_controller_latency := 1; + + // From the Cores + MessageBuffer * requestFromCores, network="From", virtual_network="0", vnet_type="request"; + MessageBuffer * responseFromCores, network="From", virtual_network="2", vnet_type="response"; + MessageBuffer * unblockFromCores, network="From", virtual_network="4", vnet_type="unblock"; + + // To the Cores + MessageBuffer * probeToCore, network="To", virtual_network="0", vnet_type="request"; + MessageBuffer * responseToCore, network="To", virtual_network="2", vnet_type="response"; + + // From region buffer + MessageBuffer * reqFromRegBuf, network="From", virtual_network="7", vnet_type="request"; + + // To Region directory + MessageBuffer * reqToRegDir, network="To", virtual_network="5", vnet_type="request"; + MessageBuffer * reqFromRegDir, network="From", virtual_network="5", vnet_type="request"; + MessageBuffer * unblockToRegDir, network="To", virtual_network="4", vnet_type="unblock"; + + MessageBuffer * triggerQueue; + MessageBuffer * L3triggerQueue; + MessageBuffer * responseFromMemory; +{ + // STATES + state_declaration(State, desc="Directory states", default="Directory_State_U") { + U, AccessPermission:Backing_Store, desc="unblocked"; + BR, AccessPermission:Backing_Store, desc="got CPU read request, blocked while sent to L3"; + BW, AccessPermission:Backing_Store, desc="got CPU write request, blocked while sent to L3"; + BL, AccessPermission:Busy, desc="got L3 WB request"; + // BL is Busy because it's possible for the data only to be in the network + // in the WB, L3 has sent it and gone on with its business in possibly I + // state. 
+ BI, AccessPermission:Backing_Store, desc="Blocked waiting for inv ack from core"; + BS_M, AccessPermission:Backing_Store, desc="blocked waiting for memory"; + BM_M, AccessPermission:Backing_Store, desc="blocked waiting for memory"; + B_M, AccessPermission:Backing_Store, desc="blocked waiting for memory"; + BP, AccessPermission:Backing_Store, desc="blocked waiting for probes, no need for memory"; + BS_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory"; + BM_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory"; + B_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory"; + BS_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + BM_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + B_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + B, AccessPermission:Backing_Store, desc="sent response, Blocked til ack"; + + // These are needed for when a private requests was issued before an inv was received + // for writebacks + BS_Pm_BL, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + BM_Pm_BL, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + B_Pm_BL, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + BP_BL, AccessPermission:Backing_Store, desc="blocked waiting for probes, no need for memory"; + // for reads + BS_Pm_B, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + BM_Pm_B, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + B_Pm_B, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + BP_B, AccessPermission:Backing_Store, desc="blocked waiting for probes, no need for memory"; + } + + // Events + enumeration(Event, desc="Directory events") { + // CPU requests + RdBlkS, desc="..."; + RdBlkM, desc="..."; + RdBlk, desc="..."; + WriteThrough, desc="WriteThrough Message"; + Atomic, desc="Atomic Message"; + + RdBlkSP, desc="..."; + RdBlkMP, desc="..."; + RdBlkP, desc="..."; + VicDirtyP, desc="..."; + VicCleanP, desc="..."; + WriteThroughP, desc="WriteThrough Message"; + AtomicP, desc="Atomic Message"; + + // writebacks + VicDirty, desc="..."; + VicClean, desc="..."; + CPUData, desc="WB data from CPU"; + StaleWB, desc="WB response for a no longer valid request"; + + // probe responses + CPUPrbResp, desc="Probe Response Msg"; + LastCPUPrbResp, desc="Last Probe Response Msg"; + + ProbeAcksComplete, desc="Probe Acks Complete"; + + L3Hit, desc="Hit in L3 return data to core"; + + // Memory Controller + MemData, desc="Fetched data from memory arrives"; + WBAck, desc="Writeback Ack from memory arrives"; + + CoreUnblock, desc="Core received data, unblock"; + UnblockWriteThrough, desc="unblock, self triggered"; + + StaleVicDirty, desc="Core invalidated before VicDirty processed"; + StaleVicDirtyP, desc="Core invalidated before VicDirty processed"; + + // For region protocol + CPUReq, desc="Generic CPU request"; + Inv, desc="Region dir needs a block invalidated"; + Downgrade, desc="Region dir needs a block downgraded"; + + // For private accesses (bypassed reg-dir) + CPUReadP, desc="Initial req from core, sent to L3"; + CPUWriteP, desc="Initial req from core, sent to L3"; + } + + enumeration(RequestType, desc="To communicate stats from transitions to recordStats") { + L3DataArrayRead, 
desc="Read the data array"; + L3DataArrayWrite, desc="Write the data array"; + L3TagArrayRead, desc="Read the data array"; + L3TagArrayWrite, desc="Write the data array"; + } + + // TYPES + + // DirectoryEntry + structure(Entry, desc="...", interface="AbstractEntry") { + State DirectoryState, desc="Directory state"; + DataBlock DataBlk, desc="data for the block"; + NetDest VicDirtyIgnore, desc="VicDirty coming from whom to ignore"; + } + + structure(CacheEntry, desc="...", interface="AbstractCacheEntry") { + DataBlock DataBlk, desc="data for the block"; + MachineID LastSender, desc="Mach which this block came from"; + } + + structure(TBE, desc="...") { + State TBEState, desc="Transient state"; + DataBlock DataBlk, desc="data for the block"; + DataBlock DataBlkAux, desc="Auxiliary data for the block"; + bool Dirty, desc="Is the data dirty?"; + int NumPendingAcks, desc="num acks expected"; + MachineID OriginalRequestor, desc="Original Requestor"; + MachineID WTRequestor, desc="WT Requestor"; + bool Cached, desc="data hit in Cache"; + bool MemData, desc="Got MemData?",default="false"; + bool wtData, desc="Got write through data?",default="false"; + bool atomicData, desc="Got Atomic op?",default="false"; + Cycles InitialRequestTime, desc="..."; + Cycles ForwardRequestTime, desc="..."; + Cycles ProbeRequestStartTime, desc="..."; + bool DemandRequest, desc="for profiling"; + MachineID LastSender, desc="Mach which this block came from"; + bool L3Hit, default="false", desc="Was this an L3 hit?"; + bool TriggeredAcksComplete, default="false", desc="True if already triggered acks complete"; + WriteMask writeMask, desc="outstanding write through mask"; + } + + structure(TBETable, external="yes") { + TBE lookup(Addr); + void allocate(Addr); + void deallocate(Addr); + bool isPresent(Addr); + } + + TBETable TBEs, template="<Directory_TBE>", constructor="m_number_of_TBEs"; + + Tick clockEdge(); + Tick cyclesToTicks(Cycles c); + + void set_tbe(TBE a); + void unset_tbe(); + void wakeUpAllBuffers(); + void wakeUpBuffers(Addr a); + Cycles curCycle(); + + Entry getDirectoryEntry(Addr addr), return_by_pointer="yes" { + Entry dir_entry := static_cast(Entry, "pointer", directory.lookup(addr)); + + if (is_valid(dir_entry)) { + //DPRINTF(RubySlicc, "Getting entry %s: %s\n", addr, dir_entry.DataBlk); + return dir_entry; + } + + dir_entry := static_cast(Entry, "pointer", + directory.allocate(addr, new Entry)); + return dir_entry; + } + + DataBlock getDataBlock(Addr addr), return_by_ref="yes" { + TBE tbe := TBEs.lookup(addr); + if (is_valid(tbe) && tbe.MemData) { + DPRINTF(RubySlicc, "Returning DataBlk from TBE %s:%s\n", addr, tbe); + return tbe.DataBlk; + } + DPRINTF(RubySlicc, "Returning DataBlk from Dir %s:%s\n", addr, getDirectoryEntry(addr)); + return getDirectoryEntry(addr).DataBlk; + } + + State getState(TBE tbe, CacheEntry entry, Addr addr) { + return getDirectoryEntry(addr).DirectoryState; + } + + State getStateFromAddr(Addr addr) { + return getDirectoryEntry(addr).DirectoryState; + } + + void setState(TBE tbe, CacheEntry entry, Addr addr, State state) { + getDirectoryEntry(addr).DirectoryState := state; + } + + AccessPermission getAccessPermission(Addr addr) { + // For this Directory, all permissions are just tracked in Directory, since + // it's not possible to have something in TBE but not Dir, just keep track + // of state all in one place. 
+ if(directory.isPresent(addr)) { + return Directory_State_to_permission(getDirectoryEntry(addr).DirectoryState); + } + + return AccessPermission:NotPresent; + } + + void functionalRead(Addr addr, Packet *pkt) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + testAndRead(addr, tbe.DataBlk, pkt); + } else { + functionalMemoryRead(pkt); + } + } + + int functionalWrite(Addr addr, Packet *pkt) { + int num_functional_writes := 0; + + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + num_functional_writes := num_functional_writes + + testAndWrite(addr, tbe.DataBlk, pkt); + } + + num_functional_writes := num_functional_writes + functionalMemoryWrite(pkt); + return num_functional_writes; + } + + void setAccessPermission(CacheEntry entry, Addr addr, State state) { + getDirectoryEntry(addr).changePermission(Directory_State_to_permission(state)); + } + + void recordRequestType(RequestType request_type, Addr addr) { + if (request_type == RequestType:L3DataArrayRead) { + L3CacheMemory.recordRequestType(CacheRequestType:DataArrayRead, addr); + } else if (request_type == RequestType:L3DataArrayWrite) { + L3CacheMemory.recordRequestType(CacheRequestType:DataArrayWrite, addr); + } else if (request_type == RequestType:L3TagArrayRead) { + L3CacheMemory.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:L3TagArrayWrite) { + L3CacheMemory.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } + } + + bool checkResourceAvailable(RequestType request_type, Addr addr) { + if (request_type == RequestType:L3DataArrayRead) { + return L3CacheMemory.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L3DataArrayWrite) { + return L3CacheMemory.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L3TagArrayRead) { + return L3CacheMemory.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:L3TagArrayWrite) { + return L3CacheMemory.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else { + error("Invalid RequestType type in checkResourceAvailable"); + return true; + } + } + + // ** OUT_PORTS ** + out_port(probeNetwork_out, NBProbeRequestMsg, probeToCore); + out_port(responseNetwork_out, ResponseMsg, responseToCore); + + out_port(requestNetworkReg_out, CPURequestMsg, reqToRegDir); + out_port(regAckNetwork_out, UnblockMsg, unblockToRegDir); + + out_port(triggerQueue_out, TriggerMsg, triggerQueue); + out_port(L3TriggerQueue_out, TriggerMsg, L3triggerQueue); + + // ** IN_PORTS ** + + // Trigger Queue + in_port(triggerQueue_in, TriggerMsg, triggerQueue, rank=7) { + if (triggerQueue_in.isReady(clockEdge())) { + peek(triggerQueue_in, TriggerMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + if (in_msg.Type == TriggerType:AcksComplete) { + trigger(Event:ProbeAcksComplete, in_msg.addr, entry, tbe); + } else if (in_msg.Type == TriggerType:UnblockWriteThrough) { + trigger(Event:UnblockWriteThrough, in_msg.addr, entry, tbe); + } else { + error("Unknown trigger msg"); + } + } + } + } + + in_port(L3TriggerQueue_in, TriggerMsg, L3triggerQueue, rank=6) { + if (L3TriggerQueue_in.isReady(clockEdge())) { + peek(L3TriggerQueue_in, TriggerMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + if (in_msg.Type == TriggerType:L3Hit) { + trigger(Event:L3Hit, 
in_msg.addr, entry, tbe); + } else { + error("Unknown trigger msg"); + } + } + } + } + + // Unblock Network + in_port(unblockNetwork_in, UnblockMsg, unblockFromCores, rank=5) { + if (unblockNetwork_in.isReady(clockEdge())) { + peek(unblockNetwork_in, UnblockMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + trigger(Event:CoreUnblock, in_msg.addr, entry, tbe); + } + } + } + + // Core response network + in_port(responseNetwork_in, ResponseMsg, responseFromCores, rank=4) { + if (responseNetwork_in.isReady(clockEdge())) { + peek(responseNetwork_in, ResponseMsg) { + DPRINTF(RubySlicc, "core responses %s\n", in_msg); + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + if (in_msg.Type == CoherenceResponseType:CPUPrbResp) { + if (is_valid(tbe) && tbe.NumPendingAcks == 1 + && tbe.TriggeredAcksComplete == false) { + trigger(Event:LastCPUPrbResp, in_msg.addr, entry, tbe); + } else { + trigger(Event:CPUPrbResp, in_msg.addr, entry, tbe); + } + } else if (in_msg.Type == CoherenceResponseType:CPUData) { + trigger(Event:CPUData, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceResponseType:StaleNotif) { + trigger(Event:StaleWB, in_msg.addr, entry, tbe); + } else { + error("Unexpected response type"); + } + } + } + } + + // off-chip memory request/response is done + in_port(memQueue_in, MemoryMsg, responseFromMemory, rank=3) { + if (memQueue_in.isReady(clockEdge())) { + peek(memQueue_in, MemoryMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + if (in_msg.Type == MemoryRequestType:MEMORY_READ) { + trigger(Event:MemData, in_msg.addr, entry, tbe); + DPRINTF(RubySlicc, "%s\n", in_msg); + } else if (in_msg.Type == MemoryRequestType:MEMORY_WB) { + trigger(Event:WBAck, in_msg.addr, entry, tbe); // ignore WBAcks, don't care about them. 
+ } else { + DPRINTF(RubySlicc, "%s\n", in_msg.Type); + error("Invalid message"); + } + } + } + } + + in_port(regBuf_in, CPURequestMsg, reqFromRegBuf, rank=2) { + if (regBuf_in.isReady(clockEdge())) { + peek(regBuf_in, CPURequestMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + if (in_msg.Type == CoherenceRequestType:ForceInv) { + trigger(Event:Inv, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:ForceDowngrade) { + trigger(Event:Downgrade, in_msg.addr, entry, tbe); + } else { + error("Bad request from region buffer"); + } + } + } + } + + in_port(regDir_in, CPURequestMsg, reqFromRegDir, rank=1) { + if (regDir_in.isReady(clockEdge())) { + peek(regDir_in, CPURequestMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + if (in_msg.Type == CoherenceRequestType:RdBlk) { + trigger(Event:RdBlk, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:RdBlkS) { + trigger(Event:RdBlkS, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:RdBlkM) { + trigger(Event:RdBlkM, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:Atomic) { + trigger(Event:Atomic, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:WriteThrough) { + trigger(Event:WriteThrough, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:VicDirty) { + if (getDirectoryEntry(in_msg.addr).VicDirtyIgnore.isElement(in_msg.Requestor)) { + DPRINTF(RubySlicc, "Dropping VicDirty for address %s\n", in_msg.addr); + trigger(Event:StaleVicDirty, in_msg.addr, entry, tbe); + } else { + trigger(Event:VicDirty, in_msg.addr, entry, tbe); + } + } else if (in_msg.Type == CoherenceRequestType:VicClean) { + if (getDirectoryEntry(in_msg.addr).VicDirtyIgnore.isElement(in_msg.Requestor)) { + DPRINTF(RubySlicc, "Dropping VicClean for address %s\n", in_msg.addr); + trigger(Event:StaleVicDirty, in_msg.addr, entry, tbe); + } else { + trigger(Event:VicClean, in_msg.addr, entry, tbe); + } + } else { + error("Bad message type fwded from Region Dir"); + } + } + } + } + + in_port(requestNetwork_in, CPURequestMsg, requestFromCores, rank=0) { + if (requestNetwork_in.isReady(clockEdge())) { + peek(requestNetwork_in, CPURequestMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + if (in_msg.Private) { + // Bypass the region dir + if (in_msg.Type == CoherenceRequestType:RdBlk) { + trigger(Event:RdBlkP, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:RdBlkS) { + trigger(Event:RdBlkSP, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:RdBlkM) { + trigger(Event:RdBlkMP, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:Atomic) { + trigger(Event:AtomicP, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:WriteThrough) { + trigger(Event:WriteThroughP, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:VicDirty) { + if (getDirectoryEntry(in_msg.addr).VicDirtyIgnore.isElement(in_msg.Requestor)) { + DPRINTF(RubySlicc, "Dropping VicDirtyP for address %s\n", in_msg.addr); + trigger(Event:StaleVicDirtyP, in_msg.addr, entry, tbe); + } else { + DPRINTF(RubySlicc, "Got VicDirty from %s on %s\n", in_msg.Requestor, in_msg.addr); + trigger(Event:VicDirtyP, in_msg.addr, 
entry, tbe); + } + } else if (in_msg.Type == CoherenceRequestType:VicClean) { + if (getDirectoryEntry(in_msg.addr).VicDirtyIgnore.isElement(in_msg.Requestor)) { + DPRINTF(RubySlicc, "Dropping VicCleanP for address %s\n", in_msg.addr); + trigger(Event:StaleVicDirtyP, in_msg.addr, entry, tbe); + } else { + DPRINTF(RubySlicc, "Got VicClean from %s on %s\n", in_msg.Requestor, in_msg.addr); + trigger(Event:VicCleanP, in_msg.addr, entry, tbe); + } + } else { + error("Bad message type for private access"); + } + } else { + trigger(Event:CPUReq, in_msg.addr, entry, tbe); + } + } + } + } + + // Actions + action(s_sendResponseS, "s", desc="send Shared response") { + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysResp; + if (tbe.L3Hit) { + out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0)); + } else { + out_msg.Sender := machineID; + } + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.DataBlk := tbe.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := false; + out_msg.State := CoherenceState:Shared; + out_msg.InitialRequestTime := tbe.InitialRequestTime; + out_msg.ForwardRequestTime := tbe.ForwardRequestTime; + out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime; + out_msg.OriginalResponder := tbe.LastSender; + out_msg.DemandRequest := tbe.DemandRequest; + out_msg.L3Hit := tbe.L3Hit; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(es_sendResponseES, "es", desc="send Exclusive or Shared response") { + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysResp; + if (tbe.L3Hit) { + out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0)); + } else { + out_msg.Sender := machineID; + } + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.DataBlk := tbe.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := tbe.Dirty; + if (tbe.Cached) { + out_msg.State := CoherenceState:Shared; + } else { + out_msg.State := CoherenceState:Exclusive; + } + out_msg.InitialRequestTime := tbe.InitialRequestTime; + out_msg.ForwardRequestTime := tbe.ForwardRequestTime; + out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime; + out_msg.OriginalResponder := tbe.LastSender; + out_msg.DemandRequest := tbe.DemandRequest; + out_msg.L3Hit := tbe.L3Hit; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(m_sendResponseM, "m", desc="send Modified response") { + if (tbe.wtData) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:UnblockWriteThrough; + } + } else { + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysResp; + if (tbe.L3Hit) { + out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0)); + } else { + out_msg.Sender := machineID; + } + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.DataBlk := tbe.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := tbe.Dirty; + out_msg.State := CoherenceState:Modified; + out_msg.CtoD := false; + out_msg.InitialRequestTime := tbe.InitialRequestTime; + out_msg.ForwardRequestTime := tbe.ForwardRequestTime; + out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime; + out_msg.OriginalResponder := tbe.LastSender; + out_msg.DemandRequest := tbe.DemandRequest; + out_msg.L3Hit := tbe.L3Hit; + if (tbe.atomicData) { + 
out_msg.WTRequestor := tbe.WTRequestor; + } + DPRINTF(RubySlicc, "%s\n", out_msg); + } + if (tbe.atomicData) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:UnblockWriteThrough; + } + } + } + } + + action(sb_sendResponseSBypass, "sb", desc="send Shared response") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysResp; + if (tbe.L3Hit) { + out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0)); + } else { + out_msg.Sender := machineID; + } + out_msg.Destination.add(in_msg.Requestor); + out_msg.DataBlk := tbe.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := false; + out_msg.State := CoherenceState:Shared; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := curCycle(); + out_msg.ProbeRequestStartTime := in_msg.ProbeRequestStartTime; + out_msg.OriginalResponder := tbe.LastSender; + out_msg.DemandRequest := false; + out_msg.L3Hit := tbe.L3Hit; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + action(esb_sendResponseESBypass, "esb", desc="send Exclusive or Shared response") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysResp; + if (tbe.L3Hit) { + out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0)); + } else { + out_msg.Sender := machineID; + } + out_msg.Destination.add(in_msg.Requestor); + out_msg.DataBlk := tbe.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := tbe.Dirty; + if (tbe.Cached || in_msg.ForceShared) { + out_msg.State := CoherenceState:Shared; + } else { + out_msg.State := CoherenceState:Exclusive; + } + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := curCycle(); + out_msg.ProbeRequestStartTime := in_msg.ProbeRequestStartTime; + out_msg.OriginalResponder := tbe.LastSender; + out_msg.DemandRequest := false; + out_msg.L3Hit := tbe.L3Hit; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + action(mbwt_sendResponseWriteThroughBypass, "mbwt", desc="send write through response") { + peek(requestNetwork_in, CPURequestMsg) { + if (in_msg.Type == CoherenceRequestType:WriteThrough) { + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysWBAck; + out_msg.Destination.add(in_msg.Requestor); + out_msg.WTRequestor := in_msg.WTRequestor; + out_msg.Sender := machineID; + out_msg.MessageSize := MessageSizeType:Writeback_Control; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := curCycle(); + out_msg.ProbeRequestStartTime := in_msg.ProbeRequestStartTime; + out_msg.DemandRequest := false; + } + } else { + assert(in_msg.Type == CoherenceRequestType:Atomic); + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysResp; + if (tbe.L3Hit) { + out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0)); + } else { + out_msg.Sender := machineID; + } + out_msg.Destination.add(in_msg.Requestor); + out_msg.DataBlk := getDirectoryEntry(address).DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := in_msg.Dirty; + out_msg.State := CoherenceState:Modified; + out_msg.CtoD := false; + 
out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := curCycle(); + out_msg.ProbeRequestStartTime := in_msg.ProbeRequestStartTime; + out_msg.OriginalResponder := tbe.LastSender; + out_msg.DemandRequest := false; + out_msg.L3Hit := tbe.L3Hit; + out_msg.WTRequestor := in_msg.WTRequestor; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:UnblockWriteThrough; + } + } + } + + action(mb_sendResponseMBypass, "mb", desc="send Modified response") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysResp; + if (tbe.L3Hit) { + out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0)); + } else { + out_msg.Sender := machineID; + } + out_msg.Destination.add(in_msg.Requestor); + out_msg.DataBlk := tbe.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := tbe.Dirty; + out_msg.State := CoherenceState:Modified; + out_msg.CtoD := false; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := curCycle(); + out_msg.ProbeRequestStartTime := in_msg.ProbeRequestStartTime; + out_msg.OriginalResponder := tbe.LastSender; + out_msg.DemandRequest := false; + out_msg.L3Hit := tbe.L3Hit; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + action(c_sendResponseCtoD, "c", desc="send CtoD Ack") { + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysResp; + out_msg.Sender := machineID; + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.MessageSize := MessageSizeType:Response_Control; + out_msg.Dirty := false; + out_msg.State := CoherenceState:Modified; + out_msg.CtoD := true; + out_msg.InitialRequestTime := tbe.InitialRequestTime; + out_msg.ForwardRequestTime := curCycle(); + out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime; + out_msg.DemandRequest := tbe.DemandRequest; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(cp_sendResponseCtoDP, "cp", desc="send CtoD Ack") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysResp; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.MessageSize := MessageSizeType:Response_Control; + out_msg.Dirty := false; + out_msg.State := CoherenceState:Modified; + out_msg.CtoD := true; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := curCycle(); + out_msg.ProbeRequestStartTime := in_msg.ProbeRequestStartTime; + out_msg.DemandRequest := false; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + action(w_sendResponseWBAck, "w", desc="send WB Ack") { + peek(regDir_in, CPURequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysWBAck; + out_msg.Destination.add(in_msg.Requestor); + out_msg.WTRequestor := in_msg.WTRequestor; + out_msg.Sender := machineID; + out_msg.MessageSize := MessageSizeType:Writeback_Control; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := in_msg.ForwardRequestTime; + out_msg.ProbeRequestStartTime := in_msg.ProbeRequestStartTime; + out_msg.DemandRequest := false; + } + } + } + + 
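[Editorial sketch, not part of the patch.] Every response built above copies the request's timing fields (InitialRequestTime, ForwardRequestTime, ProbeRequestStartTime) so the requestor can attribute latency after the fact. A minimal C++ sketch of one plausible breakdown is below; the Cycles alias and field names mirror the SLICC message fields, while the breakdown itself and the sample numbers are assumptions for illustration.

    // Hypothetical latency breakdown from the timing fields carried in ResponseMsg.
    #include <cstdint>
    #include <iostream>

    using Cycles = uint64_t;

    struct RespTiming {
        Cycles initialRequestTime;     // when the core issued the request
        Cycles forwardRequestTime;     // when the directory forwarded/serviced it
        Cycles probeRequestStartTime;  // when the directory launched probes
    };

    void reportLatency(const RespTiming &t, Cycles now)
    {
        Cycles toDir   = t.forwardRequestTime - t.initialRequestTime;
        Cycles probing = (t.probeRequestStartTime >= t.forwardRequestTime)
                             ? t.probeRequestStartTime - t.forwardRequestTime : 0;
        Cycles total   = now - t.initialRequestTime;
        std::cout << "req->dir: " << toDir << "  dir->probe: " << probing
                  << "  total: " << total << " cycles\n";
    }

    int main()
    {
        reportLatency({100, 112, 118}, 140);  // hypothetical cycle counts
        return 0;
    }
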
action(wp_sendResponseWBAckP, "wp", desc="send WB Ack") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysWBAck; + out_msg.Destination.add(in_msg.Requestor); + out_msg.WTRequestor := in_msg.WTRequestor; + out_msg.Sender := machineID; + out_msg.MessageSize := MessageSizeType:Writeback_Control; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := curCycle(); + out_msg.ProbeRequestStartTime := in_msg.ProbeRequestStartTime; + out_msg.DemandRequest := false; + } + } + } + + action(wc_sendResponseWBAck, "wc", desc="send WB Ack for cancel") { + peek(responseNetwork_in, ResponseMsg) { + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysWBAck; + out_msg.Destination.add(in_msg.Sender); + out_msg.Sender := machineID; + out_msg.MessageSize := MessageSizeType:Writeback_Control; + } + } + } + + action(ra_ackRegionDir, "ra", desc="Ack region dir") { + peek(regDir_in, CPURequestMsg) { + if (in_msg.NoAckNeeded == false) { + enqueue(responseNetwork_out, ResponseMsg, response_latency_regionDir) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:DirReadyAck; + out_msg.Destination.add(map_Address_to_RegionDir(address)); + out_msg.Sender := machineID; + out_msg.MessageSize := MessageSizeType:Writeback_Control; + } + } + } + } + + action(l_queueMemRdReq, "lr", desc="Read data from memory") { + peek(regDir_in, CPURequestMsg) { + if (L3CacheMemory.isTagPresent(address)) { + enqueue(L3TriggerQueue_out, TriggerMsg, l3_hit_latency) { + out_msg.addr := address; + out_msg.Type := TriggerType:L3Hit; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address)); + tbe.DataBlk := entry.DataBlk; + tbe.LastSender := entry.LastSender; + tbe.L3Hit := true; + tbe.MemData := true; + DPRINTF(RubySlicc, "L3 data is %s\n", entry.DataBlk); + L3CacheMemory.deallocate(address); + } else { + queueMemoryRead(machineID, address, to_memory_controller_latency); + } + } + } + + action(lrp_queueMemRdReqP, "lrp", desc="Read data from memory") { + peek(requestNetwork_in, CPURequestMsg) { + if (L3CacheMemory.isTagPresent(address)) { + enqueue(L3TriggerQueue_out, TriggerMsg, l3_hit_latency) { + out_msg.addr := address; + out_msg.Type := TriggerType:L3Hit; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address)); + tbe.DataBlk := entry.DataBlk; + tbe.LastSender := entry.LastSender; + tbe.L3Hit := true; + tbe.MemData := true; + DPRINTF(RubySlicc, "L3 data is %s\n", entry.DataBlk); + L3CacheMemory.deallocate(address); + } else { + queueMemoryRead(machineID, address, to_memory_controller_latency); + } + } + } + + action(dcr_probeInvCoreData, "dcr", desc="probe inv cores, return data") { + peek(regBuf_in, CPURequestMsg) { + enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbInv; + out_msg.ReturnData := true; + out_msg.MessageSize := MessageSizeType:Control; + out_msg.Destination := in_msg.Sharers; + tbe.NumPendingAcks := tbe.NumPendingAcks + in_msg.Sharers.count(); + DPRINTF(RubySlicc, "%s\n", out_msg); + APPEND_TRANSITION_COMMENT(" dcr: Acks remaining: "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + tbe.ProbeRequestStartTime := curCycle(); + } + } + } + + 
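[Editorial sketch, not part of the patch.] The probe actions that follow (dcr/ddr/sc/ic) fan a probe out to a set of sharers and add that sharer count to tbe.NumPendingAcks; later, x_decrementAcks retires one ack per CPUPrbResp and o_checkForCompletion enqueues an AcksComplete trigger exactly once when the count reaches zero. A minimal C++ sketch of that bookkeeping, with hypothetical names, is shown here.

    // Probe-ack bookkeeping analogous to NumPendingAcks / TriggeredAcksComplete.
    #include <cassert>
    #include <iostream>

    struct ProbeTracker {
        int  numPendingAcks = 0;
        bool triggeredAcksComplete = false;

        // Called when probes are broadcast to a set of sharers.
        void sendProbes(int sharerCount) { numPendingAcks += sharerCount; }

        // Called on every probe response; returns true the single time all
        // acks have arrived and a completion trigger should be scheduled.
        bool receiveAck()
        {
            assert(numPendingAcks > 0);
            --numPendingAcks;
            if (numPendingAcks == 0 && !triggeredAcksComplete) {
                triggeredAcksComplete = true;
                return true;   // analogous to enqueueing TriggerType:AcksComplete
            }
            return false;
        }
    };

    int main()
    {
        ProbeTracker t;
        t.sendProbes(3);                      // e.g. three sharers probed
        for (int i = 0; i < 3; ++i)
            if (t.receiveAck())
                std::cout << "all probe acks received\n";
        return 0;
    }
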
action(ddr_probeDownCoreData, "ddr", desc="probe inv cores, return data") { + peek(regBuf_in, CPURequestMsg) { + enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbDowngrade; + out_msg.ReturnData := true; + out_msg.MessageSize := MessageSizeType:Control; + out_msg.Destination := in_msg.Sharers; + tbe.NumPendingAcks := tbe.NumPendingAcks + in_msg.Sharers.count(); + DPRINTF(RubySlicc, "%s\n", out_msg); + APPEND_TRANSITION_COMMENT(" dcr: Acks remaining: "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + tbe.ProbeRequestStartTime := curCycle(); + } + } + } + + action(sc_probeShrCoreData, "sc", desc="probe shared cores, return data") { + peek(requestNetwork_in, CPURequestMsg) { // not the right network? + enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbDowngrade; + out_msg.ReturnData := true; + out_msg.MessageSize := MessageSizeType:Control; + out_msg.Destination.broadcast(MachineType:CorePair); // won't be realistic for multisocket + tbe.NumPendingAcks := tbe.NumPendingAcks +machineCount(MachineType:CorePair) - 1; + out_msg.Destination.broadcast(MachineType:TCP); + tbe.NumPendingAcks := tbe.NumPendingAcks + machineCount(MachineType:TCP); + out_msg.Destination.broadcast(MachineType:SQC); + tbe.NumPendingAcks := tbe.NumPendingAcks + machineCount(MachineType:SQC); + out_msg.Destination.remove(in_msg.Requestor); + DPRINTF(RubySlicc, "%s\n", (out_msg)); + APPEND_TRANSITION_COMMENT(" sc: Acks remaining: "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + tbe.ProbeRequestStartTime := curCycle(); + } + } + } + + action(ic_probeInvCore, "ic", desc="probe invalidate core, no return data needed") { + peek(requestNetwork_in, CPURequestMsg) { // not the right network? 
+ enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbInv; + out_msg.ReturnData := false; + out_msg.MessageSize := MessageSizeType:Control; + out_msg.Destination.broadcast(MachineType:CorePair); // won't be realistic for multisocket + tbe.NumPendingAcks := tbe.NumPendingAcks +machineCount(MachineType:CorePair) - 1; + out_msg.Destination.broadcast(MachineType:TCP); + tbe.NumPendingAcks := tbe.NumPendingAcks + machineCount(MachineType:TCP); + out_msg.Destination.broadcast(MachineType:SQC); + tbe.NumPendingAcks := tbe.NumPendingAcks + machineCount(MachineType:SQC); + out_msg.Destination.remove(in_msg.Requestor); + APPEND_TRANSITION_COMMENT(" ic: Acks remaining: "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + DPRINTF(RubySlicc, "%s\n", out_msg); + tbe.ProbeRequestStartTime := curCycle(); + } + } + } + + action(d_writeDataToMemory, "d", desc="Write data to memory") { + peek(responseNetwork_in, ResponseMsg) { + getDirectoryEntry(address).DataBlk := in_msg.DataBlk; + DPRINTF(RubySlicc, "Writing Data: %s to address %s\n", in_msg.DataBlk, + in_msg.addr); + } + } + + action(t_allocateTBE, "t", desc="allocate TBE Entry") { + check_allocate(TBEs); + peek(regDir_in, CPURequestMsg) { + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + if (in_msg.Type == CoherenceRequestType:WriteThrough) { + tbe.writeMask.clear(); + tbe.writeMask.orMask(in_msg.writeMask); + tbe.wtData := true; + tbe.WTRequestor := in_msg.WTRequestor; + tbe.LastSender := in_msg.Requestor; + } + if (in_msg.Type == CoherenceRequestType:Atomic) { + tbe.writeMask.clear(); + tbe.writeMask.orMask(in_msg.writeMask); + tbe.atomicData := true; + tbe.WTRequestor := in_msg.WTRequestor; + tbe.LastSender := in_msg.Requestor; + } + tbe.DataBlk := getDirectoryEntry(address).DataBlk; // Data only for WBs + tbe.Dirty := false; + if (in_msg.Type == CoherenceRequestType:WriteThrough) { + tbe.DataBlk.copyPartial(in_msg.DataBlk,tbe.writeMask); + tbe.Dirty := false; + } + tbe.OriginalRequestor := in_msg.Requestor; + tbe.NumPendingAcks := 0; + tbe.Cached := in_msg.ForceShared; + tbe.InitialRequestTime := in_msg.InitialRequestTime; + tbe.ForwardRequestTime := curCycle(); + tbe.ProbeRequestStartTime := in_msg.ProbeRequestStartTime; + tbe.DemandRequest := in_msg.DemandRequest; + } + } + + action(tp_allocateTBEP, "tp", desc="allocate TBE Entry") { + check_allocate(TBEs); + peek(requestNetwork_in, CPURequestMsg) { + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + if (in_msg.Type == CoherenceRequestType:WriteThrough) { + tbe.writeMask.clear(); + tbe.writeMask.orMask(in_msg.writeMask); + tbe.wtData := true; + tbe.WTRequestor := in_msg.WTRequestor; + tbe.LastSender := in_msg.Requestor; + } + if (in_msg.Type == CoherenceRequestType:Atomic) { + tbe.writeMask.clear(); + tbe.writeMask.orMask(in_msg.writeMask); + tbe.atomicData := true; + tbe.WTRequestor := in_msg.WTRequestor; + tbe.LastSender := in_msg.Requestor; + } + tbe.DataBlk := getDirectoryEntry(address).DataBlk; // Data only for WBs + tbe.Dirty := false; + if (in_msg.Type == CoherenceRequestType:WriteThrough) { + tbe.DataBlk.copyPartial(in_msg.DataBlk,tbe.writeMask); + tbe.Dirty := false; + } + tbe.OriginalRequestor := in_msg.Requestor; + tbe.NumPendingAcks := 0; + tbe.Cached := in_msg.ForceShared; + tbe.InitialRequestTime := in_msg.InitialRequestTime; + tbe.ForwardRequestTime := curCycle(); + tbe.ProbeRequestStartTime := in_msg.ProbeRequestStartTime; + tbe.DemandRequest := false; + } + } + + action(sa_setAcks, 
"sa", desc="setAcks") { + peek(regDir_in, CPURequestMsg) { + tbe.NumPendingAcks := in_msg.Acks; + APPEND_TRANSITION_COMMENT(" waiting for acks "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + } + } + + action(tr_allocateTBE, "tr", desc="allocate TBE Entry for Region inv") { + check_allocate(TBEs); + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + tbe.NumPendingAcks := 0; + } + + action(dt_deallocateTBE, "dt", desc="deallocate TBE Entry") { + TBEs.deallocate(address); + unset_tbe(); + } + + action(wdp_writeBackDataPrivate, "wdp", desc="Write back data if needed") { + peek(requestNetwork_in, CPURequestMsg) { + if (in_msg.Type == CoherenceRequestType:WriteThrough) { + tbe.DataBlkAux := getDirectoryEntry(address).DataBlk; + tbe.DataBlkAux.copyPartial(in_msg.DataBlk,in_msg.writeMask); + getDirectoryEntry(address).DataBlk := tbe.DataBlkAux; + } else{ + assert(in_msg.Type == CoherenceRequestType:Atomic); + tbe.DataBlkAux.atomicPartial(getDirectoryEntry(address).DataBlk,in_msg.writeMask); + getDirectoryEntry(address).DataBlk := tbe.DataBlkAux; + } + } + } + + action(wd_writeBackData, "wd", desc="Write back data if needed") { + if (tbe.wtData) { + DataBlock tmp := getDirectoryEntry(address).DataBlk; + tmp.copyPartial(tbe.DataBlk,tbe.writeMask); + tbe.DataBlk := tmp; + getDirectoryEntry(address).DataBlk := tbe.DataBlk; + } else if (tbe.atomicData) { + tbe.DataBlk.atomicPartial(getDirectoryEntry(address).DataBlk,tbe.writeMask); + getDirectoryEntry(address).DataBlk := tbe.DataBlk; + } else if (tbe.Dirty == true) { + APPEND_TRANSITION_COMMENT(" Wrote data back "); + getDirectoryEntry(address).DataBlk := tbe.DataBlk; + } + } + + action(wdi_writeBackDataInv, "wdi", desc="Write back inv data if needed") { + // Kind of opposite from above...? + if (tbe.Dirty == true) { + getDirectoryEntry(address).DataBlk := tbe.DataBlk; + APPEND_TRANSITION_COMMENT("Writing dirty data to dir"); + DPRINTF(RubySlicc, "Data %s: %s\n", address, tbe.DataBlk); + } else { + APPEND_TRANSITION_COMMENT("NOT!!! Writing dirty data to dir"); + } + } + + action(wdt_writeBackDataInvNoTBE, "wdt", desc="Write back inv data if needed no TBE") { + // Kind of opposite from above...? + peek(responseNetwork_in, ResponseMsg) { + if (in_msg.Dirty == true) { + getDirectoryEntry(address).DataBlk := in_msg.DataBlk; + APPEND_TRANSITION_COMMENT("Writing dirty data to dir"); + DPRINTF(RubySlicc, "Data %s: %s\n", address, in_msg.DataBlk); + } else { + APPEND_TRANSITION_COMMENT("NOT!!! 
Writing dirty data to dir"); + } + } + } + + action(mt_writeMemDataToTBE, "mt", desc="write Mem data to TBE") { + peek(memQueue_in, MemoryMsg) { + if (tbe.Dirty == false) { + tbe.DataBlk := getDirectoryEntry(address).DataBlk; + } + tbe.MemData := true; + } + } + + action(ml_writeL3DataToTBE, "ml", desc="write L3 data to TBE") { + assert(tbe.Dirty == false); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address)); + tbe.DataBlk := entry.DataBlk; + tbe.LastSender := entry.LastSender; + tbe.L3Hit := true; + tbe.MemData := true; + } + + action(y_writeProbeDataToTBE, "y", desc="write Probe Data to TBE") { + peek(responseNetwork_in, ResponseMsg) { + if (in_msg.Dirty) { + DPRINTF(RubySlicc, "Got dirty data for %s from %s\n", address, in_msg.Sender); + DPRINTF(RubySlicc, "Data is %s\n", in_msg.DataBlk); + if (tbe.wtData) { + DataBlock tmp := in_msg.DataBlk; + tmp.copyPartial(tbe.DataBlk,tbe.writeMask); + tbe.DataBlk := tmp; + } else if (tbe.Dirty) { + if(tbe.atomicData == false && tbe.wtData == false) { + DPRINTF(RubySlicc, "Got double data for %s from %s\n", address, in_msg.Sender); + assert(tbe.DataBlk == in_msg.DataBlk); // in case of double data + } + } else { + tbe.DataBlk := in_msg.DataBlk; + tbe.Dirty := in_msg.Dirty; + tbe.LastSender := in_msg.Sender; + } + } + if (in_msg.Hit) { + tbe.Cached := true; + } + } + } + + action(yc_writeCPUDataToTBE, "yc", desc="write CPU Data to TBE") { + peek(responseNetwork_in, ResponseMsg) { + if (in_msg.Dirty) { + DPRINTF(RubySlicc, "Got dirty data for %s from %s\n", address, in_msg.Sender); + DPRINTF(RubySlicc, "Data is %s\n", in_msg.DataBlk); + if (tbe.Dirty) { + DPRINTF(RubySlicc, "Got double data for %s from %s\n", address, in_msg.Sender); + assert(tbe.DataBlk == in_msg.DataBlk); // in case of double data + } + tbe.DataBlk := in_msg.DataBlk; + tbe.Dirty := false; + tbe.LastSender := in_msg.Sender; + } + } + } + + action(x_decrementAcks, "x", desc="decrement Acks pending") { + if (tbe.NumPendingAcks > 0) { + tbe.NumPendingAcks := tbe.NumPendingAcks - 1; + } else { + APPEND_TRANSITION_COMMENT(" Double ack! 
"); + } + assert(tbe.NumPendingAcks >= 0); + APPEND_TRANSITION_COMMENT(" Acks remaining: "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + } + + action(o_checkForCompletion, "o", desc="check for ack completion") { + if (tbe.NumPendingAcks == 0 && tbe.TriggeredAcksComplete == false) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:AcksComplete; + } + tbe.TriggeredAcksComplete := true; + } + APPEND_TRANSITION_COMMENT(" Check: Acks remaining: "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + } + + action(ont_checkForCompletionNoTrigger, "ont", desc="check for ack completion, no trigger") { + if (tbe.NumPendingAcks == 0 && tbe.TriggeredAcksComplete == false) { + tbe.TriggeredAcksComplete := true; + } + APPEND_TRANSITION_COMMENT(" Check: Acks remaining: "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + } + + action(rvp_removeVicDirtyIgnore, "rvp", desc="Remove ignored core") { + peek(requestNetwork_in, CPURequestMsg) { + getDirectoryEntry(address).VicDirtyIgnore.remove(in_msg.Requestor); + } + } + + action(rv_removeVicDirtyIgnore, "rv", desc="Remove ignored core") { + peek(regDir_in, CPURequestMsg) { + getDirectoryEntry(address).VicDirtyIgnore.remove(in_msg.Requestor); + } + } + + action(r_sendRequestToRegionDir, "r", desc="send request to Region Directory") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(requestNetworkReg_out, CPURequestMsg, 1) { + out_msg.addr := address; + out_msg.Type := in_msg.Type; + out_msg.Requestor := in_msg.Requestor; + out_msg.Destination.add(map_Address_to_RegionDir(address)); + out_msg.Shared := in_msg.Shared; + out_msg.MessageSize := in_msg.MessageSize; + DPRINTF(RubySlicc, "out dest: %s\n", map_Address_to_RegionDir(address)); + } + } + } + + action(ai_ackInvalidate, "ai", desc="Ack to let the reg-dir know that the inv is ordered") { + peek(regBuf_in, CPURequestMsg) { + enqueue(regAckNetwork_out, UnblockMsg, 1) { + out_msg.addr := address; + out_msg.Destination.add(in_msg.Requestor); + out_msg.MessageSize := MessageSizeType:Response_Control; + DPRINTF(RubySlicc, "ai out_msg: %s\n", out_msg); + } + } + } + + action(aic_ackInvalidate, "aic", desc="Ack to let the reg-dir know that the inv is ordered") { + peek(responseNetwork_in, ResponseMsg) { + if (in_msg.NoAckNeeded == false) { + enqueue(regAckNetwork_out, UnblockMsg, 1) { + out_msg.addr := address; + if (machineIDToMachineType(in_msg.Sender) == MachineType:CorePair) { + out_msg.Destination.add(createMachineID(MachineType:RegionBuffer, intToID(0))); + } else { + out_msg.Destination.add(createMachineID(MachineType:RegionBuffer, intToID(1))); + } + out_msg.MessageSize := MessageSizeType:Response_Control; + DPRINTF(RubySlicc, "ai out_msg: %s\n", out_msg); + out_msg.wasValid := in_msg.isValid; + } + } + } + } + + action(al_allocateL3Block, "al", desc="allocate the L3 block on WB") { + peek(responseNetwork_in, ResponseMsg) { + if (L3CacheMemory.isTagPresent(address)) { + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address)); + APPEND_TRANSITION_COMMENT(" al wrote data to L3 (hit) "); + entry.DataBlk := in_msg.DataBlk; + entry.LastSender := in_msg.Sender; + } else { + if (L3CacheMemory.cacheAvail(address) == false) { + Addr victim := L3CacheMemory.cacheProbe(address); + CacheEntry victim_entry := static_cast(CacheEntry, "pointer", + L3CacheMemory.lookup(victim)); + queueMemoryWrite(machineID, victim, to_memory_controller_latency, + victim_entry.DataBlk); + L3CacheMemory.deallocate(victim); + } + 
assert(L3CacheMemory.cacheAvail(address)); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.allocate(address, new CacheEntry)); + APPEND_TRANSITION_COMMENT(" al wrote data to L3 "); + entry.DataBlk := in_msg.DataBlk; + entry.LastSender := in_msg.Sender; + } + } + } + + action(alwt_allocateL3BlockOnWT, "alwt", desc="allocate the L3 block on WT") { + if ((tbe.wtData || tbe.atomicData) && useL3OnWT) { + if (L3CacheMemory.isTagPresent(address)) { + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address)); + APPEND_TRANSITION_COMMENT(" al wrote data to L3 (hit) "); + entry.DataBlk := tbe.DataBlk; + entry.LastSender := tbe.LastSender; + } else { + if (L3CacheMemory.cacheAvail(address) == false) { + Addr victim := L3CacheMemory.cacheProbe(address); + CacheEntry victim_entry := static_cast(CacheEntry, "pointer", + L3CacheMemory.lookup(victim)); + queueMemoryWrite(machineID, victim, to_memory_controller_latency, + victim_entry.DataBlk); + L3CacheMemory.deallocate(victim); + } + assert(L3CacheMemory.cacheAvail(address)); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.allocate(address, new CacheEntry)); + APPEND_TRANSITION_COMMENT(" al wrote data to L3 "); + entry.DataBlk := tbe.DataBlk; + entry.LastSender := tbe.LastSender; + } + } + } + + action(ali_allocateL3Block, "ali", desc="allocate the L3 block on ForceInv") { + if (tbe.Dirty == true) { + if (L3CacheMemory.isTagPresent(address)) { + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address)); + APPEND_TRANSITION_COMMENT(" al wrote data to L3 (hit) "); + entry.DataBlk := tbe.DataBlk; + entry.LastSender := tbe.LastSender; + } else { + if (L3CacheMemory.cacheAvail(address) == false) { + Addr victim := L3CacheMemory.cacheProbe(address); + CacheEntry victim_entry := static_cast(CacheEntry, "pointer", + L3CacheMemory.lookup(victim)); + queueMemoryWrite(machineID, victim, to_memory_controller_latency, + victim_entry.DataBlk); + L3CacheMemory.deallocate(victim); + } + assert(L3CacheMemory.cacheAvail(address)); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.allocate(address, new CacheEntry)); + APPEND_TRANSITION_COMMENT(" al wrote data to L3 "); + entry.DataBlk := tbe.DataBlk; + entry.LastSender := tbe.LastSender; + } + } + } + + action(ali_allocateL3BlockNoTBE, "alt", desc="allocate the L3 block on ForceInv no TBE") { + peek(responseNetwork_in, ResponseMsg) { + if (in_msg.Dirty) { + if (L3CacheMemory.isTagPresent(address)) { + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address)); + APPEND_TRANSITION_COMMENT(" ali wrote data to L3 (hit) "); + entry.DataBlk := in_msg.DataBlk; + entry.LastSender := in_msg.Sender; + } else { + if (L3CacheMemory.cacheAvail(address) == false) { + Addr victim := L3CacheMemory.cacheProbe(address); + CacheEntry victim_entry := static_cast(CacheEntry, "pointer", + L3CacheMemory.lookup(victim)); + queueMemoryWrite(machineID, victim, to_memory_controller_latency, + victim_entry.DataBlk); + L3CacheMemory.deallocate(victim); + } + assert(L3CacheMemory.cacheAvail(address)); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.allocate(address, new CacheEntry)); + APPEND_TRANSITION_COMMENT(" ali wrote data to L3 "); + entry.DataBlk := in_msg.DataBlk; + entry.LastSender := in_msg.Sender; + } + } + } + } + + action(dl_deallocateL3, "dl", desc="deallocate the L3 block") { + L3CacheMemory.deallocate(address); + } + + action(p_popRequestQueue, "p", desc="pop 
request queue") { + requestNetwork_in.dequeue(clockEdge()); + } + + action(prd_popRegionQueue, "prd", desc="pop request queue") { + regDir_in.dequeue(clockEdge()); + } + + action(prb_popRegionBufQueue, "prb", desc="pop request queue") { + regBuf_in.dequeue(clockEdge()); + } + + action(pr_popResponseQueue, "pr", desc="pop response queue") { + responseNetwork_in.dequeue(clockEdge()); + } + + action(pm_popMemQueue, "pm", desc="pop mem queue") { + memQueue_in.dequeue(clockEdge()); + } + + action(pt_popTriggerQueue, "pt", desc="pop trigger queue") { + triggerQueue_in.dequeue(clockEdge()); + } + + action(ptl_popTriggerQueue, "ptl", desc="pop L3 trigger queue") { + L3TriggerQueue_in.dequeue(clockEdge()); + } + + action(pu_popUnblockQueue, "pu", desc="pop unblock queue") { + unblockNetwork_in.dequeue(clockEdge()); + } + + action(yy_recycleResponseQueue, "yy", desc="recycle response queue") { + responseNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(ww_stallAndWaitRegRequestQueue, "ww", desc="recycle region dir request queue") { + stall_and_wait(regDir_in, address); + } + + action(st_stallAndWaitRequest, "st", desc="Stall and wait on the address") { + stall_and_wait(requestNetwork_in, address); + } + + action(wa_wakeUpDependents, "wa", desc="Wake up any requests waiting for this address") { + wakeUpBuffers(address); + } + + action(wa_wakeUpAllDependents, "waa", desc="Wake up any requests waiting for this region") { + wakeUpAllBuffers(); + } + + action(z_stall, "z", desc="...") { + } + + // TRANSITIONS + + // transitions from U + + transition({BR, BW, BL, BI, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B}, {Inv, Downgrade}) { + ww_stallAndWaitRegRequestQueue; + } + + transition(U, Inv, BI){L3TagArrayRead} { + tr_allocateTBE; + dcr_probeInvCoreData; // only need to invalidate sharers + ai_ackInvalidate; + prb_popRegionBufQueue; + } + + transition(U, Downgrade, BI){L3TagArrayRead} { + tr_allocateTBE; + ddr_probeDownCoreData; // only need to invalidate sharers + ai_ackInvalidate; + prb_popRegionBufQueue; + } + + // The next 2 transistions are needed in the event that an invalidation + // is waiting for its ack from the core, but the event makes it through + // the region directory before the acks. 
This wouldn't be needed if + // we waited to ack the region dir until the directory got all the acks + transition({BR, BW, BI, BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B}, {RdBlkS, RdBlkM, RdBlk, WriteThrough, Atomic}) { + ww_stallAndWaitRegRequestQueue; + } + + transition({BR, BW, BI, BL, BS_M, BM_M, B_M, BS_PM, BM_PM, B_PM, B, BS_Pm_BL, BM_Pm_BL, B_Pm_BL, BP_BL, BS_Pm_B, BM_Pm_B, B_Pm_B, BP_B}, {RdBlkSP, RdBlkMP, RdBlkP}) { + st_stallAndWaitRequest; + } + + transition({BR, BW, BI, BL, BS_M, BM_M, B_M, BS_PM, BM_PM, B_PM, B, BS_Pm_BL, BM_Pm_BL, B_Pm_BL, BP_BL, BS_Pm_B, BM_Pm_B, B_Pm_B, BP_B}, {WriteThroughP,AtomicP}) { + st_stallAndWaitRequest; + } + + transition(U, {RdBlkS}, BS_PM) {L3TagArrayRead} { + t_allocateTBE; + l_queueMemRdReq; + sa_setAcks; + o_checkForCompletion; + ra_ackRegionDir; + prd_popRegionQueue; + } + + transition(U, WriteThrough, BM_PM){L3TagArrayRead} { + t_allocateTBE; + w_sendResponseWBAck; + l_queueMemRdReq; + sa_setAcks; + o_checkForCompletion; + ra_ackRegionDir; + prd_popRegionQueue; + } + + transition(U, {RdBlkM,Atomic}, BM_PM){L3TagArrayRead} { + t_allocateTBE; + l_queueMemRdReq; + sa_setAcks; + o_checkForCompletion; + ra_ackRegionDir; + prd_popRegionQueue; + } + + transition(U, RdBlk, B_PM){L3TagArrayRead} { + t_allocateTBE; + l_queueMemRdReq; + sa_setAcks; + o_checkForCompletion; + ra_ackRegionDir; + prd_popRegionQueue; + } + + transition(U, {RdBlkSP}, BS_M) {L3TagArrayRead} { + tp_allocateTBEP; + lrp_queueMemRdReqP; + p_popRequestQueue; + } + + transition(U, WriteThroughP, BM_M) {L3TagArrayRead} { + tp_allocateTBEP; + wp_sendResponseWBAckP; + lrp_queueMemRdReqP; + p_popRequestQueue; + } + + transition(U, {RdBlkMP,AtomicP}, BM_M) {L3TagArrayRead} { + tp_allocateTBEP; + lrp_queueMemRdReqP; + p_popRequestQueue; + } + + transition(U, RdBlkP, B_M) {L3TagArrayRead} { + tp_allocateTBEP; + lrp_queueMemRdReqP; + p_popRequestQueue; + } + + transition(U, VicDirtyP, BL) {L3TagArrayRead} { + tp_allocateTBEP; + wp_sendResponseWBAckP; + p_popRequestQueue; + } + + transition(U, VicCleanP, BL) {L3TagArrayRead} { + tp_allocateTBEP; + wp_sendResponseWBAckP; + p_popRequestQueue; + } + + transition(BM_Pm, RdBlkSP, BM_Pm_B) {L3DataArrayWrite} { + sb_sendResponseSBypass; + p_popRequestQueue; + } + + transition(BS_Pm, RdBlkSP, BS_Pm_B) {L3DataArrayWrite} { + sb_sendResponseSBypass; + p_popRequestQueue; + } + + transition(B_Pm, RdBlkSP, B_Pm_B) {L3DataArrayWrite} { + sb_sendResponseSBypass; + p_popRequestQueue; + } + + transition(BP, RdBlkSP, BP_B) {L3DataArrayWrite} { + sb_sendResponseSBypass; + p_popRequestQueue; + } + + transition(BM_Pm, RdBlkMP, BM_Pm_B) {L3DataArrayWrite} { + mb_sendResponseMBypass; + p_popRequestQueue; + } + + transition(BS_Pm, RdBlkMP, BS_Pm_B) {L3DataArrayWrite} { + mb_sendResponseMBypass; + p_popRequestQueue; + } + + transition(B_Pm, RdBlkMP, B_Pm_B) {L3DataArrayWrite} { + mb_sendResponseMBypass; + p_popRequestQueue; + } + + transition(BP, RdBlkMP, BP_B) {L3DataArrayWrite} { + mb_sendResponseMBypass; + p_popRequestQueue; + } + + transition(BM_Pm, {WriteThroughP,AtomicP}, BM_Pm_B) {L3DataArrayWrite} { + wdp_writeBackDataPrivate; + mbwt_sendResponseWriteThroughBypass; + p_popRequestQueue; + } + + transition(BS_Pm, {WriteThroughP,AtomicP}, BS_Pm_B) {L3DataArrayWrite} { + wdp_writeBackDataPrivate; + mbwt_sendResponseWriteThroughBypass; + p_popRequestQueue; + } + + transition(B_Pm, {WriteThroughP,AtomicP}, B_Pm_B) {L3DataArrayWrite} { + wdp_writeBackDataPrivate; + mbwt_sendResponseWriteThroughBypass; + p_popRequestQueue; + } + + 
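[Editorial sketch, not part of the patch.] Each SLICC transition above names a (current state, triggering event) pair, the ordered list of actions to run, and the next state; for example, U + RdBlkS runs t_allocateTBE, l_queueMemRdReq, sa_setAcks, o_checkForCompletion, ra_ackRegionDir, prd_popRegionQueue and lands in BS_PM. The C++ sketch below models only that dispatch idea for two of the directory's transitions; the generated code additionally checks resource guards such as L3TagArrayRead, which are omitted here.

    // Conceptual (state, event) -> {actions, next state} dispatch table.
    #include <iostream>
    #include <map>
    #include <string>
    #include <utility>
    #include <vector>

    enum class State { U, BS_PM, BM_PM };
    enum class Event { RdBlkS, WriteThrough };

    struct Transition {
        std::vector<std::string> actions;  // run in the listed order
        State nextState;
    };

    int main()
    {
        std::map<std::pair<State, Event>, Transition> table = {
            {{State::U, Event::RdBlkS},
             {{"t_allocateTBE", "l_queueMemRdReq", "sa_setAcks",
               "o_checkForCompletion", "ra_ackRegionDir", "prd_popRegionQueue"},
              State::BS_PM}},
            {{State::U, Event::WriteThrough},
             {{"t_allocateTBE", "w_sendResponseWBAck", "l_queueMemRdReq",
               "sa_setAcks", "o_checkForCompletion", "ra_ackRegionDir",
               "prd_popRegionQueue"},
              State::BM_PM}},
        };

        const Transition &tr = table.at({State::U, Event::RdBlkS});
        for (const std::string &a : tr.actions)
            std::cout << a << '\n';
        return 0;
    }
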
transition(BP, {WriteThroughP,AtomicP}, BP_B) {L3DataArrayWrite} { + wdp_writeBackDataPrivate; + mbwt_sendResponseWriteThroughBypass; + p_popRequestQueue; + } + + transition(BM_Pm, RdBlkP, BM_Pm_B) {L3DataArrayWrite} { + esb_sendResponseESBypass; + p_popRequestQueue; + } + + transition(BS_Pm, RdBlkP, BS_Pm_B) {L3DataArrayWrite} { + esb_sendResponseESBypass; + p_popRequestQueue; + } + + transition(B_Pm, RdBlkP, B_Pm_B) {L3DataArrayWrite}{ + esb_sendResponseESBypass; + p_popRequestQueue; + } + + transition(BP, RdBlkP, BP_B) {L3DataArrayWrite}{ + esb_sendResponseESBypass; + p_popRequestQueue; + } + + transition(BM_Pm_B, CoreUnblock, BM_Pm) { + wa_wakeUpDependents; + pu_popUnblockQueue; + } + + transition(BS_Pm_B, CoreUnblock, BS_Pm) { + wa_wakeUpDependents; + pu_popUnblockQueue; + } + + transition(B_Pm_B, CoreUnblock, B_Pm) { + wa_wakeUpDependents; + pu_popUnblockQueue; + } + + transition(BP_B, CoreUnblock, BP) { + wa_wakeUpDependents; + pu_popUnblockQueue; + } + + transition(BM_Pm_B, UnblockWriteThrough, BM_Pm) { + wa_wakeUpDependents; + pt_popTriggerQueue; + } + + transition(BS_Pm_B, UnblockWriteThrough, BS_Pm) { + wa_wakeUpDependents; + pt_popTriggerQueue; + } + + transition(B_Pm_B, UnblockWriteThrough, B_Pm) { + wa_wakeUpDependents; + pt_popTriggerQueue; + } + + transition(BP_B, UnblockWriteThrough, BP) { + wa_wakeUpDependents; + pt_popTriggerQueue; + } + + transition(BM_Pm, VicDirtyP, BM_Pm_BL) { + wp_sendResponseWBAckP; + p_popRequestQueue; + } + + transition(BS_Pm, VicDirtyP, BS_Pm_BL) { + wp_sendResponseWBAckP; + p_popRequestQueue; + } + + transition(B_Pm, VicDirtyP, B_Pm_BL) { + wp_sendResponseWBAckP; + p_popRequestQueue; + } + + transition(BP, VicDirtyP, BP_BL) { + wp_sendResponseWBAckP; + p_popRequestQueue; + } + + transition(BM_Pm, VicCleanP, BM_Pm_BL) { + wp_sendResponseWBAckP; + p_popRequestQueue; + } + + transition(BS_Pm, VicCleanP, BS_Pm_BL) { + wp_sendResponseWBAckP; + p_popRequestQueue; + } + + transition(B_Pm, VicCleanP, B_Pm_BL) { + wp_sendResponseWBAckP; + p_popRequestQueue; + } + + transition(BP, VicCleanP, BP_BL) { + wp_sendResponseWBAckP; + p_popRequestQueue; + } + + transition(BM_Pm_BL, CPUData, BM_Pm) { + yc_writeCPUDataToTBE; + d_writeDataToMemory; + wa_wakeUpDependents; + pr_popResponseQueue; + } + + transition(BS_Pm_BL, CPUData, BS_Pm) { + yc_writeCPUDataToTBE; + d_writeDataToMemory; + wa_wakeUpDependents; + pr_popResponseQueue; + } + + transition(B_Pm_BL, CPUData, B_Pm) { + yc_writeCPUDataToTBE; + d_writeDataToMemory; + wa_wakeUpDependents; + pr_popResponseQueue; + } + + transition(BP_BL, CPUData, BP) { + yc_writeCPUDataToTBE; + d_writeDataToMemory; + wa_wakeUpDependents; + pr_popResponseQueue; + } + + transition({BR, BW, BL}, {VicDirtyP, VicCleanP}) { + st_stallAndWaitRequest; + } + + transition({BR, BW, BL}, {VicDirty, VicClean}) { + ww_stallAndWaitRegRequestQueue; + } + + transition(BL, CPUData, U) {L3TagArrayWrite, L3DataArrayWrite} { + dt_deallocateTBE; + d_writeDataToMemory; + al_allocateL3Block; + wa_wakeUpDependents; + pr_popResponseQueue; + } + + transition(BL, StaleWB, U) {L3TagArrayWrite} { + dt_deallocateTBE; + wa_wakeUpAllDependents; + pr_popResponseQueue; + } + + transition({BI, B, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, BS_Pm_BL, BM_Pm_BL, B_Pm_BL, BP_BL, BS_Pm_B, BM_Pm_B, B_Pm_B, BP_B}, {VicDirty, VicClean}) { + ww_stallAndWaitRegRequestQueue; + } + + transition({BI, B, BS_M, BM_M, B_M, BS_PM, BM_PM, B_PM, BS_Pm_BL, BM_Pm_BL, B_Pm_BL, BP_BL, BS_Pm_B, BM_Pm_B, B_Pm_B, BP_B}, {VicDirtyP, VicCleanP}) { + st_stallAndWaitRequest; + 
} + + transition({U, BR, BW, BL, BI, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B, BS_Pm_BL, BM_Pm_BL, B_Pm_BL, BP_BL, BS_Pm_B, BM_Pm_B, B_Pm_B, BP_B}, WBAck) { + pm_popMemQueue; + } + + transition({U, BR, BW, BL, BI, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B, BS_Pm_BL, BM_Pm_BL, B_Pm_BL, BP_BL, BS_Pm_B, BM_Pm_B, B_Pm_B, BP_B}, StaleVicDirtyP) { + rvp_removeVicDirtyIgnore; + wp_sendResponseWBAckP; + p_popRequestQueue; + } + + transition({U, BR, BW, BL, BI, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B, BS_Pm_BL, BM_Pm_BL, B_Pm_BL, BP_BL, BS_Pm_B, BM_Pm_B, B_Pm_B, BP_B}, StaleVicDirty) { + rv_removeVicDirtyIgnore; + w_sendResponseWBAck; + prd_popRegionQueue; + } + + transition(U, VicDirty, BL) {L3TagArrayRead} { + t_allocateTBE; + ra_ackRegionDir; + w_sendResponseWBAck; + prd_popRegionQueue; + } + + transition(U, VicClean, BL) {L3TagArrayRead} { + t_allocateTBE; + ra_ackRegionDir; + w_sendResponseWBAck; + prd_popRegionQueue; + } + + transition({B, BR}, CoreUnblock, U) { + wa_wakeUpDependents; + pu_popUnblockQueue; + } + + transition({B, BR}, UnblockWriteThrough, U) { + wa_wakeUpDependents; + pt_popTriggerQueue; + } + + transition(BS_M, MemData, B) {L3TagArrayWrite, L3DataArrayWrite} { + mt_writeMemDataToTBE; + s_sendResponseS; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pm_popMemQueue; + } + + transition(BM_M, MemData, B){L3TagArrayWrite, L3DataArrayWrite} { + mt_writeMemDataToTBE; + m_sendResponseM; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pm_popMemQueue; + } + + transition(B_M, MemData, B){L3TagArrayWrite, L3DataArrayWrite} { + mt_writeMemDataToTBE; + es_sendResponseES; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pm_popMemQueue; + } + + transition(BS_PM, MemData, BS_Pm) {} { + mt_writeMemDataToTBE; + wa_wakeUpDependents; + pm_popMemQueue; + } + + transition(BM_PM, MemData, BM_Pm){} { + mt_writeMemDataToTBE; + wa_wakeUpDependents; + pm_popMemQueue; + } + + transition(B_PM, MemData, B_Pm){} { + mt_writeMemDataToTBE; + wa_wakeUpDependents; + pm_popMemQueue; + } + + transition(BS_M, L3Hit, B) {L3TagArrayWrite, L3DataArrayWrite} { + s_sendResponseS; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + ptl_popTriggerQueue; + } + + transition(BM_M, L3Hit, B) {L3TagArrayWrite, L3DataArrayWrite} { + m_sendResponseM; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + ptl_popTriggerQueue; + } + + transition(B_M, L3Hit, B) {L3TagArrayWrite, L3DataArrayWrite} { + es_sendResponseES; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + ptl_popTriggerQueue; + } + + transition(BS_PM, L3Hit, BS_Pm) { + wa_wakeUpDependents; + ptl_popTriggerQueue; + } + + transition(BM_PM, L3Hit, BM_Pm) { + wa_wakeUpDependents; + ptl_popTriggerQueue; + } + + transition(B_PM, L3Hit, B_Pm) { + wa_wakeUpDependents; + ptl_popTriggerQueue; + } + + transition({BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, BP, BI}, CPUPrbResp) { + aic_ackInvalidate; + y_writeProbeDataToTBE; + x_decrementAcks; + ont_checkForCompletionNoTrigger; + pr_popResponseQueue; + } + + transition({B, B_M, BS_M, BM_M}, {CPUPrbResp, LastCPUPrbResp}) { + z_stall; + } + + transition({BS_Pm_BL, BM_Pm_BL, B_Pm_BL, BP_BL, BS_Pm_B, BM_Pm_B, B_Pm_B, BP_B}, {CPUPrbResp, LastCPUPrbResp}) { + // recycling because PrbResponse and data come on the same network + yy_recycleResponseQueue; + } + + transition(U, {CPUPrbResp, LastCPUPrbResp}) {L3TagArrayRead, L3DataArrayWrite} { + 
aic_ackInvalidate; + wdt_writeBackDataInvNoTBE; + ali_allocateL3BlockNoTBE; + pr_popResponseQueue; + } + + transition(BL, {CPUPrbResp, LastCPUPrbResp}) {} { + aic_ackInvalidate; + y_writeProbeDataToTBE; + wdi_writeBackDataInv; + ali_allocateL3Block; + pr_popResponseQueue; + } + + transition(BS_PM, LastCPUPrbResp, BS_M) { + aic_ackInvalidate; + y_writeProbeDataToTBE; + x_decrementAcks; + ont_checkForCompletionNoTrigger; + pr_popResponseQueue; + } + + transition(BS_PM, ProbeAcksComplete, BS_M) {} { + pt_popTriggerQueue; + } + + transition(BM_PM, LastCPUPrbResp, BM_M) { + aic_ackInvalidate; + y_writeProbeDataToTBE; + x_decrementAcks; + ont_checkForCompletionNoTrigger; + pr_popResponseQueue; + } + + transition(BM_PM, ProbeAcksComplete, BM_M) {} { + pt_popTriggerQueue; + } + + transition(B_PM, LastCPUPrbResp, B_M) { + aic_ackInvalidate; + y_writeProbeDataToTBE; + x_decrementAcks; + ont_checkForCompletionNoTrigger; + pr_popResponseQueue; + } + + transition(B_PM, ProbeAcksComplete, B_M){} { + pt_popTriggerQueue; + } + + transition(BS_Pm, LastCPUPrbResp, B) { + aic_ackInvalidate; + y_writeProbeDataToTBE; + x_decrementAcks; + ont_checkForCompletionNoTrigger; + s_sendResponseS; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + ali_allocateL3Block; + dt_deallocateTBE; + pr_popResponseQueue; + } + + transition(BS_Pm, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} { + s_sendResponseS; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + ali_allocateL3Block; + dt_deallocateTBE; + pt_popTriggerQueue; + } + + transition(BM_Pm, LastCPUPrbResp, B) { + aic_ackInvalidate; + y_writeProbeDataToTBE; + x_decrementAcks; + ont_checkForCompletionNoTrigger; + m_sendResponseM; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + ali_allocateL3Block; + dt_deallocateTBE; + pr_popResponseQueue; + } + + transition(BM_Pm, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} { + m_sendResponseM; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + ali_allocateL3Block; + dt_deallocateTBE; + pt_popTriggerQueue; + } + + transition(B_Pm, LastCPUPrbResp, B) { + aic_ackInvalidate; + y_writeProbeDataToTBE; + x_decrementAcks; + ont_checkForCompletionNoTrigger; + es_sendResponseES; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + ali_allocateL3Block; + dt_deallocateTBE; + pr_popResponseQueue; + } + + transition(B_Pm, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} { + es_sendResponseES; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + ali_allocateL3Block; + dt_deallocateTBE; + pt_popTriggerQueue; + } + + transition(BP, LastCPUPrbResp, B) { + aic_ackInvalidate; + y_writeProbeDataToTBE; + x_decrementAcks; + ont_checkForCompletionNoTrigger; + c_sendResponseCtoD; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pr_popResponseQueue; + } + + transition(BP, ProbeAcksComplete, B){L3TagArrayWrite, L3TagArrayWrite} { + c_sendResponseCtoD; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pt_popTriggerQueue; + } + + transition(BI, LastCPUPrbResp, B) { + aic_ackInvalidate; + y_writeProbeDataToTBE; + x_decrementAcks; + ont_checkForCompletionNoTrigger; + wa_wakeUpDependents; + wdi_writeBackDataInv; + ali_allocateL3Block; + dt_deallocateTBE; + pr_popResponseQueue; + } + + transition(BI, ProbeAcksComplete, U) {L3TagArrayWrite, L3DataArrayWrite}{ + wa_wakeUpDependents; + wdi_writeBackDataInv; + ali_allocateL3Block; + dt_deallocateTBE; + pt_popTriggerQueue; + } + +} diff --git a/src/mem/protocol/MOESI_AMD_Base-Region-msg.sm b/src/mem/protocol/MOESI_AMD_Base-Region-msg.sm new file mode 100644 
index 000000000..823933e57 --- /dev/null +++ b/src/mem/protocol/MOESI_AMD_Base-Region-msg.sm @@ -0,0 +1,291 @@ +/* + * Copyright (c) 2010-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Lisa Hsu + */ + +enumeration(CoherenceRequestType, desc="Coherence Request Types") { + // CPU Request Types ONLY + RdBlk, desc="Read Blk"; + RdBlkM, desc="Read Blk Modified"; + RdBlkS, desc="Read Blk Shared"; + VicClean, desc="L2 clean eviction"; + VicDirty, desc="L2 dirty eviction"; + + WrCancel, desc="want to cancel WB to Memory"; // should this be here? 
+ + WBApproval, desc="WB Approval"; + + // Messages between Dir and R-Dir + ForceInv, desc="Send invalide to the block"; + ForceDowngrade, desc="Send downgrade to the block"; + Unblock, desc="Used to let the dir know a message has been sunk"; + + // Messages between R-Dir and R-Buffer + PrivateNotify, desc="Let region buffer know it has private access"; + SharedNotify, desc="Let region buffer know it has shared access"; + WbNotify, desc="Let region buffer know it saw its wb request"; + Downgrade, desc="Force the region buffer to downgrade to shared"; + // Response to R-Dir (probably should be on a different network, but + // I need it to be ordered with respect to requests) + InvAck, desc="Let the R-Dir know when the inv has occured"; + + PrivateRequest, desc="R-buf wants the region in private"; + UpgradeRequest, desc="R-buf wants the region in private"; + SharedRequest, desc="R-buf wants the region in shared (could respond with private)"; + CleanWbRequest, desc="R-buf wants to deallocate clean region"; + + NA, desc="So we don't get segfaults"; +} + +enumeration(ProbeRequestType, desc="Probe Request Types") { + PrbDowngrade, desc="Probe for Status"; // EtoS, MtoO, StoS + PrbInv, desc="Probe to Invalidate"; + + // For regions + PrbRepl, desc="Force the cache to do a replacement"; + PrbRegDowngrade, desc="Probe for Status"; // EtoS, MtoO, StoS +} + + +enumeration(CoherenceResponseType, desc="Coherence Response Types") { + NBSysResp, desc="Northbridge response to CPU Rd request"; + NBSysWBAck, desc="Northbridge response ok to WB"; + TDSysResp, desc="TCCdirectory response to CPU Rd request"; + TDSysWBAck, desc="TCCdirectory response ok to WB"; + TDSysWBNack, desc="TCCdirectory response ok to drop"; + CPUPrbResp, desc="CPU Probe Response"; + CPUData, desc="CPU Data"; + StaleNotif, desc="Notification of Stale WBAck, No data to writeback"; + CPUCancelWB, desc="want to cancel WB to Memory"; + MemData, desc="Data from Memory"; + + // for regions + PrivateAck, desc="Ack that r-buf received private notify"; + RegionWbAck, desc="Writeback Ack that r-buf completed deallocation"; + DirReadyAck, desc="Directory (mem ctrl)<->region dir handshake"; +} + +enumeration(CoherenceState, default="CoherenceState_NA", desc="Coherence State") { + Modified, desc="Modified"; + Owned, desc="Owned state"; + Exclusive, desc="Exclusive"; + Shared, desc="Shared"; + NA, desc="NA"; +} + +structure(CPURequestMsg, desc="...", interface="Message") { + Addr addr, desc="Physical address for this request"; + Addr DemandAddress, desc="Physical block address for this request"; + CoherenceRequestType Type, desc="Type of request"; + DataBlock DataBlk, desc="data for the cache line"; // only for WB + bool Dirty, desc="whether WB data is dirty"; // only for WB + MachineID Requestor, desc="Node who initiated the request"; + NetDest Destination, desc="Multicast destination mask"; + bool Shared, desc="For CPU_WrVicBlk, vic is O not M. 
For CPU_ClVicBlk, vic is S"; + MessageSizeType MessageSize, desc="size category of the message"; + Cycles InitialRequestTime, default="0", desc="time the initial requests was sent from the L1Cache"; + Cycles ForwardRequestTime, default="0", desc="time the dir forwarded the request"; + Cycles ProbeRequestStartTime, default="0", desc="the time the dir started the probe request"; + bool DemandRequest, default="false", desc="For profiling purposes"; + + NetDest Sharers, desc="Caches that may have a valid copy of the data"; + bool ForceShared, desc="R-dir knows it is shared, pass on so it sends an S copy, not E"; + bool Private, default="false", desc="Requestor already has private permissions, no need for dir check"; + bool CtoDSinked, default="false", desc="This is true if the CtoD previously sent must have been sunk"; + + bool NoAckNeeded, default="false", desc="True if region buffer doesn't need to ack"; + int Acks, default="0", desc="Acks that the dir (mem ctrl) should expect to receive"; + CoherenceRequestType OriginalType, default="CoherenceRequestType_NA", desc="Type of request from core fwded through region buffer"; + + bool functionalRead(Packet *pkt) { + // Only PUTX messages contains the data block + if (Type == CoherenceRequestType:VicDirty) { + return testAndRead(addr, DataBlk, pkt); + } + + return false; + } + + bool functionalWrite(Packet *pkt) { + // No check on message type required since the protocol should + // read data from those messages that contain the block + return testAndWrite(addr, DataBlk, pkt); + } +} + +structure(NBProbeRequestMsg, desc="...", interface="Message") { + Addr addr, desc="Physical address for this request"; + ProbeRequestType Type, desc="probe signal"; + bool ReturnData, desc="Indicates CPU should return data"; + NetDest Destination, desc="Node to whom the data is sent"; + MessageSizeType MessageSize, desc="size category of the message"; + bool DemandRequest, default="false", desc="demand request, requesting 3-hop transfer"; + Addr DemandAddress, desc="Demand block address for a region request"; + MachineID Requestor, desc="Requestor id for 3-hop requests"; + bool NoAckNeeded, default="false", desc="For short circuting acks"; + + bool functionalRead(Packet *pkt) { + return false; + } + + bool functionalWrite(Packet *pkt) { + // No check on message type required since the protocol should + // read data from those messages that contain the block + return false; + } + +} + +structure(TDProbeRequestMsg, desc="...", interface="Message") { + Addr addr, desc="Physical address for this request"; + ProbeRequestType Type, desc="TD_PrbNxtState signal"; + bool ReturnData, desc="Indicates CPU should return data"; + bool localCtoD, desc="Indicates CtoD is within the GPU hierarchy (aka TCC subtree)"; + NetDest Destination, desc="Node to whom the data is sent"; + MessageSizeType MessageSize, desc="size category of the message"; + MachineID Sender, desc="Node who sent the data"; + bool currentOwner, default="false", desc="Is the sender the current owner"; + bool DoneAck, default="false", desc="Is this a done ack?"; + bool Dirty, default="false", desc="Was block dirty when evicted"; + bool wasValid, default="false", desc="Was block valid when evicted"; + bool valid, default="false", desc="Is block valid"; + bool validToInvalid, default="false", desc="Was block valid when evicted"; + + bool functionalRead(Packet *pkt) { + return false; + } + + bool functionalWrite(Packet *pkt) { + // No check on message type required since the protocol should + // read data from those 
messages that contain the block + return false; + } +} + +// Response Messages seemed to be easily munged into one type +structure(ResponseMsg, desc="...", interface="Message") { + Addr addr, desc="Physical address for this request"; + CoherenceResponseType Type, desc="NB Sys Resp or CPU Response to Probe"; + MachineID Sender, desc="Node who sent the data"; + NetDest Destination, desc="Node to whom the data is sent"; + // Begin Used Only By CPU Response + DataBlock DataBlk, desc="data for the cache line"; + bool Hit, desc="probe hit valid line"; + bool Shared, desc="True if S, or if NB Probe ReturnData==1 && O"; + bool Dirty, desc="Is the data dirty (different than memory)?"; + bool Ntsl, desc="indicates probed lin will be invalid after probe"; + bool UntransferredOwner, desc="pending confirmation of ownership change"; + // End Used Only By CPU Response + + // Begin NB Response Only + CoherenceState State, default=CoherenceState_NA, desc="What returned data from NB should be in"; + bool CtoD, desc="was the originator a CtoD?"; + // End NB Response Only + + bool NbReqShared, desc="modification of Shared field from initial request, e.g. hit by shared probe"; + + MessageSizeType MessageSize, desc="size category of the message"; + Cycles InitialRequestTime, default="0", desc="time the initial requests was sent from the L1Cache"; + Cycles ForwardRequestTime, default="0", desc="time the dir forwarded the request"; + Cycles ProbeRequestStartTime, default="0", desc="the time the dir started the probe request"; + bool DemandRequest, default="false", desc="For profiling purposes"; + + bool L3Hit, default="false", desc="Did memory or L3 supply the data?"; + MachineID OriginalResponder, desc="Mach which wrote the data to the L3"; + + bool NotCached, default="false", desc="True when the Region buffer has already evicted the line"; + + bool NoAckNeeded, default="false", desc="For short circuting acks"; + bool isValid, default="false", desc="Is acked block valid"; + + bool functionalRead(Packet *pkt) { + // Only PUTX messages contains the data block + if (Type == CoherenceResponseType:CPUData || + Type == CoherenceResponseType:MemData) { + return testAndRead(addr, DataBlk, pkt); + } + + return false; + } + + bool functionalWrite(Packet *pkt) { + // No check on message type required since the protocol should + // read data from those messages that contain the block + return testAndWrite(addr, DataBlk, pkt); + } +} + +structure(UnblockMsg, desc="...", interface="Message") { + Addr addr, desc="Physical address for this request"; + NetDest Destination, desc="Destination (always directory)"; + MessageSizeType MessageSize, desc="size category of the message"; +} + +enumeration(TriggerType, desc="Trigger Type") { + L2_to_L1, desc="L2 to L1 fill"; + AcksComplete, desc="NB received all needed Acks"; + + // For regions + InvNext, desc="Invalidate the next block"; + PrivateAck, desc="Loopback ack for machines with no Region Buffer"; + AllOutstanding, desc="All outstanding requests have finished"; + L3Hit, desc="L3 hit in dir"; + + // For region directory once the directory is blocked + InvRegion, desc="Invalidate region"; + DowngradeRegion, desc="downgrade region"; +} + +enumeration(CacheId, desc="Which Cache in the Core") { + L1I, desc="L1 I-cache"; + L1D0, desc="L1 D-cache cluster 0"; + L1D1, desc="L1 D-cache cluster 1"; + NA, desc="Default"; +} + +structure(TriggerMsg, desc="...", interface="Message") { + Addr addr, desc="Address"; + TriggerType Type, desc="Type of trigger"; + CacheId Dest, default="CacheId_NA", 
desc="Cache to invalidate"; + + bool functionalRead(Packet *pkt) { + return false; + } + + bool functionalWrite(Packet *pkt) { + // No check on message type required since the protocol should + // read data from those messages that contain the block + return false; + } + +} diff --git a/src/mem/protocol/MOESI_AMD_Base-RegionBuffer.sm b/src/mem/protocol/MOESI_AMD_Base-RegionBuffer.sm new file mode 100644 index 000000000..89f7d6fcb --- /dev/null +++ b/src/mem/protocol/MOESI_AMD_Base-RegionBuffer.sm @@ -0,0 +1,1368 @@ +/* + * Copyright (c) 2010-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Jason Power + */ + +machine(MachineType:RegionBuffer, "Region Buffer for AMD_Base-like protocol") +: CacheMemory *cacheMemory; // stores only region addresses. 
Must set block size same as below + bool isOnCPU; + int blocksPerRegion := 64; // 4k regions + Cycles toDirLatency := 5; // Latency to fwd requests to directory + Cycles toRegionDirLatency := 5; // Latency for requests and acks to directory + Cycles nextEvictLatency := 1; // latency added between each block while evicting region + bool noTCCdir := "False"; + int TCC_select_num_bits := 1; + + // From the Cores + MessageBuffer * requestFromCore, network="From", virtual_network="0", vnet_type="request"; + MessageBuffer * responseFromCore, network="From", virtual_network="2", vnet_type="response"; + + // Requests to the cores or directory + MessageBuffer * requestToNetwork, network="To", virtual_network="0", vnet_type="request"; + + // From Region-Dir + MessageBuffer * notifyFromRegionDir, network="From", virtual_network="7", vnet_type="request"; + MessageBuffer * probeFromRegionDir, network="From", virtual_network="8", vnet_type="request"; + + // From the directory + MessageBuffer * unblockFromDir, network="From", virtual_network="4", vnet_type="unblock"; + + // To the region-Dir + MessageBuffer * responseToRegDir, network="To", virtual_network="2", vnet_type="response"; + + MessageBuffer * triggerQueue; +{ + + // States + state_declaration(State, desc="Region states", default="RegionBuffer_State_NP") { + NP, AccessPermission:Invalid, desc="Not present in region directory"; + P, AccessPermission:Invalid, desc="Region is private to the cache"; + S, AccessPermission:Invalid, desc="Region is possibly shared with others"; + + NP_PS, AccessPermission:Invalid, desc="Intermediate state waiting for notify from r-dir"; + S_P, AccessPermission:Invalid, desc="Intermediate state while upgrading region"; + + P_NP, AccessPermission:Invalid, desc="Intermediate state while evicting all lines in region"; + P_S, AccessPermission:Invalid, desc="Intermediate state while downgrading all lines in region"; + + S_NP_PS, AccessPermission:Invalid, desc="Got an inv in S_P, waiting for all inv acks, then going to since the write is already out there NP_PS"; + P_NP_NP, AccessPermission:Invalid, desc="Evicting region on repl, then got an inv. 
Need to re-evict"; + + P_NP_O, AccessPermission:Invalid, desc="Waiting for all outstanding requests"; + P_S_O, AccessPermission:Invalid, desc="Waiting for all outstanding requests"; + S_O, AccessPermission:Invalid, desc="Waiting for all outstanding requests"; + S_NP_PS_O, AccessPermission:Invalid, desc="Waiting for all outstanding requests"; + + SS_P, AccessPermission:Invalid, desc="Waiting for CPU write that we know is there"; + + P_NP_W, AccessPermission:Invalid, desc="Waiting for writeback ack"; + + NP_W, AccessPermission:Invalid, desc="Got a done ack before request, waiting for that victim"; + } + + enumeration(Event, desc="Region directory events") { + CPURead, desc="Access from CPU core"; + CPUWrite, desc="Access from CPU core"; + CPUWriteback, desc="Writeback request from CPU core"; + + ReplRegion, desc="Start a replace on a region"; + + PrivateNotify, desc="Update entry to private state"; + SharedNotify, desc="Update entry to shared state"; + WbNotify, desc="Writeback notification received"; + InvRegion, desc="Start invalidating a region"; + DowngradeRegion,desc="Start invalidating a region"; + + InvAck, desc="Ack from core"; + + DoneAck, desc="Ack from core that request has finished"; + AllOutstanding, desc="All outstanding requests have now finished"; + + Evict, desc="Loopback to evict each block"; + LastAck_PrbResp, desc="Done eviciting all the blocks, got the last ack from core, now respond to region dir"; + LastAck_CleanWb, desc="Done eviciting all the blocks, got the last ack from core, now start clean writeback (note the dir has already been updated)"; + + StallAccess, desc="Wait for the done ack on the address before proceeding"; + StallDoneAck, desc="Wait for the access on the address before proceeding"; + + StaleRequest, desc="Got a stale victim from the cache, fwd it without incrementing outstanding"; + } + + enumeration(RequestType, desc="To communicate stats from transitions to recordStats") { + TagArrayRead, desc="Read the data array"; + TagArrayWrite, desc="Write the data array"; + } + + structure(BoolVec, external="yes") { + bool at(int); + void resize(int); + void clear(); + int size(); + } + + structure(Entry, desc="Region entry", interface="AbstractCacheEntry") { + Addr addr, desc="Base address of this region"; + State RegionState, desc="Region state"; + DataBlock DataBlk, desc="Data for the block (always empty in region buffer)"; + BoolVec ValidBlocks, desc="A vector to keep track of valid blocks"; + int NumValidBlocks, desc="Number of trues in ValidBlocks to avoid iterating"; + BoolVec UsedBlocks, desc="A vector to keep track of blocks ever valid"; + bool dirty, desc="Dirty as best known by the region buffer"; + // This is needed so we don't ack an invalidate until all requests are ordered + int NumOutstandingReqs, desc="Total outstanding private/shared requests"; + BoolVec OutstandingReqs, desc="Blocks that have outstanding private/shared requests"; + bool MustDowngrade, desc="Set when we got a downgrade before the shd or pvt permissions"; + Cycles ProbeRequestTime, default="Cycles(0)", desc="Time region dir started the probe"; + Cycles InitialRequestTime, default="Cycles(0)", desc="Time message was sent to region dir"; + bool MsgSentToDir, desc="True if the current request required a message to the dir"; + bool clearOnDone, default="false", desc="clear valid bit when request completes"; + Addr clearOnDoneAddr, desc="clear valid bit when request completes"; + } + + structure(TBE, desc="...") { + State TBEState, desc="Transient state"; + //int NumValidBlocks, 
desc="Number of blocks valid so we don't have to count a BoolVec"; + BoolVec ValidBlocks, desc="A vector to keep track of valid blocks"; + bool AllAcksReceived, desc="Got all necessary acks from dir"; + bool DoneEvicting, desc="Done iterating through blocks checking for valids"; + BoolVec AcksReceived, desc="Received acks for theses blocks\n"; + bool SendAck, desc="If true, send an ack to the r-dir at end of inv"; + ProbeRequestType MsgType, desc="Type of message to send while 'evicting' "; + int NumOutstandingReqs, desc="Total outstanding private/shared requests"; + BoolVec OutstandingReqs, desc="Blocks that have outstanding private/shared requests"; + MachineID Requestor, desc="Requestor for three hop transactions"; + bool DemandRequest, default="false", desc="Associated with a demand request"; + Addr DemandAddress, desc="Address for the demand request"; + bool DoneAckReceived, default="false", desc="True if the done ack arrived before the message"; + Addr DoneAckAddr, desc="Address of the done ack received early"; + int OutstandingThreshold, desc="Number of outstanding requests to trigger AllOutstanding on"; + + ProbeRequestType NewMsgType, desc="Type of message to send while 'evicting' "; + MachineID NewRequestor, desc="Requestor for three hop transactions"; + bool NewDemandRequest, default="false", desc="Associated with a demand request"; + Addr NewDemandAddress, desc="Address for the demand request"; + bool dirty, desc="dirty"; + bool AllOutstandingTriggered, default="false", desc="bit for only one all outstanding"; + int OutstandingAcks, default="0", desc="number of acks to wait for"; + } + + structure(TBETable, external="yes") { + TBE lookup(Addr); + void allocate(Addr); + void deallocate(Addr); + bool isPresent(Addr); + } + + // Stores only region addresses + TBETable TBEs, template="<RegionBuffer_TBE>", constructor="m_number_of_TBEs"; + int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()"; + + Tick clockEdge(); + Tick cyclesToTicks(Cycles c); + + void set_cache_entry(AbstractCacheEntry b); + void unset_cache_entry(); + void set_tbe(TBE b); + void unset_tbe(); + void wakeUpAllBuffers(); + void wakeUpBuffers(Addr a); + Cycles curCycle(); + + int blockBits, default="RubySystem::getBlockSizeBits()"; + int blockBytes, default="RubySystem::getBlockSizeBytes()"; + int regionBits, default="log2(m_blocksPerRegion)"; + + // Functions + + int getRegionOffset(Addr addr) { + if (blocksPerRegion > 1) { + Addr offset := bitSelect(addr, blockBits, regionBits+blockBits-1); + int ret := addressToInt(offset); + assert(ret < blocksPerRegion); + return ret; + } else { + return 0; + } + } + + Addr getRegionBase(Addr addr) { + return maskLowOrderBits(addr, blockBits+regionBits); + } + + Addr getNextBlock(Addr addr) { + Addr a := addr; + return makeNextStrideAddress(a, 1); + } + + MachineID getPeer(MachineID mach, Addr address) { + if (isOnCPU) { + return createMachineID(MachineType:CorePair, intToID(0)); + } else if (noTCCdir) { + return mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits); + } else { + return createMachineID(MachineType:TCCdir, intToID(0)); + } + } + + bool isOutstanding(TBE tbe, Entry cache_entry, Addr addr) { + if (is_valid(tbe) && tbe.OutstandingReqs.size() > 0) { + DPRINTF(RubySlicc, " outstanding tbe reqs %s %s %d %d\n", + tbe.OutstandingReqs, addr, getRegionOffset(addr), + tbe.OutstandingReqs.at(getRegionOffset(addr))); + return tbe.OutstandingReqs.at(getRegionOffset(addr)); + } else if (is_valid(cache_entry)) { + DPRINTF(RubySlicc, " 
outstanding cache reqs %s %s %d %d\n", + cache_entry.OutstandingReqs, addr, getRegionOffset(addr), + cache_entry.OutstandingReqs.at(getRegionOffset(addr))); + return cache_entry.OutstandingReqs.at(getRegionOffset(addr)); + } else { + return false; + } + } + + bool isOnGPU() { + if (isOnCPU) { + return false; + } + return true; + } + + bool isRead(CoherenceRequestType type) { + return (type == CoherenceRequestType:RdBlk || type == CoherenceRequestType:RdBlkS || + type == CoherenceRequestType:VicClean); + } + + bool presentOrAvail(Addr addr) { + return cacheMemory.isTagPresent(getRegionBase(addr)) || cacheMemory.cacheAvail(getRegionBase(addr)); + } + + // Returns a region entry! + Entry getCacheEntry(Addr addr), return_by_pointer="yes" { + return static_cast(Entry, "pointer", cacheMemory.lookup(getRegionBase(addr))); + } + + TBE getTBE(Addr addr), return_by_pointer="yes" { + return TBEs.lookup(getRegionBase(addr)); + } + + DataBlock getDataBlock(Addr addr), return_by_ref="yes" { + return getCacheEntry(getRegionBase(addr)).DataBlk; + } + + State getState(TBE tbe, Entry cache_entry, Addr addr) { + if (is_valid(tbe)) { + return tbe.TBEState; + } else if (is_valid(cache_entry)) { + return cache_entry.RegionState; + } + return State:NP; + } + + void setState(TBE tbe, Entry cache_entry, Addr addr, State state) { + if (is_valid(tbe)) { + tbe.TBEState := state; + } + if (is_valid(cache_entry)) { + cache_entry.RegionState := state; + } + } + + AccessPermission getAccessPermission(Addr addr) { + TBE tbe := getTBE(addr); + if(is_valid(tbe)) { + return RegionBuffer_State_to_permission(tbe.TBEState); + } + Entry cache_entry := getCacheEntry(addr); + if(is_valid(cache_entry)) { + return RegionBuffer_State_to_permission(cache_entry.RegionState); + } + return AccessPermission:NotPresent; + } + + void functionalRead(Addr addr, Packet *pkt) { + functionalMemoryRead(pkt); + } + + int functionalWrite(Addr addr, Packet *pkt) { + if (functionalMemoryWrite(pkt)) { + return 1; + } else { + return 0; + } + } + + void setAccessPermission(Entry cache_entry, Addr addr, State state) { + if (is_valid(cache_entry)) { + cache_entry.changePermission(RegionBuffer_State_to_permission(state)); + } + } + + void recordRequestType(RequestType stat, Addr addr) { + if (stat == RequestType:TagArrayRead) { + cacheMemory.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (stat == RequestType:TagArrayWrite) { + cacheMemory.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } + } + + bool checkResourceAvailable(RequestType request_type, Addr addr) { + if (request_type == RequestType:TagArrayRead) { + return cacheMemory.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:TagArrayWrite) { + return cacheMemory.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else { + error("Invalid RequestType type in checkResourceAvailable"); + return true; + } + } + + out_port(triggerQueue_out, TriggerMsg, triggerQueue); + + // Overloaded outgoing request nework for both probes to cores and reqeusts + // to the directory. + // Fix Me: These forwarded requests need to be on a separate virtual channel + // to avoid deadlock! 
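[Editor's aside] The RegionBuffer bookkeeping above hinges on getRegionBase() and getRegionOffset(), which split an address into a region-aligned base and a block index within the region. Below is a minimal standalone C++ sketch of that arithmetic, assuming 64-byte cache blocks (blockBits = 6) and the default blocksPerRegion = 64 (regionBits = 6), i.e. 4 KB regions; the constants and the main() driver are illustrative, not taken from a live RubySystem.

#include <cassert>
#include <cinttypes>
#include <cstdint>
#include <cstdio>

// Sketch of the region address math used by getRegionBase()/getRegionOffset()
// in the SLICC above. blockBits/regionBits are assumed values for 64 B blocks
// and 64 blocks per region (4 KB regions), not queried from RubySystem.
using Addr = uint64_t;

static const int blockBits  = 6;  // log2(block size in bytes)
static const int regionBits = 6;  // log2(blocksPerRegion)

static Addr getRegionBase(Addr addr)
{
    // Mask off the low (blockBits + regionBits) bits: every block in a
    // region shares this region-aligned base address.
    return addr & ~((Addr(1) << (blockBits + regionBits)) - 1);
}

static int getRegionOffset(Addr addr)
{
    // Block index inside the region: bits [blockBits, blockBits+regionBits-1].
    return static_cast<int>((addr >> blockBits) & ((1 << regionBits) - 1));
}

int main()
{
    Addr a = 0x12345678;
    assert(getRegionBase(a) == 0x12345000);  // 4 KB-aligned region base
    assert(getRegionOffset(a) == 25);        // block 25 of 64 within the region
    std::printf("base=%#" PRIx64 " offset=%d\n", getRegionBase(a), getRegionOffset(a));
    return 0;
}

In the machine itself, the region base keys every cacheMemory/TBEs lookup, while the offset indexes the per-block BoolVecs (ValidBlocks, UsedBlocks, OutstandingReqs, AcksReceived).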
+ out_port(requestNetwork_out, CPURequestMsg, requestToNetwork); + out_port(probeNetwork_out, NBProbeRequestMsg, requestToNetwork); + + out_port(responseNetwork_out, ResponseMsg, responseToRegDir); + + in_port(triggerQueue_in, TriggerMsg, triggerQueue, rank=4) { + if (triggerQueue_in.isReady(clockEdge())) { + peek(triggerQueue_in, TriggerMsg) { + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := getTBE(in_msg.addr); + DPRINTF(RubySlicc, "trigger msg: %s (%s)\n", in_msg, getRegionBase(in_msg.addr)); + assert(is_valid(tbe)); + if (in_msg.Type == TriggerType:AcksComplete) { + if (tbe.SendAck) { + trigger(Event:LastAck_PrbResp, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:LastAck_CleanWb, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.Type == TriggerType:AllOutstanding) { + trigger(Event:AllOutstanding, in_msg.addr, cache_entry, tbe); + } else { + assert(in_msg.Type == TriggerType:InvNext); + trigger(Event:Evict, in_msg.addr, cache_entry, tbe); + } + } + } + } + + in_port(unblockNetwork_in, UnblockMsg, unblockFromDir, rank=3) { + if (unblockNetwork_in.isReady(clockEdge())) { + peek(unblockNetwork_in, UnblockMsg) { + TBE tbe := getTBE(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + if (in_msg.DoneAck) { + if (isOutstanding(tbe, cache_entry, in_msg.addr)) { + trigger(Event:DoneAck, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:StallDoneAck, in_msg.addr, cache_entry, tbe); + } + } else { + assert(is_valid(tbe)); + trigger(Event:InvAck, in_msg.addr, cache_entry, tbe); + } + } + } + } + + in_port(probeNetwork_in, NBProbeRequestMsg, probeFromRegionDir, rank=2) { + if (probeNetwork_in.isReady(clockEdge())) { + peek(probeNetwork_in, NBProbeRequestMsg) { + TBE tbe := getTBE(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + assert(getRegionBase(in_msg.addr) == in_msg.addr); + if (in_msg.Type == ProbeRequestType:PrbInv) { + trigger(Event:InvRegion, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == ProbeRequestType:PrbDowngrade) { + trigger(Event:DowngradeRegion, in_msg.addr, cache_entry, tbe); + } else { + error("Unknown probe message\n"); + } + } + } + } + + in_port(notifyNetwork_in, CPURequestMsg, notifyFromRegionDir, rank=1) { + if (notifyNetwork_in.isReady(clockEdge())) { + peek(notifyNetwork_in, CPURequestMsg) { + TBE tbe := getTBE(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + //Fix Me...add back in: assert(is_valid(cache_entry)); + if (in_msg.Type == CoherenceRequestType:WbNotify) { + trigger(Event:WbNotify, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:SharedNotify) { + trigger(Event:SharedNotify, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:PrivateNotify) { + trigger(Event:PrivateNotify, in_msg.addr, cache_entry, tbe); + } else { + error("Unknown notify message\n"); + } + } + } + } + + // In from cores + // NOTE: We get the cache / TBE entry based on the region address, + // but pass the block address to the actions + in_port(requestNetwork_in, CPURequestMsg, requestFromCore, rank=0) { + if (requestNetwork_in.isReady(clockEdge())) { + peek(requestNetwork_in, CPURequestMsg) { + TBE tbe := getTBE(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + if (is_valid(tbe) && tbe.DoneAckReceived && tbe.DoneAckAddr == in_msg.addr) { + DPRINTF(RubySlicc, "Stale/Stall request %s\n", in_msg.Type); + if (in_msg.Type == CoherenceRequestType:VicDirty || in_msg.Type == CoherenceRequestType:VicClean ) + { + 
trigger(Event:StaleRequest, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:StallAccess, in_msg.addr, cache_entry, tbe); + } + } else if (isOutstanding(tbe, cache_entry, in_msg.addr)) { + DPRINTF(RubySlicc, "Stall outstanding request %s\n", in_msg.Type); + trigger(Event:StallAccess, in_msg.addr, cache_entry, tbe); + } else { + if (presentOrAvail(in_msg.addr)) { + if (in_msg.Type == CoherenceRequestType:RdBlkM ) { + trigger(Event:CPUWrite, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:WriteThrough ) { + trigger(Event:CPUWrite, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:Atomic ) { + trigger(Event:CPUWrite, in_msg.addr, cache_entry, tbe); + } else { + if (in_msg.Type == CoherenceRequestType:VicDirty || + in_msg.Type == CoherenceRequestType:VicClean) { + trigger(Event:CPUWriteback, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:CPURead, in_msg.addr, cache_entry, tbe); + } + } + } else { + Addr victim := cacheMemory.cacheProbe(getRegionBase(in_msg.addr)); + TBE victim_tbe := getTBE(victim); + Entry victim_entry := getCacheEntry(victim); + DPRINTF(RubySlicc, "Replacing region %s for %s(%s)\n", victim, in_msg.addr, getRegionBase(in_msg.addr)); + trigger(Event:ReplRegion, victim, victim_entry, victim_tbe); + } + } + } + } + } + + // Actions + action(f_fwdReqToDir, "f", desc="Forward CPU request to directory") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(requestNetwork_out, CPURequestMsg, toDirLatency) { + out_msg.addr := in_msg.addr; + out_msg.Type := in_msg.Type; + out_msg.DataBlk := in_msg.DataBlk; + out_msg.Dirty := in_msg.Dirty; + out_msg.Requestor := in_msg.Requestor; + out_msg.WTRequestor := in_msg.WTRequestor; + out_msg.Destination.add(map_Address_to_Directory(in_msg.addr)); + out_msg.Shared := in_msg.Shared; + out_msg.MessageSize := in_msg.MessageSize; + out_msg.Private := true; + out_msg.InitialRequestTime := curCycle(); + out_msg.ProbeRequestStartTime := curCycle(); + if (getState(tbe, cache_entry, address) == State:S) { + out_msg.ForceShared := true; + } + DPRINTF(RubySlicc, "Fwd: %s\n", out_msg); + //assert(getState(tbe, cache_entry, address) == State:P || getState(tbe, cache_entry, address) == State:S); + if (getState(tbe, cache_entry, address) == State:NP_W) { + APPEND_TRANSITION_COMMENT(" fwding stale request: "); + APPEND_TRANSITION_COMMENT(out_msg.Type); + } + } + } + } + + action(u_updateRegionEntry, "u", desc="Update the entry for profiling") { + peek(requestNetwork_in, CPURequestMsg) { + if (is_valid(cache_entry)) { + if (in_msg.CtoDSinked == false) { + APPEND_TRANSITION_COMMENT(" incr outstanding "); + cache_entry.NumOutstandingReqs := 1 + cache_entry.NumOutstandingReqs; + assert(cache_entry.OutstandingReqs.at(getRegionOffset(address)) == false); + cache_entry.OutstandingReqs.at(getRegionOffset(address)) := true; + assert(cache_entry.NumOutstandingReqs == countBoolVec(cache_entry.OutstandingReqs)); + } else { + APPEND_TRANSITION_COMMENT(" NOT incr outstanding "); + assert(in_msg.Type == CoherenceRequestType:RdBlkM || in_msg.Type == CoherenceRequestType:RdBlkS); + } + APPEND_TRANSITION_COMMENT(cache_entry.NumOutstandingReqs); + if (in_msg.Type == CoherenceRequestType:RdBlkM || in_msg.Type == CoherenceRequestType:Atomic || + in_msg.Type == CoherenceRequestType:WriteThrough ) + { + cache_entry.dirty := true; + } + if (in_msg.Type == CoherenceRequestType:VicDirty || + in_msg.Type == CoherenceRequestType:VicClean) { + DPRINTF(RubySlicc, "Got %s for addr %s\n", in_msg.Type, 
address); + //assert(cache_entry.ValidBlocks.at(getRegionOffset(address))); + // can in fact be inv if core got an inv after a vicclean before it got here + if (cache_entry.ValidBlocks.at(getRegionOffset(address))) { + cache_entry.clearOnDone := true; + cache_entry.clearOnDoneAddr := address; + //cache_entry.ValidBlocks.at(getRegionOffset(address)) := false; + //cache_entry.NumValidBlocks := cache_entry.NumValidBlocks - 1; + } + } else { + if (cache_entry.ValidBlocks.at(getRegionOffset(address)) == false) { + cache_entry.NumValidBlocks := cache_entry.NumValidBlocks + 1; + } + DPRINTF(RubySlicc, "before valid addr %s bits %s\n", + in_msg.Type, address, cache_entry.ValidBlocks); + cache_entry.ValidBlocks.at(getRegionOffset(address)) := true; + DPRINTF(RubySlicc, "after valid addr %s bits %s\n", + in_msg.Type, address, cache_entry.ValidBlocks); + cache_entry.UsedBlocks.at(getRegionOffset(address)) := true; + } + assert(cache_entry.NumValidBlocks <= blocksPerRegion); + assert(cache_entry.NumValidBlocks >= 0); + APPEND_TRANSITION_COMMENT(" valid blocks "); + APPEND_TRANSITION_COMMENT(cache_entry.ValidBlocks); + } else { + error("This shouldn't happen anymore I think"); + //tbe.ValidBlocks.at(getRegionOffest(address)) := true; + assert(getState(tbe, cache_entry, address) == State:P_NP); + } + } + } + + action(uw_updatePossibleWriteback, "uw", desc="writeback request complete") { + peek(unblockNetwork_in, UnblockMsg) { + if (is_valid(cache_entry) && in_msg.validToInvalid && + cache_entry.clearOnDone && cache_entry.clearOnDoneAddr == address) { + DPRINTF(RubySlicc, "I have no idea what is going on here\n"); + cache_entry.ValidBlocks.at(getRegionOffset(address)) := false; + cache_entry.NumValidBlocks := cache_entry.NumValidBlocks - 1; + cache_entry.clearOnDone := false; + } + } + } + + + action(rp_requestPrivate, "rp", desc="Send private request r-dir") { + peek(requestNetwork_in, CPURequestMsg) { + // No need to send acks on replacements + assert(is_invalid(tbe)); + enqueue(requestNetwork_out, CPURequestMsg, toRegionDirLatency) { + out_msg.addr := address; // use the actual address so the demand request can be fulfilled + out_msg.DemandAddress := address; + out_msg.Type := CoherenceRequestType:PrivateRequest; + out_msg.OriginalType := in_msg.Type; + out_msg.Requestor := machineID; + out_msg.WTRequestor := in_msg.WTRequestor; + out_msg.InitialRequestTime := curCycle(); + // will this always be ok? probably not for multisocket + out_msg.Destination.add(map_Address_to_RegionDir(address)); + out_msg.MessageSize := MessageSizeType:Request_Control; + DPRINTF(RubySlicc, "Private request %s\n", out_msg); + } + cache_entry.ProbeRequestTime := curCycle(); + cache_entry.MsgSentToDir := true; + APPEND_TRANSITION_COMMENT(getRegionBase(address)); + } + } + + action(ru_requestUpgrade, "ru", desc="Send upgrade request r-dir") { + peek(requestNetwork_in, CPURequestMsg) { + // No need to send acks on replacements + assert(is_invalid(tbe)); + enqueue(requestNetwork_out, CPURequestMsg, toRegionDirLatency) { + out_msg.addr := address; // use the actual address so the demand request can be fulfilled + out_msg.Type := CoherenceRequestType:UpgradeRequest; + out_msg.OriginalType := in_msg.Type; + out_msg.Requestor := machineID; + out_msg.WTRequestor := in_msg.WTRequestor; + out_msg.InitialRequestTime := curCycle(); + // will this always be ok? 
probably not for multisocket + out_msg.Destination.add(map_Address_to_RegionDir(address)); + out_msg.MessageSize := MessageSizeType:Request_Control; + } + cache_entry.ProbeRequestTime := curCycle(); + cache_entry.MsgSentToDir := true; + APPEND_TRANSITION_COMMENT(getRegionBase(address)); + } + } + + action(rw_requestWriteback, "rq", desc="Send writeback request") { + // No need to send acks on replacements + enqueue(requestNetwork_out, CPURequestMsg, toRegionDirLatency) { + out_msg.addr := getRegionBase(address); // use the actual address so the demand request can be fulfilled + out_msg.Type := CoherenceRequestType:CleanWbRequest; + out_msg.Requestor := machineID; + out_msg.Destination.add(map_Address_to_RegionDir(address)); // will this always be ok? probably not for multisocket + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.Dirty := tbe.dirty; + APPEND_TRANSITION_COMMENT(getRegionBase(address)); + } + } + + action(rs_requestShared, "rs", desc="Send shared request r-dir") { + peek(requestNetwork_in, CPURequestMsg) { + // No need to send acks on replacements + assert(is_invalid(tbe)); + enqueue(requestNetwork_out, CPURequestMsg, toRegionDirLatency) { + out_msg.addr := address; // use the actual address so the demand request can be fulfilled + out_msg.Type := CoherenceRequestType:SharedRequest; + out_msg.OriginalType := in_msg.Type; + out_msg.Requestor := machineID; + out_msg.WTRequestor := in_msg.WTRequestor; + out_msg.InitialRequestTime := curCycle(); + // will this always be ok? probably not for multisocket + out_msg.Destination.add(map_Address_to_RegionDir(address)); + out_msg.MessageSize := MessageSizeType:Request_Control; + } + cache_entry.ProbeRequestTime := curCycle(); + cache_entry.MsgSentToDir := true; + APPEND_TRANSITION_COMMENT(getRegionBase(address)); + } + } + + action(ai_ackRegionInv, "ai", desc="Send ack to r-dir on region inv if tbe says so") { + // No need to send acks on replacements + assert(is_valid(tbe)); + enqueue(responseNetwork_out, ResponseMsg, toRegionDirLatency) { + out_msg.addr := getRegionBase(address); + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_RegionDir(address)); // will this always be ok? probably not for multisocket + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(ad_ackDircetory, "ad", desc="send probe response to directory") { + if (noTCCdir && tbe.MsgType == ProbeRequestType:PrbDowngrade && isOnGPU()) { //VIPER tcc doesnt understand PrbShrData + assert(tbe.DemandRequest); //So, let RegionBuffer take care of sending back ack + enqueue(responseNetwork_out, ResponseMsg, toDirLatency) { + out_msg.addr := tbe.DemandAddress; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes + out_msg.Sender := getPeer(machineID,address); + out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? 
probably not for multisocket + out_msg.Dirty := false; // only true if sending back data i think + out_msg.Hit := false; + out_msg.Ntsl := false; + out_msg.State := CoherenceState:NA; + out_msg.NoAckNeeded := true; + out_msg.MessageSize := MessageSizeType:Response_Control; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + + action(aie_ackRegionExclusiveInv, "aie", desc="Send ack to r-dir on region inv if tbe says so") { + // No need to send acks on replacements + assert(is_valid(tbe)); + enqueue(responseNetwork_out, ResponseMsg, toRegionDirLatency) { + out_msg.addr := getRegionBase(address); + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + out_msg.NotCached := true; + out_msg.Destination.add(map_Address_to_RegionDir(address)); // will this always be ok? probably not for multisocket + out_msg.MessageSize := MessageSizeType:Response_Control; + out_msg.Dirty := tbe.dirty; + } + } + + action(ain_ackRegionInvNow, "ain", desc="Send ack to r-dir on region inv") { + enqueue(responseNetwork_out, ResponseMsg, toRegionDirLatency) { + out_msg.addr := getRegionBase(address); + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_RegionDir(address)); // will this always be ok? probably not for multisocket + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(aine_ackRegionInvExlusiveNow, "aine", desc="Send ack to r-dir on region inv with exlusive permission") { + enqueue(responseNetwork_out, ResponseMsg, toRegionDirLatency) { + out_msg.addr := getRegionBase(address); + out_msg.Type := CoherenceResponseType:CPUPrbResp; + out_msg.Sender := machineID; + out_msg.NotCached := true; + out_msg.Destination.add(map_Address_to_RegionDir(address)); // will this always be ok? probably not for multisocket + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(ap_ackPrivateNotify, "ap", desc="Send ack to r-dir on private notify") { + enqueue(responseNetwork_out, ResponseMsg, toRegionDirLatency) { + out_msg.addr := getRegionBase(address); + out_msg.Type := CoherenceResponseType:PrivateAck; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_RegionDir(address)); // will this always be ok? probably not for multisocket + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + + action(aw_ackWbNotify, "aw", desc="Send ack to r-dir on writeback notify") { + peek(notifyNetwork_in, CPURequestMsg) { + if (in_msg.NoAckNeeded == false) { + enqueue(responseNetwork_out, ResponseMsg, toRegionDirLatency) { + out_msg.addr := getRegionBase(address); + out_msg.Type := CoherenceResponseType:RegionWbAck; + out_msg.Sender := machineID; + out_msg.Destination.add(map_Address_to_RegionDir(address)); // will this always be ok? 
probably not for multisocket + out_msg.MessageSize := MessageSizeType:Response_Control; + } + } + } + } + + action(e_evictCurrent, "e", desc="Evict this block in the region") { + // send force invalidate message to directory to invalidate this block + // must invalidate all blocks since region buffer could have privitized it + if (tbe.ValidBlocks.at(getRegionOffset(address)) && + (tbe.DemandRequest == false || tbe.DemandAddress != address)) { + DPRINTF(RubySlicc, "trying to evict address %s (base: %s, offset: %d)\n", address, getRegionBase(address), getRegionOffset(address)); + DPRINTF(RubySlicc, "tbe valid blocks %s\n", tbe.ValidBlocks); + + enqueue(probeNetwork_out, NBProbeRequestMsg, 1) { + out_msg.addr := address; + out_msg.Type := tbe.MsgType; + out_msg.ReturnData := true; + if (address == tbe.DemandAddress) { + out_msg.DemandRequest := true; + } + out_msg.MessageSize := MessageSizeType:Control; + out_msg.Destination.add(getPeer(machineID,address)); + DPRINTF(RubySlicc, "%s\n", out_msg); + } + APPEND_TRANSITION_COMMENT(" current "); + APPEND_TRANSITION_COMMENT(tbe.ValidBlocks.at(getRegionOffset(address))); + tbe.AllAcksReceived := false; + } else { + DPRINTF(RubySlicc, "Not evicting demand %s\n", address); + } + } + + action(ed_evictDemand, "ed", desc="Evict the demand request if it's valid") { + if (noTCCdir && tbe.MsgType == ProbeRequestType:PrbDowngrade && isOnGPU()) { + tbe.OutstandingAcks := 0; + tbe.AllAcksReceived := true; + tbe.DoneEvicting := true; + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.Type := TriggerType:AcksComplete; + out_msg.addr := getRegionBase(address); + } + } else if (tbe.DemandRequest) { + enqueue(probeNetwork_out, NBProbeRequestMsg, 1) { + out_msg.addr := tbe.DemandAddress; + out_msg.Type := tbe.MsgType; + out_msg.ReturnData := true; + out_msg.DemandRequest := true; + out_msg.MessageSize := MessageSizeType:Control; + out_msg.Destination.add(getPeer(machineID,address)); + DPRINTF(RubySlicc, "%s\n", out_msg); + tbe.AllAcksReceived := false; + } + if (tbe.ValidBlocks.at(getRegionOffset(tbe.DemandAddress)) == false) { + tbe.OutstandingAcks := tbe.OutstandingAcks + 1; + } + APPEND_TRANSITION_COMMENT("Evicting demand "); + APPEND_TRANSITION_COMMENT(tbe.DemandAddress); + } + APPEND_TRANSITION_COMMENT("waiting acks "); + APPEND_TRANSITION_COMMENT(tbe.OutstandingAcks); + } + + action(adp_AckDemandProbe, "fp", desc="forward demand probe even if we know that the core is invalid") { + peek(probeNetwork_in, NBProbeRequestMsg) { + if (in_msg.DemandRequest) { + enqueue(responseNetwork_out, ResponseMsg, toDirLatency) { + out_msg.addr := in_msg.DemandAddress; + out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes + out_msg.Sender := getPeer(machineID,address); + out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? 
probably not for multisocket + out_msg.Dirty := false; // only true if sending back data i think + out_msg.Hit := false; + out_msg.Ntsl := false; + out_msg.State := CoherenceState:NA; + out_msg.NoAckNeeded := true; + out_msg.MessageSize := MessageSizeType:Response_Control; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + } + + action(en_enqueueNextEvict, "en", desc="Queue evict the next block in the region") { + // increment in_msg.addr by blockSize bytes and enqueue on triggerPort + // Only enqueue if the next address doesn't overrun the region bound + if (getRegionBase(getNextBlock(address)) == getRegionBase(address)) { + enqueue(triggerQueue_out, TriggerMsg, nextEvictLatency) { + out_msg.Type := TriggerType:InvNext; + out_msg.addr := getNextBlock(address); + } + } else { + tbe.DoneEvicting := true; + DPRINTF(RubySlicc, "Done evicing region %s\n", getRegionBase(address)); + DPRINTF(RubySlicc, "Waiting for %s acks\n", tbe.OutstandingAcks); + if (tbe.AllAcksReceived == true) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.Type := TriggerType:AcksComplete; + out_msg.addr := getRegionBase(address); + } + } + } + } + + action(ef_enqueueFirstEvict, "ef", desc="Queue the first block in the region to be evicted") { + if (tbe.DoneEvicting == false) { + enqueue(triggerQueue_out, TriggerMsg, nextEvictLatency) { + out_msg.Type := TriggerType:InvNext; + out_msg.addr := getRegionBase(address); + } + } + } + + action(ra_receiveAck, "ra", desc="Mark TBE entry as received this ack") { + DPRINTF(RubySlicc, "received ack for %s reg: %s vec: %s pos: %d\n", + address, getRegionBase(address), tbe.ValidBlocks, getRegionOffset(address)); + peek(unblockNetwork_in, UnblockMsg) { + // + // Note the tbe ValidBlock vec will be a conservative list of the + // valid blocks since the cache entry ValidBlock vec is set on the + // request + // + if (in_msg.wasValid) { + assert(tbe.ValidBlocks.at(getRegionOffset(address))); + } + } + tbe.OutstandingAcks := tbe.OutstandingAcks - 1; + tbe.AcksReceived.at(getRegionOffset(address)) := true; + assert(tbe.OutstandingAcks >= 0); + if (tbe.OutstandingAcks == 0) { + tbe.AllAcksReceived := true; + if (tbe.DoneEvicting) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.Type := TriggerType:AcksComplete; + out_msg.addr := getRegionBase(address); + } + } + } + + APPEND_TRANSITION_COMMENT(getRegionBase(address)); + APPEND_TRANSITION_COMMENT(" Acks left receive "); + APPEND_TRANSITION_COMMENT(tbe.OutstandingAcks); + } + + action(do_decrementOutstanding, "do", desc="Decrement outstanding requests") { + APPEND_TRANSITION_COMMENT(" decr outstanding "); + if (is_valid(cache_entry)) { + cache_entry.NumOutstandingReqs := cache_entry.NumOutstandingReqs - 1; + assert(cache_entry.OutstandingReqs.at(getRegionOffset(address))); + cache_entry.OutstandingReqs.at(getRegionOffset(address)) := false; + assert(cache_entry.NumOutstandingReqs >= 0); + assert(cache_entry.NumOutstandingReqs == countBoolVec(cache_entry.OutstandingReqs)); + APPEND_TRANSITION_COMMENT(cache_entry.NumOutstandingReqs); + } + if (is_valid(tbe)) { + tbe.NumOutstandingReqs := tbe.NumOutstandingReqs - 1; + assert(tbe.OutstandingReqs.at(getRegionOffset(address))); + tbe.OutstandingReqs.at(getRegionOffset(address)) := false; + assert(tbe.NumOutstandingReqs >= 0); + assert(tbe.NumOutstandingReqs == countBoolVec(tbe.OutstandingReqs)); + APPEND_TRANSITION_COMMENT(tbe.NumOutstandingReqs); + } + } + + action(co_checkOutstanding, "co", desc="check if there are no more outstanding requests") { + assert(is_valid(tbe)); + if 
((tbe.NumOutstandingReqs <= tbe.OutstandingThreshold) && + (tbe.AllOutstandingTriggered == false)) { + APPEND_TRANSITION_COMMENT(" no more outstanding: "); + APPEND_TRANSITION_COMMENT(tbe.NumOutstandingReqs); + APPEND_TRANSITION_COMMENT(tbe.OutstandingThreshold); + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.Type := TriggerType:AllOutstanding; + if (tbe.DemandRequest) { + out_msg.addr := tbe.DemandAddress; + } else { + out_msg.addr := getRegionBase(address); + } + DPRINTF(RubySlicc, "co enqueuing %s\n", out_msg); + tbe.AllOutstandingTriggered := true; + } + } else { + APPEND_TRANSITION_COMMENT(" still more outstanding "); + } + } + + action(ro_resetAllOutstanding, "ro", desc="Reset all outstanding") { + tbe.AllOutstandingTriggered := false; + } + + action(so_setOutstandingCheckOne, "so", desc="Check outstanding is waiting for 1, not 0") { + // Need this for S_P because one request is outstanding between here and r-dir + tbe.OutstandingThreshold := 1; + } + + action(a_allocateRegionEntry, "a", desc="Allocate a new entry") { + set_cache_entry(cacheMemory.allocate(getRegionBase(address), new Entry)); + cache_entry.ValidBlocks.clear(); + cache_entry.ValidBlocks.resize(blocksPerRegion); + cache_entry.UsedBlocks.clear(); + cache_entry.UsedBlocks.resize(blocksPerRegion); + cache_entry.dirty := false; + cache_entry.NumOutstandingReqs := 0; + cache_entry.OutstandingReqs.clear(); + cache_entry.OutstandingReqs.resize(blocksPerRegion); + } + + action(d_deallocateRegionEntry, "d", desc="Deallocate region entry") { + cacheMemory.deallocate(getRegionBase(address)); + unset_cache_entry(); + } + + action(t_allocateTBE, "t", desc="allocate TBE Entry") { + check_allocate(TBEs); + TBEs.allocate(getRegionBase(address)); + set_tbe(getTBE(address)); + tbe.OutstandingAcks := 0; + tbe.AllAcksReceived := true; // starts true since the region could be empty + tbe.DoneEvicting := false; + tbe.AcksReceived.clear(); + tbe.AcksReceived.resize(blocksPerRegion); + tbe.SendAck := false; + tbe.OutstandingThreshold := 0; + if (is_valid(cache_entry)) { + tbe.NumOutstandingReqs := cache_entry.NumOutstandingReqs; + tbe.OutstandingReqs := cache_entry.OutstandingReqs; + assert(tbe.NumOutstandingReqs == countBoolVec(tbe.OutstandingReqs)); + tbe.dirty := cache_entry.dirty; + tbe.ValidBlocks := cache_entry.ValidBlocks; + tbe.OutstandingAcks := countBoolVec(tbe.ValidBlocks); + APPEND_TRANSITION_COMMENT(" tbe valid blocks "); + APPEND_TRANSITION_COMMENT(tbe.ValidBlocks); + APPEND_TRANSITION_COMMENT(" cache valid blocks "); + APPEND_TRANSITION_COMMENT(cache_entry.ValidBlocks); + } else { + tbe.dirty := false; + } + } + + action(m_markSendAck, "m", desc="Mark TBE that we need to ack at end") { + assert(is_valid(tbe)); + tbe.SendAck := true; + } + + action(db_markDirtyBit, "db", desc="Mark TBE dirty bit") { + peek(unblockNetwork_in, UnblockMsg) { + if (is_valid(tbe)) { + tbe.dirty := tbe.dirty || in_msg.Dirty; + } + } + } + + action(dr_markDoneAckReceived, "dr", desc="Mark TBE that a done ack has been received") { + assert(is_valid(tbe)); + tbe.DoneAckReceived := true; + tbe.DoneAckAddr := address; + APPEND_TRANSITION_COMMENT(" marking done ack on TBE "); + } + + action(se_setTBE, "se", desc="Set msg type to evict") { + peek(probeNetwork_in, NBProbeRequestMsg) { + tbe.MsgType := in_msg.Type; + tbe.Requestor := in_msg.Requestor; + tbe.DemandAddress := in_msg.DemandAddress; + tbe.DemandRequest := in_msg.DemandRequest; + } + } + + action(sne_setNewTBE, "sne", desc="Set msg type to evict") { + peek(probeNetwork_in, 
NBProbeRequestMsg) { + tbe.NewMsgType := in_msg.Type; + tbe.NewRequestor := in_msg.Requestor; + tbe.NewDemandAddress := in_msg.DemandAddress; + tbe.NewDemandRequest := in_msg.DemandRequest; + } + } + + action(soe_setOldTBE, "soe", desc="Set msg type to evict") { + tbe.MsgType := tbe.NewMsgType; + tbe.Requestor := tbe.NewRequestor; + tbe.DemandAddress := tbe.NewDemandAddress; + tbe.DemandRequest := tbe.NewDemandRequest; + tbe.OutstandingAcks := countBoolVec(tbe.ValidBlocks); + tbe.AllAcksReceived := true; // starts true since the region could be empty + tbe.DoneEvicting := false; + tbe.AcksReceived.clear(); + tbe.AcksReceived.resize(blocksPerRegion); + tbe.SendAck := false; + } + + action(ser_setTBE, "ser", desc="Set msg type to evict repl") { + tbe.MsgType := ProbeRequestType:PrbInv; + } + + action(md_setMustDowngrade, "md", desc="When permissions finally get here, must be shared") { + assert(is_valid(cache_entry)); + cache_entry.MustDowngrade := true; + } + + action(dt_deallocateTBE, "dt", desc="deallocate TBE Entry") { + TBEs.deallocate(getRegionBase(address)); + unset_tbe(); + } + + action(p_popRequestQueue, "p", desc="Pop the request queue") { + requestNetwork_in.dequeue(clockEdge()); + } + + action(pl_popUnblockQueue, "pl", desc="Pop the unblock queue") { + unblockNetwork_in.dequeue(clockEdge()); + } + + action(pn_popNotifyQueue, "pn", desc="Pop the notify queue") { + notifyNetwork_in.dequeue(clockEdge()); + } + + action(pp_popProbeQueue, "pp", desc="Pop the probe queue") { + probeNetwork_in.dequeue(clockEdge()); + } + + action(pt_popTriggerQueue, "pt", desc="Pop the trigger queue") { + DPRINTF(RubySlicc, "Trigger Before Contents: %s\n", triggerQueue_in); + triggerQueue_in.dequeue(clockEdge()); + DPRINTF(RubySlicc, "Trigger After Contents: %s\n", triggerQueue_in); + } + + // Must always use wake all, since non-region address wait on region addresses + action(wa_wakeUpAllDependents, "wa", desc="Wake up any requests waiting for this region") { + wakeUpAllBuffers(); + } + + action(zz_stallAndWaitRequestQueue, "\z", desc="recycle request queue") { + Addr regAddr := getRegionBase(address); + DPRINTF(RubySlicc, "Stalling address %s\n", regAddr); + stall_and_wait(requestNetwork_in, regAddr); + } + + action(yy_stallAndWaitProbeQueue, "\y", desc="stall probe queue") { + Addr regAddr := getRegionBase(address); + stall_and_wait(probeNetwork_in, regAddr); + } + + action(yyy_recycleProbeQueue, "\yy", desc="recycle probe queue") { + probeNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(zzz_recycleRequestQueue, "\zz", desc="recycle request queue") { + requestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(www_recycleUnblockNetwork, "\ww", desc="recycle unblock queue") { + unblockNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(z_stall, "z", desc="stall request queue") { + // fake state + } + + action(mru_setMRU, "mru", desc="set MRU") { + cacheMemory.setMRU(address, cache_entry.NumValidBlocks); + } + + // Transitions + + transition({NP_PS, S_P, S_NP_PS, P_NP, P_S, P_NP_O, S_NP_PS_O, P_S_O, S_O, P_NP_W, P_NP_NP, NP_W}, {CPURead, CPUWriteback, CPUWrite}) {} { + zz_stallAndWaitRequestQueue; + } + + transition(SS_P, {CPURead, CPUWriteback}) { + zz_stallAndWaitRequestQueue; + } + + transition({NP, S, P, NP_PS, S_P, S_NP_PS, P_NP, P_S, P_NP_O, S_NP_PS_O, P_S_O, S_O, SS_P, NP_W, P_NP_NP}, StallAccess) {} { + zz_stallAndWaitRequestQueue; + } + + transition({S, P, NP_PS, S_P, S_NP_PS, P_NP, P_S, P_NP_O, S_NP_PS_O, 
P_S_O, S_O, SS_P, P_NP_W, P_NP_NP, NP_W}, StallDoneAck) { + www_recycleUnblockNetwork; + } + + transition(NP, StallDoneAck, NP_W) { + t_allocateTBE; + db_markDirtyBit; + dr_markDoneAckReceived; + pl_popUnblockQueue; + } + + transition(NP_W, StaleRequest, NP) { + f_fwdReqToDir; + dt_deallocateTBE; + wa_wakeUpAllDependents; + p_popRequestQueue; + } + + transition(P_NP_O, DowngradeRegion) {} { + z_stall; // should stall and wait + } + + transition({NP_PS, S_NP_PS, S_P, P_S, P_NP_O, S_NP_PS_O, P_S_O, S_O, SS_P}, ReplRegion) {} { + zz_stallAndWaitRequestQueue; // can't let things get out of order! + } + + transition({P_NP_O, S_O, SS_P}, InvRegion) {} { + yyy_recycleProbeQueue; // can't be z_stall because there could be a RdBlkM in the requestQueue which has the sinked flag which is blocking the inv + } + + transition(P_NP, {InvRegion, DowngradeRegion}, P_NP_NP) {} { + sne_setNewTBE; + pp_popProbeQueue; + } + + transition(S_P, DowngradeRegion) {} { + adp_AckDemandProbe; + ain_ackRegionInvNow; + pp_popProbeQueue; + } + + transition(P_NP_W, InvRegion) { + adp_AckDemandProbe; + ain_ackRegionInvNow; + pp_popProbeQueue; + } + + transition(P_NP_W, DowngradeRegion) { + adp_AckDemandProbe; + aine_ackRegionInvExlusiveNow; + pp_popProbeQueue; + } + + transition({P, S}, {CPURead, CPUWriteback}) {TagArrayRead, TagArrayWrite} { + mru_setMRU; + f_fwdReqToDir; + u_updateRegionEntry; + p_popRequestQueue; + } + + transition(P, CPUWrite) {TagArrayRead, TagArrayWrite} { + mru_setMRU; + f_fwdReqToDir; + u_updateRegionEntry; + p_popRequestQueue; + } + + transition(S, CPUWrite, S_O) {TagArrayRead} { + mru_setMRU; + t_allocateTBE; + co_checkOutstanding; + zz_stallAndWaitRequestQueue; + } + + transition(S_O, AllOutstanding, SS_P) { + wa_wakeUpAllDependents; + ro_resetAllOutstanding; + pt_popTriggerQueue; + } + + transition(SS_P, CPUWrite, S_P) { + mru_setMRU; + dt_deallocateTBE; + ru_requestUpgrade; + u_updateRegionEntry; + p_popRequestQueue; + } + + transition(NP, {CPURead, CPUWriteback}, NP_PS) {TagArrayRead, TagArrayWrite} { + a_allocateRegionEntry; + rs_requestShared; + u_updateRegionEntry; + p_popRequestQueue;//zz_stallAndWaitRequestQueue; + } + + transition(NP, CPUWrite, NP_PS) {TagArrayRead, TagArrayWrite} { + a_allocateRegionEntry; + rp_requestPrivate; + u_updateRegionEntry; + p_popRequestQueue;//zz_stallAndWaitRequestQueue; + } + + transition(NP_PS, PrivateNotify, P) {} { + ap_ackPrivateNotify; + wa_wakeUpAllDependents; + pn_popNotifyQueue; + } + + transition(S_P, PrivateNotify, P) {} { + ap_ackPrivateNotify; + wa_wakeUpAllDependents; + pn_popNotifyQueue; + } + + transition(NP_PS, SharedNotify, S) {} { + ap_ackPrivateNotify; + wa_wakeUpAllDependents; + pn_popNotifyQueue; + } + + transition(P_NP_W, WbNotify, NP) {} { + aw_ackWbNotify; + wa_wakeUpAllDependents; + dt_deallocateTBE; + pn_popNotifyQueue; + } + + transition({P, S}, ReplRegion, P_NP_O) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + ser_setTBE; + d_deallocateRegionEntry; + co_checkOutstanding; + } + + transition({P, S}, InvRegion, P_NP_O) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + se_setTBE; + m_markSendAck; + d_deallocateRegionEntry; + co_checkOutstanding; + pp_popProbeQueue; + } + + transition(P_NP_O, AllOutstanding, P_NP) {} { + ed_evictDemand; + ef_enqueueFirstEvict; + ro_resetAllOutstanding; + pt_popTriggerQueue; + } + + transition(S_P, InvRegion, S_NP_PS_O) {TagArrayRead} { + t_allocateTBE; + se_setTBE; + m_markSendAck; + so_setOutstandingCheckOne; + co_checkOutstanding; + pp_popProbeQueue; + } + + transition(S_NP_PS_O, 
AllOutstanding, S_NP_PS) { + ed_evictDemand; + ef_enqueueFirstEvict; + ro_resetAllOutstanding; + pt_popTriggerQueue; + } + + transition(P, DowngradeRegion, P_S_O) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + se_setTBE; + m_markSendAck; + co_checkOutstanding; + pp_popProbeQueue; + } + + transition(P_S_O, AllOutstanding, P_S) {} { + ed_evictDemand; + ef_enqueueFirstEvict; + ro_resetAllOutstanding; + pt_popTriggerQueue; + } + + transition({P, S}, DoneAck) {TagArrayWrite} { + do_decrementOutstanding; + wa_wakeUpAllDependents; + db_markDirtyBit; + uw_updatePossibleWriteback; + pl_popUnblockQueue; + } + + transition({S_P, NP_PS, S_NP_PS}, DoneAck) {TagArrayWrite} { + www_recycleUnblockNetwork; + } + + transition({P_NP_O, S_NP_PS_O, P_S_O, S_O}, DoneAck) {} { + do_decrementOutstanding; + co_checkOutstanding; + db_markDirtyBit; + uw_updatePossibleWriteback; + pl_popUnblockQueue; + } + + transition({P_NP, P_S, S_NP_PS, P_NP_NP}, Evict) {} { + e_evictCurrent; + en_enqueueNextEvict; + pt_popTriggerQueue; + } + + transition({P_NP, P_S, S_NP_PS, P_NP_NP}, InvAck) {} { + ra_receiveAck; + db_markDirtyBit; + pl_popUnblockQueue; + } + + transition(P_NP, LastAck_CleanWb, P_NP_W) {} { + rw_requestWriteback; + pt_popTriggerQueue; + } + + transition(P_NP_NP, LastAck_CleanWb, P_NP) {} { + soe_setOldTBE; + m_markSendAck; + ed_evictDemand; + ef_enqueueFirstEvict; + pt_popTriggerQueue; + } + + transition(P_NP, LastAck_PrbResp, NP) {} { + aie_ackRegionExclusiveInv; + dt_deallocateTBE; + wa_wakeUpAllDependents; + pt_popTriggerQueue; + } + + transition(S_NP_PS, LastAck_PrbResp, NP_PS) {} { + aie_ackRegionExclusiveInv; + dt_deallocateTBE; + wa_wakeUpAllDependents; + pt_popTriggerQueue; + } + + transition(P_S, LastAck_PrbResp, S) {} { + ai_ackRegionInv; + ad_ackDircetory; + dt_deallocateTBE; + wa_wakeUpAllDependents; + pt_popTriggerQueue; + } + +} + diff --git a/src/mem/protocol/MOESI_AMD_Base-RegionDir.sm b/src/mem/protocol/MOESI_AMD_Base-RegionDir.sm new file mode 100644 index 000000000..b392311c5 --- /dev/null +++ b/src/mem/protocol/MOESI_AMD_Base-RegionDir.sm @@ -0,0 +1,1187 @@ +/* + * Copyright (c) 2012-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Jason Power + */ + +machine(MachineType:RegionDir, "Region Directory for AMD_Base-like protocol") +: CacheMemory *cacheMemory; // stores only region addresses. Must set block size same as below + NodeID cpuRegionBufferNum; + NodeID gpuRegionBufferNum; + int blocksPerRegion := 64; // 4k regions + Cycles toDirLatency := 10; // Latency to fwd requests and send invs to directory + bool always_migrate := "False"; + bool sym_migrate := "False"; + bool asym_migrate := "False"; + bool noTCCdir := "False"; + int TCC_select_num_bits := 1; + + // To the directory + MessageBuffer * requestToDir, network="To", virtual_network="5", vnet_type="request"; + + // To the region buffers + MessageBuffer * notifyToRBuffer, network="To", virtual_network="7", vnet_type="request"; + MessageBuffer * probeToRBuffer, network="To", virtual_network="8", vnet_type="request"; + + // From the region buffers + MessageBuffer * responseFromRBuffer, network="From", virtual_network="2", vnet_type="response"; + MessageBuffer * requestFromRegBuf, network="From", virtual_network="0", vnet_type="request"; + + MessageBuffer * triggerQueue; +{ + + // States + state_declaration(State, desc="Region states", default="RegionDir_State_NP") { + NP, AccessPermission:Invalid, desc="Not present in region directory"; + P, AccessPermission:Invalid, desc="Region is private to owner"; + S, AccessPermission:Invalid, desc="Region is shared between CPU and GPU"; + + P_NP, AccessPermission:Invalid, desc="Evicting the region"; + NP_P, AccessPermission:Invalid, desc="Must wait for ack from R-buf"; + NP_S, AccessPermission:Invalid, desc="Must wait for ack from R-buf"; + P_P, AccessPermission:Invalid, desc="Waiting for ack from R-buf"; + S_S, AccessPermission:Invalid, desc="Waiting for ack from R-buf"; + P_S, AccessPermission:Invalid, desc="Downgrading the region"; + S_P, AccessPermission:Invalid, desc="Upgrading the region"; + P_AS, AccessPermission:Invalid, desc="Sent invalidates, waiting for acks"; + S_AP, AccessPermission:Invalid, desc="Sent invalidates, waiting for acks"; + P_AP, AccessPermission:Invalid, desc="Sent invalidates, waiting for acks"; + + SP_NP_W, AccessPermission:Invalid, desc="Last sharer writing back, waiting for ack"; + S_W, AccessPermission:Invalid, desc="Sharer writing back, waiting for ack"; + + P_AP_W, AccessPermission:Invalid, desc="Fwded request to dir, waiting for ack"; + P_AS_W, AccessPermission:Invalid, desc="Fwded request to dir, waiting for ack"; + S_AP_W, AccessPermission:Invalid, desc="Fwded request to dir, waiting for ack"; + } + + enumeration(Event, desc="Region directory events") { + SendInv, desc="Send inv message to any machine that has a region buffer"; + SendUpgrade, desc="Send upgrade message to any machine that has a region buffer"; + SendDowngrade, desc="Send downgrade message to any machine that has a region buffer"; + + Evict, desc="Evict this region"; + + UpgradeRequest, desc="Request from r-buf for an upgrade"; + SharedRequest, desc="Request from r-buf for 
read"; + PrivateRequest, desc="Request from r-buf for write"; + + InvAckCore, desc="Ack from region buffer to order the invalidate"; + InvAckCoreNoShare, desc="Ack from region buffer to order the invalidate, and it does not have the region"; + CPUPrivateAck, desc="Ack from region buffer to order private notification"; + + LastAck, desc="Done eviciting all the blocks"; + + StaleCleanWbRequest, desc="stale clean writeback reqeust"; + StaleCleanWbRequestNoShare, desc="stale clean wb req from a cache which should be removed from sharers"; + CleanWbRequest, desc="clean writeback reqeust, multiple sharers"; + CleanWbRequest_LastSharer, desc="clean writeback reqeust, last sharer"; + WritebackAck, desc="Writeback Ack from region buffer"; + DirReadyAck, desc="Directory is ready, waiting Ack from region buffer"; + + TriggerInv, desc="trigger invalidate message"; + TriggerDowngrade, desc="trigger downgrade message"; + } + + enumeration(RequestType, desc="To communicate stats from transitions to recordStats") { + DataArrayRead, desc="Read the data array"; + DataArrayWrite, desc="Write the data array"; + TagArrayRead, desc="Read the data array"; + TagArrayWrite, desc="Write the data array"; + } + + structure(BoolVec, external="yes") { + bool at(int); + void resize(int); + void clear(); + } + + structure(Entry, desc="Region entry", interface="AbstractCacheEntry") { + Addr addr, desc="Base address of this region"; + NetDest Sharers, desc="Set of machines that are sharing, but not owners"; + State RegionState, desc="Region state"; + DataBlock DataBlk, desc="Data for the block (always empty in region dir)"; + MachineID Owner, desc="Machine which owns all blocks in this region"; + Cycles ProbeStart, desc="Time when the first probe request was issued"; + bool LastWriten, default="false", desc="The last time someone accessed this region, it wrote it"; + bool LastWritenByCpu, default="false", desc="The last time the CPU accessed this region, it wrote it"; + bool LastWritenByGpu, default="false", desc="The last time the GPU accessed this region, it wrote it"; + } + + structure(TBE, desc="...") { + State TBEState, desc="Transient state"; + MachineID Owner, desc="Machine which owns all blocks in this region"; + NetDest Sharers, desc="Set of machines to send evicts"; + int NumValidBlocks, desc="Number of blocks valid so we don't have to count a BoolVec"; + bool AllAcksReceived, desc="Got all necessary acks from dir"; + CoherenceRequestType MsgType, desc="Msg type for the evicts could be inv or dwngrd"; + Cycles ProbeRequestTime, default="Cycles(0)", desc="Start of probe request"; + Cycles InitialRequestTime, default="Cycles(0)", desc="To forward back on out msg"; + Addr DemandAddress, desc="Demand address from original request"; + uint64_t probe_id, desc="probe id for lifetime profiling"; + } + + structure(TBETable, external="yes") { + TBE lookup(Addr); + void allocate(Addr); + void deallocate(Addr); + bool isPresent(Addr); + } + + // Stores only region addresses + TBETable TBEs, template="<RegionDir_TBE>", constructor="m_number_of_TBEs"; + int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()"; + + Tick clockEdge(); + Tick cyclesToTicks(Cycles c); + + void set_cache_entry(AbstractCacheEntry b); + void unset_cache_entry(); + void set_tbe(TBE b); + void unset_tbe(); + void wakeUpAllBuffers(); + void wakeUpBuffers(Addr a); + Cycles curCycle(); + + int blockBits, default="RubySystem::getBlockSizeBits()"; + int blockBytes, default="RubySystem::getBlockSizeBytes()"; + int regionBits, 
default="log2(m_blocksPerRegion)"; + + // Functions + + MachineID getCoreMachine(MachineID rBuf, Addr address) { + if (machineIDToNodeID(rBuf) == cpuRegionBufferNum) { + return createMachineID(MachineType:CorePair, intToID(0)); + } else if (machineIDToNodeID(rBuf) == gpuRegionBufferNum) { + if (noTCCdir) { + return mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits); + } else { + return createMachineID(MachineType:TCCdir, intToID(0)); + } + } else { + error("Unexpected region buffer number"); + } + } + + bool isCpuMachine(MachineID rBuf) { + if (machineIDToNodeID(rBuf) == cpuRegionBufferNum) { + return true; + } else if (machineIDToNodeID(rBuf) == gpuRegionBufferNum) { + return false; + } else { + error("Unexpected region buffer number"); + } + } + + bool symMigrate(Entry cache_entry) { + return cache_entry.LastWriten; + } + + bool asymMigrate(Entry cache_entry, MachineID requestor) { + if (isCpuMachine(requestor)) { + return cache_entry.LastWritenByCpu; + } else { + return cache_entry.LastWritenByGpu; + } + } + + int getRegionOffset(Addr addr) { + if (blocksPerRegion > 1) { + Addr offset := bitSelect(addr, blockBits, regionBits+blockBits-1); + int ret := addressToInt(offset); + assert(ret < blocksPerRegion); + return ret; + } else { + return 0; + } + } + + Addr getRegionBase(Addr addr) { + return maskLowOrderBits(addr, blockBits+regionBits); + } + + Addr getNextBlock(Addr addr) { + Addr a := addr; + makeNextStrideAddress(a, 1); + return a; + } + + bool presentOrAvail(Addr addr) { + DPRINTF(RubySlicc, "Present? %s, avail? %s\n", cacheMemory.isTagPresent(getRegionBase(addr)), cacheMemory.cacheAvail(getRegionBase(addr))); + return cacheMemory.isTagPresent(getRegionBase(addr)) || cacheMemory.cacheAvail(getRegionBase(addr)); + } + + // Returns a region entry! 
+ Entry getCacheEntry(Addr addr), return_by_pointer="yes" { + return static_cast(Entry, "pointer", cacheMemory.lookup(getRegionBase(addr))); + } + + TBE getTBE(Addr addr), return_by_pointer="yes" { + return TBEs.lookup(getRegionBase(addr)); + } + + DataBlock getDataBlock(Addr addr), return_by_ref="yes" { + return getCacheEntry(getRegionBase(addr)).DataBlk; + } + + State getState(TBE tbe, Entry cache_entry, Addr addr) { + if (is_valid(tbe)) { + return tbe.TBEState; + } else if (is_valid(cache_entry)) { + return cache_entry.RegionState; + } + return State:NP; + } + + void setState(TBE tbe, Entry cache_entry, Addr addr, State state) { + if (is_valid(tbe)) { + tbe.TBEState := state; + } + if (is_valid(cache_entry)) { + cache_entry.RegionState := state; + } + } + + AccessPermission getAccessPermission(Addr addr) { + TBE tbe := getTBE(addr); + if(is_valid(tbe)) { + return RegionDir_State_to_permission(tbe.TBEState); + } + Entry cache_entry := getCacheEntry(addr); + if(is_valid(cache_entry)) { + return RegionDir_State_to_permission(cache_entry.RegionState); + } + return AccessPermission:NotPresent; + } + + void setAccessPermission(Entry cache_entry, Addr addr, State state) { + if (is_valid(cache_entry)) { + cache_entry.changePermission(RegionDir_State_to_permission(state)); + } + } + + void functionalRead(Addr addr, Packet *pkt) { + functionalMemoryRead(pkt); + } + + int functionalWrite(Addr addr, Packet *pkt) { + if (functionalMemoryWrite(pkt)) { + return 1; + } else { + return 0; + } + } + + void recordRequestType(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + cacheMemory.recordRequestType(CacheRequestType:DataArrayRead, addr); + } else if (request_type == RequestType:DataArrayWrite) { + cacheMemory.recordRequestType(CacheRequestType:DataArrayWrite, addr); + } else if (request_type == RequestType:TagArrayRead) { + cacheMemory.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:TagArrayWrite) { + cacheMemory.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } + } + + bool checkResourceAvailable(RequestType request_type, Addr addr) { + if (request_type == RequestType:DataArrayRead) { + return cacheMemory.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:DataArrayWrite) { + return cacheMemory.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:TagArrayRead) { + return cacheMemory.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:TagArrayWrite) { + return cacheMemory.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else { + error("Invalid RequestType type in checkResourceAvailable"); + return true; + } + } + + out_port(triggerQueue_out, TriggerMsg, triggerQueue); + + out_port(requestNetwork_out, CPURequestMsg, requestToDir); + out_port(notifyNetwork_out, CPURequestMsg, notifyToRBuffer); + out_port(probeNetwork_out, NBProbeRequestMsg, probeToRBuffer); + + in_port(triggerQueue_in, TriggerMsg, triggerQueue, rank=2) { + if (triggerQueue_in.isReady(clockEdge())) { + peek(triggerQueue_in, TriggerMsg) { + assert(in_msg.addr == getRegionBase(in_msg.addr)); + Entry cache_entry := getCacheEntry(in_msg.addr); + TBE tbe := getTBE(in_msg.addr); + DPRINTF(RubySlicc, "trigger msg: %s (%s)\n", in_msg, getRegionBase(in_msg.addr)); + if (in_msg.Type == TriggerType:AcksComplete) { + assert(is_valid(tbe)); + trigger(Event:LastAck, in_msg.addr, cache_entry, tbe); + 
} else if (in_msg.Type == TriggerType:InvRegion) { + assert(is_valid(tbe)); + trigger(Event:TriggerInv, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == TriggerType:DowngradeRegion) { + assert(is_valid(tbe)); + trigger(Event:TriggerDowngrade, in_msg.addr, cache_entry, tbe); + } else { + error("Unknown trigger message"); + } + } + } + } + + in_port(responseNetwork_in, ResponseMsg, responseFromRBuffer, rank=1) { + if (responseNetwork_in.isReady(clockEdge())) { + peek(responseNetwork_in, ResponseMsg) { + TBE tbe := getTBE(in_msg.addr); + Entry cache_entry := getCacheEntry(in_msg.addr); + if (in_msg.Type == CoherenceResponseType:CPUPrbResp) { + assert(in_msg.addr == getRegionBase(in_msg.addr)); + assert(is_valid(tbe)); + if (in_msg.NotCached) { + trigger(Event:InvAckCoreNoShare, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:InvAckCore, in_msg.addr, cache_entry, tbe); + } + } else if (in_msg.Type == CoherenceResponseType:PrivateAck) { + assert(in_msg.addr == getRegionBase(in_msg.addr)); + assert(is_valid(cache_entry)); + //Fix Me...add back in: assert(cache_entry.Sharers.isElement(in_msg.Sender)); + trigger(Event:CPUPrivateAck, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceResponseType:RegionWbAck) { + //Fix Me...add back in: assert(cache_entry.Sharers.isElement(in_msg.Sender) == false); + assert(in_msg.addr == getRegionBase(in_msg.addr)); + trigger(Event:WritebackAck, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceResponseType:DirReadyAck) { + assert(is_valid(tbe)); + trigger(Event:DirReadyAck, getRegionBase(in_msg.addr), cache_entry, tbe); + } else { + error("Invalid response type"); + } + } + } + } + + // In from cores + // NOTE: We get the cache / TBE entry based on the region address, + // but pass the block address to the actions + in_port(requestNetwork_in, CPURequestMsg, requestFromRegBuf, rank=0) { + if (requestNetwork_in.isReady(clockEdge())) { + peek(requestNetwork_in, CPURequestMsg) { + //assert(in_msg.addr == getRegionBase(in_msg.addr)); + Addr address := getRegionBase(in_msg.addr); + DPRINTF(RubySlicc, "Got %s, base %s\n", in_msg.addr, address); + if (presentOrAvail(address)) { + TBE tbe := getTBE(address); + Entry cache_entry := getCacheEntry(address); + if (in_msg.Type == CoherenceRequestType:PrivateRequest) { + if (is_valid(cache_entry) && (cache_entry.Owner != in_msg.Requestor || + getState(tbe, cache_entry, address) == State:S)) { + trigger(Event:SendInv, address, cache_entry, tbe); + } else { + trigger(Event:PrivateRequest, address, cache_entry, tbe); + } + } else if (in_msg.Type == CoherenceRequestType:SharedRequest) { + if (is_invalid(cache_entry)) { + // If no one has ever requested this region give private permissions + trigger(Event:PrivateRequest, address, cache_entry, tbe); + } else { + if (always_migrate || + (sym_migrate && symMigrate(cache_entry)) || + (asym_migrate && asymMigrate(cache_entry, in_msg.Requestor))) { + if (cache_entry.Sharers.count() == 1 && + cache_entry.Sharers.isElement(in_msg.Requestor)) { + trigger(Event:UpgradeRequest, address, cache_entry, tbe); + } else { + trigger(Event:SendInv, address, cache_entry, tbe); + } + } else { // don't migrate + if(cache_entry.Sharers.isElement(in_msg.Requestor) || + getState(tbe, cache_entry, address) == State:S) { + trigger(Event:SharedRequest, address, cache_entry, tbe); + } else { + trigger(Event:SendDowngrade, address, cache_entry, tbe); + } + } + } + } else if (in_msg.Type == CoherenceRequestType:UpgradeRequest) { + if 
(is_invalid(cache_entry)) { + trigger(Event:PrivateRequest, address, cache_entry, tbe); + } else if (cache_entry.Sharers.count() == 1 && cache_entry.Sharers.isElement(in_msg.Requestor)) { + trigger(Event:UpgradeRequest, address, cache_entry, tbe); + } else { + trigger(Event:SendUpgrade, address, cache_entry, tbe); + } + } else if (in_msg.Type == CoherenceRequestType:CleanWbRequest) { + if (is_invalid(cache_entry) || cache_entry.Sharers.isElement(in_msg.Requestor) == false) { + trigger(Event:StaleCleanWbRequest, address, cache_entry, tbe); + } else { + DPRINTF(RubySlicc, "wb address %s(%s) owner %s sharers %s requestor %s %d %d\n", in_msg.addr, getRegionBase(in_msg.addr), cache_entry.Owner, cache_entry.Sharers, in_msg.Requestor, cache_entry.Sharers.isElement(in_msg.Requestor), cache_entry.Sharers.count()); + if (cache_entry.Sharers.isElement(in_msg.Requestor) && cache_entry.Sharers.count() == 1) { + DPRINTF(RubySlicc, "last wb\n"); + trigger(Event:CleanWbRequest_LastSharer, address, cache_entry, tbe); + } else { + DPRINTF(RubySlicc, "clean wb\n"); + trigger(Event:CleanWbRequest, address, cache_entry, tbe); + } + } + } else { + error("unknown region dir request type"); + } + } else { + Addr victim := cacheMemory.cacheProbe(getRegionBase(in_msg.addr)); + TBE victim_tbe := getTBE(victim); + Entry victim_entry := getCacheEntry(victim); + DPRINTF(RubySlicc, "Evicting address %s for new region at address %s(%s)\n", victim, in_msg.addr, getRegionBase(in_msg.addr)); + assert(is_valid(victim_entry)); + trigger(Event:Evict, victim, victim_entry, victim_tbe); + } + } + } + } + + // Actions + + action(f_fwdReqToDir, "f", desc="Forward CPU request to directory") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(requestNetwork_out, CPURequestMsg, toDirLatency) { + out_msg.addr := in_msg.addr; // This is the block address. "address" is the region address + out_msg.Type := in_msg.OriginalType; + out_msg.DataBlk := in_msg.DataBlk; + out_msg.Dirty := in_msg.Dirty; + out_msg.Requestor := getCoreMachine(in_msg.Requestor,address); + out_msg.WTRequestor := in_msg.WTRequestor; + out_msg.Destination.add(map_Address_to_Directory(in_msg.addr)); + out_msg.Shared := in_msg.Shared; + out_msg.MessageSize := in_msg.MessageSize; + out_msg.Private := in_msg.Private; + out_msg.NoAckNeeded := true; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ProbeRequestStartTime := curCycle(); + out_msg.DemandRequest := true; + if (is_valid(cache_entry) && getState(tbe, cache_entry, address) != State:S) { + out_msg.Acks := cache_entry.Sharers.count(); + } else { + out_msg.Acks := 0; + } + } + } + } + + action(f_fwdReqToDirShared, "fs", desc="Forward CPU request to directory (shared)") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(requestNetwork_out, CPURequestMsg, toDirLatency) { + out_msg.addr := in_msg.addr; // This is the block address. 
"address" is the region address + out_msg.Type := in_msg.OriginalType; + out_msg.DataBlk := in_msg.DataBlk; + out_msg.Dirty := in_msg.Dirty; + out_msg.Requestor := getCoreMachine(in_msg.Requestor,address); + out_msg.WTRequestor := in_msg.WTRequestor; + out_msg.Destination.add(map_Address_to_Directory(in_msg.addr)); + out_msg.Shared := in_msg.Shared; + out_msg.MessageSize := in_msg.MessageSize; + out_msg.Private := in_msg.Private; + out_msg.NoAckNeeded := true; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ProbeRequestStartTime := curCycle(); + out_msg.DemandRequest := true; + out_msg.ForceShared := true; + if (is_valid(cache_entry) && getState(tbe, cache_entry, address) != State:S) { + out_msg.Acks := cache_entry.Sharers.count(); + } else { + out_msg.Acks := 0; + } + } + } + } + + action(f_fwdReqToDirWithAck, "fa", desc="Forward CPU request to directory with ack request") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(requestNetwork_out, CPURequestMsg, toDirLatency) { + out_msg.addr := in_msg.addr; // This is the block address. "address" is the region address + out_msg.Type := in_msg.OriginalType; + out_msg.DataBlk := in_msg.DataBlk; + out_msg.Dirty := in_msg.Dirty; + out_msg.Requestor := getCoreMachine(in_msg.Requestor,address); + out_msg.WTRequestor := in_msg.WTRequestor; + out_msg.Destination.add(map_Address_to_Directory(in_msg.addr)); + out_msg.Shared := in_msg.Shared; + out_msg.MessageSize := in_msg.MessageSize; + out_msg.Private := in_msg.Private; + out_msg.NoAckNeeded := false; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ProbeRequestStartTime := curCycle(); + out_msg.DemandRequest := true; + if (is_valid(cache_entry)) { + out_msg.Acks := cache_entry.Sharers.count(); + // Don't need an ack from the requestor! + if (cache_entry.Sharers.isElement(in_msg.Requestor)) { + out_msg.Acks := out_msg.Acks - 1; + } + } else { + out_msg.Acks := 0; + } + } + } + } + + action(f_fwdReqToDirWithAckShared, "fas", desc="Forward CPU request to directory with ack request") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(requestNetwork_out, CPURequestMsg, toDirLatency) { + out_msg.addr := in_msg.addr; // This is the block address. "address" is the region address + out_msg.Type := in_msg.OriginalType; + out_msg.DataBlk := in_msg.DataBlk; + out_msg.Dirty := in_msg.Dirty; + out_msg.Requestor := getCoreMachine(in_msg.Requestor,address); + out_msg.WTRequestor := in_msg.WTRequestor; + out_msg.Destination.add(map_Address_to_Directory(in_msg.addr)); + out_msg.Shared := in_msg.Shared; + out_msg.MessageSize := in_msg.MessageSize; + out_msg.Private := in_msg.Private; + out_msg.NoAckNeeded := false; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ProbeRequestStartTime := curCycle(); + out_msg.DemandRequest := true; + out_msg.ForceShared := true; + if (is_valid(cache_entry)) { + out_msg.Acks := cache_entry.Sharers.count(); + // Don't need an ack from the requestor! 
+ if (cache_entry.Sharers.isElement(in_msg.Requestor)) { + out_msg.Acks := out_msg.Acks - 1; + } + } else { + out_msg.Acks := 0; + } + } + } + } + + action(a_allocateRegionEntry, "a", desc="Allocate a new entry") { + set_cache_entry(cacheMemory.allocate(getRegionBase(address), new Entry)); + peek(requestNetwork_in, CPURequestMsg) { + APPEND_TRANSITION_COMMENT(in_msg.Requestor); + } + } + + action(d_deallocateRegionEntry, "d", desc="Deallocate region entry") { + cacheMemory.deallocate(getRegionBase(address)); + unset_cache_entry(); + } + + action(ra_receiveAck, "ra", desc="Mark TBE entry as received this ack") { + //assert(tbe.ValidBlocks.at(getRegionOffset(address))); + DPRINTF(RubySlicc, "received ack for %s reg: %s\n", address, getRegionBase(address)); + tbe.NumValidBlocks := tbe.NumValidBlocks - 1; + assert(tbe.NumValidBlocks >= 0); + if (tbe.NumValidBlocks == 0) { + tbe.AllAcksReceived := true; + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.Type := TriggerType:AcksComplete; + out_msg.addr := address; + } + } + APPEND_TRANSITION_COMMENT(getRegionBase(address)); + APPEND_TRANSITION_COMMENT(" Acks left receive "); + APPEND_TRANSITION_COMMENT(tbe.NumValidBlocks); + } + + action(ca_checkAcks, "ca", desc="Check to see if we need more acks") { + if (tbe.NumValidBlocks == 0) { + tbe.AllAcksReceived := true; + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.Type := TriggerType:AcksComplete; + out_msg.addr := address; + } + } + } + + action(ti_triggerInv, "ti", desc="") { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.Type := TriggerType:InvRegion; + out_msg.addr := address; + } + } + + action(td_triggerDowngrade, "td", desc="") { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.Type := TriggerType:DowngradeRegion; + out_msg.addr := address; + } + } + + action(t_allocateTBE, "t", desc="allocate TBE Entry") { + check_allocate(TBEs); + TBEs.allocate(getRegionBase(address)); + set_tbe(getTBE(address)); + if (is_valid(cache_entry)) { + tbe.Owner := cache_entry.Owner; + tbe.Sharers := cache_entry.Sharers; + tbe.AllAcksReceived := true; // assume no acks are required + } + tbe.ProbeRequestTime := curCycle(); + peek(requestNetwork_in, CPURequestMsg) { + tbe.InitialRequestTime := in_msg.InitialRequestTime; + tbe.DemandAddress := in_msg.addr; + } + APPEND_TRANSITION_COMMENT(getRegionBase(address)); + APPEND_TRANSITION_COMMENT(" Acks left "); + APPEND_TRANSITION_COMMENT(tbe.NumValidBlocks); + APPEND_TRANSITION_COMMENT(" Owner, "); + APPEND_TRANSITION_COMMENT(tbe.Owner); + APPEND_TRANSITION_COMMENT(" sharers, "); + APPEND_TRANSITION_COMMENT(tbe.Sharers); + } + + action(ss_setSharers, "ss", desc="Add requestor to sharers") { + peek(requestNetwork_in, CPURequestMsg) { + cache_entry.Sharers.add(in_msg.Requestor); + APPEND_TRANSITION_COMMENT(cache_entry.Sharers); + } + } + + action(rs_removeSharer, "rs", desc="Remove requestor to sharers") { + peek(requestNetwork_in, CPURequestMsg) { + cache_entry.Sharers.remove(in_msg.Requestor); + APPEND_TRANSITION_COMMENT(" removing "); + APPEND_TRANSITION_COMMENT(in_msg.Requestor); + APPEND_TRANSITION_COMMENT(" sharers "); + APPEND_TRANSITION_COMMENT(cache_entry.Sharers); + } + } + + action(rsr_removeSharerResponse, "rsr", desc="Remove requestor to sharers") { + peek(responseNetwork_in, ResponseMsg) { + cache_entry.Sharers.remove(in_msg.Sender); + APPEND_TRANSITION_COMMENT(cache_entry.Sharers); + } + } + + action(cs_clearSharers, "cs", desc="Add requestor to sharers") { + cache_entry.Sharers.clear(); + } + + action(so_setOwner, "so", desc="Set 
the owner to the requestor") { + peek(requestNetwork_in, CPURequestMsg) { + cache_entry.Owner := in_msg.Requestor; + APPEND_TRANSITION_COMMENT(" Owner now: "); + APPEND_TRANSITION_COMMENT(cache_entry.Owner); + } + } + + action(rr_removeRequestorFromTBE, "rr", desc="Remove requestor from TBE sharers") { + peek(requestNetwork_in, CPURequestMsg) { + tbe.Sharers.remove(in_msg.Requestor); + } + } + + action(ur_updateDirtyStatusOnRequest, "ur", desc="Update dirty status on demand request") { + peek(requestNetwork_in, CPURequestMsg) { + if (is_valid(cache_entry)) { + if ((in_msg.Type == CoherenceRequestType:SharedRequest) && + (cache_entry.Sharers.isElement(in_msg.Requestor) == false)) { + cache_entry.LastWriten := false; + if (isCpuMachine(in_msg.Requestor)) { + cache_entry.LastWritenByCpu := false; + } else { + cache_entry.LastWritenByGpu := false; + } + } else if ((in_msg.Type == CoherenceRequestType:PrivateRequest) || + (in_msg.Type == CoherenceRequestType:UpgradeRequest)) { + cache_entry.LastWriten := true; + if (isCpuMachine(in_msg.Requestor)) { + cache_entry.LastWritenByCpu := true; + } else { + cache_entry.LastWritenByGpu := true; + } + } + } + } + } + + action(ud_updateDirtyStatusWithWb, "ud", desc="Update dirty status on writeback") { + peek(requestNetwork_in, CPURequestMsg) { + if (is_valid(cache_entry) && in_msg.Dirty) { + cache_entry.LastWriten := true; + if (isCpuMachine(in_msg.Requestor)) { + cache_entry.LastWritenByCpu := true; + } else { + cache_entry.LastWritenByGpu := true; + } + } + } + } + + action(sns_setNumAcksSharers, "sns", desc="Set number of acks to one per shared region buffer") { + assert(is_valid(tbe)); + assert(is_valid(cache_entry)); + tbe.NumValidBlocks := tbe.Sharers.count(); + } + + action(sno_setNumAcksOne, "sno", desc="Set number of acks to one per shared region buffer") { + assert(is_valid(tbe)); + assert(is_valid(cache_entry)); + tbe.NumValidBlocks := 1; + } + + action(dt_deallocateTBE, "dt", desc="deallocate TBE Entry") { + TBEs.deallocate(getRegionBase(address)); + APPEND_TRANSITION_COMMENT(" reg: "); + APPEND_TRANSITION_COMMENT(getRegionBase(address)); + unset_tbe(); + } + + action(wb_sendWbNotice, "wb", desc="Send notice to cache that writeback is acknowledged") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(notifyNetwork_out, CPURequestMsg, 1) { + out_msg.addr := getRegionBase(address); + out_msg.Type := CoherenceRequestType:WbNotify; + out_msg.Destination.add(in_msg.Requestor); + out_msg.Requestor := machineID; + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + } + } + } + + action(wbn_sendWbNoticeNoAck, "wbn", desc="Send notice to cache that writeback is acknowledged (no ack needed)") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(notifyNetwork_out, CPURequestMsg, 1) { + out_msg.addr := getRegionBase(address); + out_msg.Type := CoherenceRequestType:WbNotify; + out_msg.Destination.add(in_msg.Requestor); + out_msg.Requestor := machineID; + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.NoAckNeeded := true; + } + } + } + + action(b_sendPrivateNotice, "b", desc="Send notice to private cache that it has private access") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(notifyNetwork_out, CPURequestMsg, 1) { + out_msg.addr := getRegionBase(address); + out_msg.Type := CoherenceRequestType:PrivateNotify; + out_msg.Destination.add(in_msg.Requestor); + out_msg.Requestor := machineID; + 
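+ // This notice only grants region-level private permission; the block
+ // request itself was already forwarded to the directory by f_fwdReqToDir
+ // in the same transition.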
out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + } + } + } + + action(bs_sendSharedNotice, "bs", desc="Send notice to private cache that it has private access") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(notifyNetwork_out, CPURequestMsg, 1) { + out_msg.addr := getRegionBase(address); + out_msg.Type := CoherenceRequestType:SharedNotify; + out_msg.Destination.add(in_msg.Requestor); + out_msg.Requestor := machineID; + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + } + } + } + + action(c_sendSharedNoticeToOrigReq, "c", desc="Send notice to private cache that it has shared access") { + assert(is_valid(tbe)); + enqueue(notifyNetwork_out, CPURequestMsg, 1) { + out_msg.addr := getRegionBase(address); + out_msg.Type := CoherenceRequestType:SharedNotify; + out_msg.Destination.add(tbe.Owner); + out_msg.Requestor := machineID; + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.ProbeRequestStartTime := tbe.ProbeRequestTime; + out_msg.InitialRequestTime := tbe.InitialRequestTime; + APPEND_TRANSITION_COMMENT("dest: "); + APPEND_TRANSITION_COMMENT(out_msg.Destination); + } + } + + action(sp_sendPrivateNoticeToOrigReq, "sp", desc="Send notice to private cache that it has private access") { + assert(is_valid(tbe)); + enqueue(notifyNetwork_out, CPURequestMsg, 1) { + out_msg.addr := getRegionBase(address); + out_msg.Type := CoherenceRequestType:PrivateNotify; + out_msg.Destination.add(tbe.Owner); + out_msg.Requestor := machineID; + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.ProbeRequestStartTime := tbe.ProbeRequestTime; + out_msg.InitialRequestTime := tbe.InitialRequestTime; + APPEND_TRANSITION_COMMENT("dest: "); + APPEND_TRANSITION_COMMENT(out_msg.Destination); + } + } + + action(i_RegionInvNotify, "i", desc="Send notice to private cache that it no longer has private access") { + enqueue(probeNetwork_out, NBProbeRequestMsg, 1) { + out_msg.addr := address; + out_msg.DemandAddress := tbe.DemandAddress; + //out_msg.Requestor := tbe.Requestor; + out_msg.Requestor := machineID; + out_msg.Type := ProbeRequestType:PrbInv; + //Fix me: assert(tbe.Sharers.count() > 0); + out_msg.DemandRequest := true; + out_msg.Destination := tbe.Sharers; + out_msg.MessageSize := MessageSizeType:Request_Control; + APPEND_TRANSITION_COMMENT("dest: "); + APPEND_TRANSITION_COMMENT(out_msg.Destination); + } + } + + action(i0_RegionInvNotifyDemand0, "i0", desc="Send notice to private cache that it no longer has private access") { + enqueue(probeNetwork_out, NBProbeRequestMsg, 1) { + out_msg.addr := address; + // Demand address should default to 0 -> out_msg.DemandAddress := 0; + out_msg.Requestor := machineID; + out_msg.Type := ProbeRequestType:PrbInv; + out_msg.Destination := tbe.Sharers; + out_msg.MessageSize := MessageSizeType:Request_Control; + APPEND_TRANSITION_COMMENT("dest: "); + APPEND_TRANSITION_COMMENT(out_msg.Destination); + } + } + + action(rd_RegionDowngrade, "rd", desc="Send notice to private cache that it only has shared access") { + enqueue(probeNetwork_out, NBProbeRequestMsg, 1) { + out_msg.addr := address; + out_msg.DemandAddress := tbe.DemandAddress; + out_msg.Requestor := machineID; + out_msg.Type := ProbeRequestType:PrbDowngrade; + out_msg.DemandRequest := true; + out_msg.Destination := tbe.Sharers; + out_msg.MessageSize := MessageSizeType:Request_Control; + APPEND_TRANSITION_COMMENT("dest: "); + 
APPEND_TRANSITION_COMMENT(out_msg.Destination); + } + } + + action(p_popRequestQueue, "p", desc="Pop the request queue") { + requestNetwork_in.dequeue(clockEdge()); + } + + action(pt_popTriggerQueue, "pt", desc="Pop the trigger queue") { + triggerQueue_in.dequeue(clockEdge()); + } + + action(pr_popResponseQueue, "pr", desc="Pop the response queue") { + responseNetwork_in.dequeue(clockEdge()); + } + + action(s_stallAndWaitRequest, "s", desc="Stall and wait on the region address") { + Addr regAddr := getRegionBase(address); + stall_and_wait(requestNetwork_in, regAddr); + } + + action(w_wakeUpRegionDependents, "w", desc="Wake up any requests waiting for this region") { + wakeUpBuffers(getRegionBase(address)); + } + + action(wa_wakeUpAllDependents, "wa", desc="Wake up any requests waiting for this region") { + wakeUpAllBuffers(); + } + + action(zz_recycleRequestQueue, "\z", desc="...") { + requestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(z_stall, "z", desc="stall request queue") { + // fake state + } + + action(mru_setMRU, "mru", desc="set MRU") { + cacheMemory.setMRU(address); + } + + // Transistions + + transition({NP_P, P_P, NP_S, S_S, S_P, P_S, P_NP, S_AP, P_AS, P_AP, SP_NP_W, S_W, P_AP_W, P_AS_W, S_AP_W}, {PrivateRequest, SharedRequest, UpgradeRequest, SendInv, SendUpgrade, SendDowngrade, CleanWbRequest, CleanWbRequest_LastSharer, StaleCleanWbRequest}) { + s_stallAndWaitRequest + } + + transition({NP_P, P_P, NP_S, S_S, S_P, S_W, P_S, P_NP, S_AP, P_AS, P_AP, P_AP_W, P_AS_W, S_AP_W}, Evict) { + zz_recycleRequestQueue; + } + + transition(NP, {PrivateRequest, SendUpgrade}, NP_P) {TagArrayRead, TagArrayWrite} { + a_allocateRegionEntry; + ur_updateDirtyStatusOnRequest; + f_fwdReqToDir; + b_sendPrivateNotice; + so_setOwner; + ss_setSharers; + t_allocateTBE; + p_popRequestQueue; + } + + transition(P, {PrivateRequest, UpgradeRequest}, P_P) {TagArrayRead} { + mru_setMRU; + ur_updateDirtyStatusOnRequest; + f_fwdReqToDir; + b_sendPrivateNotice; + t_allocateTBE; + p_popRequestQueue; + } + + transition({NP_P, P_P}, CPUPrivateAck, P) { + dt_deallocateTBE; + w_wakeUpRegionDependents; + pr_popResponseQueue; + } + + transition({NP, P, S}, StaleCleanWbRequest) {TagArrayRead, TagArrayWrite} { + wbn_sendWbNoticeNoAck; + ud_updateDirtyStatusWithWb; + p_popRequestQueue; + } + + transition(NP, SharedRequest, NP_S) {TagArrayRead, TagArrayWrite} { + a_allocateRegionEntry; + ur_updateDirtyStatusOnRequest; + f_fwdReqToDirShared; + bs_sendSharedNotice; + so_setOwner; + ss_setSharers; + t_allocateTBE; + p_popRequestQueue; + } + + // Could probably do this in parallel with other shared requests + transition(S, SharedRequest, S_S) {TagArrayRead, TagArrayWrite} { + mru_setMRU; + ur_updateDirtyStatusOnRequest; + f_fwdReqToDirShared; + bs_sendSharedNotice; + ss_setSharers; + t_allocateTBE; + p_popRequestQueue; + } + + transition({P, S}, CleanWbRequest_LastSharer, SP_NP_W) {TagArrayRead, TagArrayWrite} { + ud_updateDirtyStatusWithWb; + wb_sendWbNotice; + rs_removeSharer; + t_allocateTBE; + d_deallocateRegionEntry; + p_popRequestQueue; + } + + transition(S, CleanWbRequest, S_W) {TagArrayRead, TagArrayWrite} { + ud_updateDirtyStatusWithWb; + wb_sendWbNotice; + rs_removeSharer; + t_allocateTBE; + p_popRequestQueue; + } + + transition(SP_NP_W, WritebackAck, NP) { + dt_deallocateTBE; + w_wakeUpRegionDependents; + pr_popResponseQueue; + } + + transition(S_W, WritebackAck, S) { + dt_deallocateTBE; + w_wakeUpRegionDependents; + pr_popResponseQueue; + } + + transition({NP_S, S_S}, CPUPrivateAck, 
S) { + dt_deallocateTBE; + w_wakeUpRegionDependents; + pr_popResponseQueue; + } + + transition(S, UpgradeRequest, S_P) {TagArrayRead, TagArrayWrite} { + mru_setMRU; + ur_updateDirtyStatusOnRequest; + f_fwdReqToDir; + b_sendPrivateNotice; + so_setOwner; + t_allocateTBE; + p_popRequestQueue; + } + + transition(S_P, CPUPrivateAck, P) { + dt_deallocateTBE; + w_wakeUpRegionDependents; + pr_popResponseQueue; + } + + transition(P, SendInv, P_AP_W) {TagArrayRead, TagArrayWrite} { + mru_setMRU; + ur_updateDirtyStatusOnRequest; + f_fwdReqToDirWithAck; + so_setOwner; + t_allocateTBE; + rr_removeRequestorFromTBE; + sns_setNumAcksSharers; + cs_clearSharers; + ss_setSharers; + //i_RegionInvNotify; + p_popRequestQueue; + } + + transition({P_AP_W, S_AP_W}, DirReadyAck) { + ti_triggerInv; + pr_popResponseQueue; + } + + transition(P_AS_W, DirReadyAck) { + td_triggerDowngrade; + pr_popResponseQueue; + } + + transition(P_AS_W, TriggerDowngrade, P_AS) { + rd_RegionDowngrade; + pt_popTriggerQueue; + } + + transition(P_AP_W, TriggerInv, P_AP) { + i_RegionInvNotify; + pt_popTriggerQueue; + } + + transition(S_AP_W, TriggerInv, S_AP) { + i_RegionInvNotify; + pt_popTriggerQueue; + } + + transition(P, SendUpgrade, P_AP_W) {TagArrayRead, TagArrayWrite} { + mru_setMRU; + ur_updateDirtyStatusOnRequest; + f_fwdReqToDirWithAck; + so_setOwner; + t_allocateTBE; + rr_removeRequestorFromTBE; + sns_setNumAcksSharers; + cs_clearSharers; + ss_setSharers; + p_popRequestQueue; + } + + transition(P, Evict, P_NP) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + sns_setNumAcksSharers; + i0_RegionInvNotifyDemand0; + d_deallocateRegionEntry; + } + + transition(S, SendInv, P_AP_W) {TagArrayRead, TagArrayWrite} { + mru_setMRU; + ur_updateDirtyStatusOnRequest; + f_fwdReqToDirWithAck; + so_setOwner; + t_allocateTBE; + rr_removeRequestorFromTBE; + sns_setNumAcksSharers; + cs_clearSharers; + ss_setSharers; + p_popRequestQueue; + } + + transition(S, Evict, P_NP) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + sns_setNumAcksSharers; + i0_RegionInvNotifyDemand0; + d_deallocateRegionEntry; + } + + transition(P_NP, LastAck, NP) { + dt_deallocateTBE; + wa_wakeUpAllDependents; + pt_popTriggerQueue; + } + + transition(S, SendUpgrade, S_AP_W) {TagArrayRead, TagArrayWrite} { + mru_setMRU; + ur_updateDirtyStatusOnRequest; + f_fwdReqToDirWithAck; + so_setOwner; + t_allocateTBE; + rr_removeRequestorFromTBE; + sns_setNumAcksSharers; + cs_clearSharers; + ss_setSharers; + p_popRequestQueue; + } + + transition(S_AP, LastAck, S_P) { + sp_sendPrivateNoticeToOrigReq; + pt_popTriggerQueue; + } + + transition(P_AP, LastAck, P_P) { + sp_sendPrivateNoticeToOrigReq; + pt_popTriggerQueue; + } + + transition(P, SendDowngrade, P_AS_W) {TagArrayRead, TagArrayWrite} { + mru_setMRU; + ur_updateDirtyStatusOnRequest; + f_fwdReqToDirWithAckShared; + so_setOwner; + t_allocateTBE; + sns_setNumAcksSharers; + ss_setSharers; //why do we set the sharers before sending the downgrade? Are we sending a downgrade to the requestor? 
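+ // Most likely because rd_RegionDowngrade later probes tbe.Sharers, which
+ // t_allocateTBE captured before this action runs, so the requestor added
+ // here is recorded as a sharer without receiving its own downgrade probe.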
+ p_popRequestQueue; + } + + transition(P_AS, LastAck, P_S) { + c_sendSharedNoticeToOrigReq; + pt_popTriggerQueue; + } + + transition(P_S, CPUPrivateAck, S) { + dt_deallocateTBE; + w_wakeUpRegionDependents; + pr_popResponseQueue; + } + + transition({P_NP, P_AS, S_AP, P_AP}, InvAckCore) {} { + ra_receiveAck; + pr_popResponseQueue; + } + + transition({P_NP, S_AP, P_AP}, InvAckCoreNoShare) {} { + ra_receiveAck; + pr_popResponseQueue; + } + + transition(P_AS, InvAckCoreNoShare) {} { + ra_receiveAck; + rsr_removeSharerResponse; + pr_popResponseQueue; + } + +} + + diff --git a/src/mem/protocol/MOESI_AMD_Base-dir.sm b/src/mem/protocol/MOESI_AMD_Base-dir.sm new file mode 100644 index 000000000..52cefda66 --- /dev/null +++ b/src/mem/protocol/MOESI_AMD_Base-dir.sm @@ -0,0 +1,1137 @@ +/* + * Copyright (c) 2010-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Lisa Hsu + */ + +machine(MachineType:Directory, "AMD Baseline protocol") +: DirectoryMemory * directory; + CacheMemory * L3CacheMemory; + Cycles response_latency := 5; + Cycles l3_hit_latency := 50; + bool noTCCdir := "False"; + bool CPUonly := "False"; + int TCC_select_num_bits; + bool useL3OnWT := "False"; + Cycles to_memory_controller_latency := 1; + + // From the Cores + MessageBuffer * requestFromCores, network="From", virtual_network="0", vnet_type="request"; + MessageBuffer * responseFromCores, network="From", virtual_network="2", vnet_type="response"; + MessageBuffer * unblockFromCores, network="From", virtual_network="4", vnet_type="unblock"; + + MessageBuffer * probeToCore, network="To", virtual_network="0", vnet_type="request"; + MessageBuffer * responseToCore, network="To", virtual_network="2", vnet_type="response"; + + MessageBuffer * triggerQueue; + MessageBuffer * L3triggerQueue; + MessageBuffer * responseFromMemory; +{ + // STATES + state_declaration(State, desc="Directory states", default="Directory_State_U") { + U, AccessPermission:Backing_Store, desc="unblocked"; + BL, AccessPermission:Busy, desc="got L3 WB request"; + // BL is Busy because it's possible for the data only to be in the network + // in the WB, L3 has sent it and gone on with its business in possibly I + // state. + BS_M, AccessPermission:Backing_Store, desc="blocked waiting for memory"; + BM_M, AccessPermission:Backing_Store, desc="blocked waiting for memory"; + B_M, AccessPermission:Backing_Store, desc="blocked waiting for memory"; + BP, AccessPermission:Backing_Store, desc="blocked waiting for probes, no need for memory"; + BS_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory"; + BM_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory"; + B_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory"; + BS_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + BM_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + B_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + B, AccessPermission:Backing_Store, desc="sent response, Blocked til ack"; + } + + // Events + enumeration(Event, desc="Directory events") { + // CPU requests + RdBlkS, desc="..."; + RdBlkM, desc="..."; + RdBlk, desc="..."; + CtoD, desc="..."; + WriteThrough, desc="WriteThrough Message"; + Atomic, desc="Atomic Message"; + + // writebacks + VicDirty, desc="..."; + VicClean, desc="..."; + CPUData, desc="WB data from CPU"; + StaleWB, desc="Notification that WB has been superceded by a probe"; + + // probe responses + CPUPrbResp, desc="Probe Response Msg"; + + ProbeAcksComplete, desc="Probe Acks Complete"; + + L3Hit, desc="Hit in L3 return data to core"; + + // Memory Controller + MemData, desc="Fetched data from memory arrives"; + WBAck, desc="Writeback Ack from memory arrives"; + + CoreUnblock, desc="Core received data, unblock"; + UnblockWriteThrough, desc="Unblock because of writethrough request finishing"; + + StaleVicDirty, desc="Core invalidated before VicDirty processed"; + } + + enumeration(RequestType, desc="To communicate stats from transitions to recordStats") { + L3DataArrayRead, desc="Read the data array"; + L3DataArrayWrite, desc="Write the data array"; + L3TagArrayRead, desc="Read the data array"; + L3TagArrayWrite, desc="Write the data array"; + } + + // TYPES + + // DirectoryEntry + structure(Entry, desc="...", 
interface="AbstractEntry") { + State DirectoryState, desc="Directory state"; + DataBlock DataBlk, desc="data for the block"; + NetDest VicDirtyIgnore, desc="VicDirty coming from whom to ignore"; + } + + structure(CacheEntry, desc="...", interface="AbstractCacheEntry") { + DataBlock DataBlk, desc="data for the block"; + MachineID LastSender, desc="Mach which this block came from"; + } + + structure(TBE, desc="...") { + State TBEState, desc="Transient state"; + DataBlock DataBlk, desc="data for the block"; + bool Dirty, desc="Is the data dirty?"; + int NumPendingAcks, desc="num acks expected"; + MachineID OriginalRequestor, desc="Original Requestor"; + MachineID WTRequestor, desc="WT Requestor"; + bool Cached, desc="data hit in Cache"; + bool MemData, desc="Got MemData?",default="false"; + bool wtData, desc="Got write through data?",default="false"; + bool atomicData, desc="Got Atomic op?",default="false"; + Cycles InitialRequestTime, desc="..."; + Cycles ForwardRequestTime, desc="..."; + Cycles ProbeRequestStartTime, desc="..."; + MachineID LastSender, desc="Mach which this block came from"; + bool L3Hit, default="false", desc="Was this an L3 hit?"; + uint64_t probe_id, desc="probe id for lifetime profiling"; + WriteMask writeMask, desc="outstanding write through mask"; + } + + structure(TBETable, external="yes") { + TBE lookup(Addr); + void allocate(Addr); + void deallocate(Addr); + bool isPresent(Addr); + } + + TBETable TBEs, template="<Directory_TBE>", constructor="m_number_of_TBEs"; + + int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()"; + + Tick clockEdge(); + Tick cyclesToTicks(Cycles c); + + void set_tbe(TBE a); + void unset_tbe(); + void wakeUpAllBuffers(); + void wakeUpBuffers(Addr a); + Cycles curCycle(); + + Entry getDirectoryEntry(Addr addr), return_by_pointer="yes" { + Entry dir_entry := static_cast(Entry, "pointer", directory.lookup(addr)); + + if (is_valid(dir_entry)) { + return dir_entry; + } + + dir_entry := static_cast(Entry, "pointer", + directory.allocate(addr, new Entry)); + return dir_entry; + } + + DataBlock getDataBlock(Addr addr), return_by_ref="yes" { + TBE tbe := TBEs.lookup(addr); + if (is_valid(tbe) && tbe.MemData) { + DPRINTF(RubySlicc, "Returning DataBlk from TBE %s:%s\n", addr, tbe); + return tbe.DataBlk; + } + DPRINTF(RubySlicc, "Returning DataBlk from Dir %s:%s\n", addr, getDirectoryEntry(addr)); + return getDirectoryEntry(addr).DataBlk; + } + + State getState(TBE tbe, CacheEntry entry, Addr addr) { + return getDirectoryEntry(addr).DirectoryState; + } + + void setState(TBE tbe, CacheEntry entry, Addr addr, State state) { + getDirectoryEntry(addr).DirectoryState := state; + } + + void functionalRead(Addr addr, Packet *pkt) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + testAndRead(addr, tbe.DataBlk, pkt); + } else { + functionalMemoryRead(pkt); + } + } + + int functionalWrite(Addr addr, Packet *pkt) { + int num_functional_writes := 0; + + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + num_functional_writes := num_functional_writes + + testAndWrite(addr, tbe.DataBlk, pkt); + } + + num_functional_writes := num_functional_writes + + functionalMemoryWrite(pkt); + return num_functional_writes; + } + + AccessPermission getAccessPermission(Addr addr) { + // For this Directory, all permissions are just tracked in Directory, since + // it's not possible to have something in TBE but not Dir, just keep track + // of state all in one place. 
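+ // Addresses the directory has never allocated report NotPresent; anything
+ // else maps directly from DirectoryState (mostly Backing_Store states,
+ // with BL as the lone Busy state).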
+ if (directory.isPresent(addr)) { + return Directory_State_to_permission(getDirectoryEntry(addr).DirectoryState); + } + + return AccessPermission:NotPresent; + } + + void setAccessPermission(CacheEntry entry, Addr addr, State state) { + getDirectoryEntry(addr).changePermission(Directory_State_to_permission(state)); + } + + void recordRequestType(RequestType request_type, Addr addr) { + if (request_type == RequestType:L3DataArrayRead) { + L3CacheMemory.recordRequestType(CacheRequestType:DataArrayRead, addr); + } else if (request_type == RequestType:L3DataArrayWrite) { + L3CacheMemory.recordRequestType(CacheRequestType:DataArrayWrite, addr); + } else if (request_type == RequestType:L3TagArrayRead) { + L3CacheMemory.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:L3TagArrayWrite) { + L3CacheMemory.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } + } + + bool checkResourceAvailable(RequestType request_type, Addr addr) { + if (request_type == RequestType:L3DataArrayRead) { + return L3CacheMemory.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L3DataArrayWrite) { + return L3CacheMemory.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L3TagArrayRead) { + return L3CacheMemory.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:L3TagArrayWrite) { + return L3CacheMemory.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else { + error("Invalid RequestType type in checkResourceAvailable"); + return true; + } + } + + // ** OUT_PORTS ** + out_port(probeNetwork_out, NBProbeRequestMsg, probeToCore); + out_port(responseNetwork_out, ResponseMsg, responseToCore); + + out_port(triggerQueue_out, TriggerMsg, triggerQueue); + out_port(L3TriggerQueue_out, TriggerMsg, L3triggerQueue); + + // ** IN_PORTS ** + + // Trigger Queue + in_port(triggerQueue_in, TriggerMsg, triggerQueue, rank=5) { + if (triggerQueue_in.isReady(clockEdge())) { + peek(triggerQueue_in, TriggerMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + if (in_msg.Type == TriggerType:AcksComplete) { + trigger(Event:ProbeAcksComplete, in_msg.addr, entry, tbe); + }else if (in_msg.Type == TriggerType:UnblockWriteThrough) { + trigger(Event:UnblockWriteThrough, in_msg.addr, entry, tbe); + } else { + error("Unknown trigger msg"); + } + } + } + } + + in_port(L3TriggerQueue_in, TriggerMsg, L3triggerQueue, rank=4) { + if (L3TriggerQueue_in.isReady(clockEdge())) { + peek(L3TriggerQueue_in, TriggerMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + if (in_msg.Type == TriggerType:L3Hit) { + trigger(Event:L3Hit, in_msg.addr, entry, tbe); + } else { + error("Unknown trigger msg"); + } + } + } + } + + // Unblock Network + in_port(unblockNetwork_in, UnblockMsg, unblockFromCores, rank=3) { + if (unblockNetwork_in.isReady(clockEdge())) { + peek(unblockNetwork_in, UnblockMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + trigger(Event:CoreUnblock, in_msg.addr, entry, tbe); + } + } + } + + // Core response network + in_port(responseNetwork_in, ResponseMsg, responseFromCores, rank=2) { + if (responseNetwork_in.isReady(clockEdge())) { + peek(responseNetwork_in, ResponseMsg) { + TBE tbe := 
TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + if (in_msg.Type == CoherenceResponseType:CPUPrbResp) { + trigger(Event:CPUPrbResp, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceResponseType:CPUData) { + trigger(Event:CPUData, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceResponseType:StaleNotif) { + trigger(Event:StaleWB, in_msg.addr, entry, tbe); + } else { + error("Unexpected response type"); + } + } + } + } + + // off-chip memory request/response is done + in_port(memQueue_in, MemoryMsg, responseFromMemory, rank=1) { + if (memQueue_in.isReady(clockEdge())) { + peek(memQueue_in, MemoryMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + if (in_msg.Type == MemoryRequestType:MEMORY_READ) { + trigger(Event:MemData, in_msg.addr, entry, tbe); + DPRINTF(RubySlicc, "%s\n", in_msg); + } else if (in_msg.Type == MemoryRequestType:MEMORY_WB) { + trigger(Event:WBAck, in_msg.addr, entry, tbe); // ignore WBAcks, don't care about them. + } else { + DPRINTF(RubySlicc, "%s\n", in_msg.Type); + error("Invalid message"); + } + } + } + } + + in_port(requestNetwork_in, CPURequestMsg, requestFromCores, rank=0) { + if (requestNetwork_in.isReady(clockEdge())) { + peek(requestNetwork_in, CPURequestMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + if (in_msg.Type == CoherenceRequestType:RdBlk) { + trigger(Event:RdBlk, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:RdBlkS) { + trigger(Event:RdBlkS, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:RdBlkM) { + trigger(Event:RdBlkM, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:WriteThrough) { + trigger(Event:WriteThrough, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:Atomic) { + trigger(Event:Atomic, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:VicDirty) { + if (getDirectoryEntry(in_msg.addr).VicDirtyIgnore.isElement(in_msg.Requestor)) { + DPRINTF(RubySlicc, "Dropping VicDirty for address %s\n", in_msg.addr); + trigger(Event:StaleVicDirty, in_msg.addr, entry, tbe); + } else { + DPRINTF(RubySlicc, "Got VicDirty from %s on %s\n", in_msg.Requestor, in_msg.addr); + trigger(Event:VicDirty, in_msg.addr, entry, tbe); + } + } else if (in_msg.Type == CoherenceRequestType:VicClean) { + if (getDirectoryEntry(in_msg.addr).VicDirtyIgnore.isElement(in_msg.Requestor)) { + DPRINTF(RubySlicc, "Dropping VicClean for address %s\n", in_msg.addr); + trigger(Event:StaleVicDirty, in_msg.addr, entry, tbe); + } else { + DPRINTF(RubySlicc, "Got VicClean from %s on %s\n", in_msg.Requestor, in_msg.addr); + trigger(Event:VicClean, in_msg.addr, entry, tbe); + } + } else { + error("Bad request message type"); + } + } + } + } + + // Actions + action(s_sendResponseS, "s", desc="send Shared response") { + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysResp; + if (tbe.L3Hit) { + out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0)); + } else { + out_msg.Sender := machineID; + } + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.DataBlk := tbe.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := false; + out_msg.State := CoherenceState:Shared; + 
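+ // The timestamps below are presumably carried back so the requestor can
+ // attribute latency (initial request vs. forward vs. probe start), while
+ // OriginalResponder and L3Hit record where the data actually came from.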
out_msg.InitialRequestTime := tbe.InitialRequestTime; + out_msg.ForwardRequestTime := tbe.ForwardRequestTime; + out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime; + out_msg.OriginalResponder := tbe.LastSender; + out_msg.L3Hit := tbe.L3Hit; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(es_sendResponseES, "es", desc="send Exclusive or Shared response") { + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysResp; + if (tbe.L3Hit) { + out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0)); + } else { + out_msg.Sender := machineID; + } + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.DataBlk := tbe.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := tbe.Dirty; + if (tbe.Cached) { + out_msg.State := CoherenceState:Shared; + } else { + out_msg.State := CoherenceState:Exclusive; + } + out_msg.InitialRequestTime := tbe.InitialRequestTime; + out_msg.ForwardRequestTime := tbe.ForwardRequestTime; + out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime; + out_msg.OriginalResponder := tbe.LastSender; + out_msg.L3Hit := tbe.L3Hit; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(m_sendResponseM, "m", desc="send Modified response") { + if (tbe.wtData) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:UnblockWriteThrough; + } + }else{ + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysResp; + if (tbe.L3Hit) { + out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0)); + } else { + out_msg.Sender := machineID; + } + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.DataBlk := tbe.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := tbe.Dirty; + out_msg.State := CoherenceState:Modified; + out_msg.CtoD := false; + out_msg.InitialRequestTime := tbe.InitialRequestTime; + out_msg.ForwardRequestTime := tbe.ForwardRequestTime; + out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime; + out_msg.OriginalResponder := tbe.LastSender; + if(tbe.atomicData){ + out_msg.WTRequestor := tbe.WTRequestor; + } + out_msg.L3Hit := tbe.L3Hit; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + if (tbe.atomicData) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:UnblockWriteThrough; + } + } + } + } + + action(c_sendResponseCtoD, "c", desc="send CtoD Ack") { + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysResp; + out_msg.Sender := machineID; + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.MessageSize := MessageSizeType:Response_Control; + out_msg.Dirty := false; + out_msg.State := CoherenceState:Modified; + out_msg.CtoD := true; + out_msg.InitialRequestTime := tbe.InitialRequestTime; + out_msg.ForwardRequestTime := curCycle(); + out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(w_sendResponseWBAck, "w", desc="send WB Ack") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, 1) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysWBAck; + out_msg.Destination.add(in_msg.Requestor); + out_msg.WTRequestor := in_msg.WTRequestor; + out_msg.Sender := machineID; + out_msg.MessageSize := 
MessageSizeType:Writeback_Control; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := curCycle(); + out_msg.ProbeRequestStartTime := curCycle(); + } + } + } + + action(l_queueMemWBReq, "lq", desc="Write WB data to memory") { + peek(responseNetwork_in, ResponseMsg) { + queueMemoryWrite(machineID, address, to_memory_controller_latency, + in_msg.DataBlk); + } + } + + action(l_queueMemRdReq, "lr", desc="Read data from memory") { + peek(requestNetwork_in, CPURequestMsg) { + if (L3CacheMemory.isTagPresent(address)) { + enqueue(L3TriggerQueue_out, TriggerMsg, l3_hit_latency) { + out_msg.addr := address; + out_msg.Type := TriggerType:L3Hit; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address)); + if (tbe.Dirty == false) { + tbe.DataBlk := entry.DataBlk; + } + tbe.LastSender := entry.LastSender; + tbe.L3Hit := true; + tbe.MemData := true; + L3CacheMemory.deallocate(address); + } else { + queueMemoryRead(machineID, address, to_memory_controller_latency); + } + } + } + + action(dc_probeInvCoreData, "dc", desc="probe inv cores, return data") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbInv; + out_msg.ReturnData := true; + out_msg.MessageSize := MessageSizeType:Control; + out_msg.Destination.broadcast(MachineType:CorePair); // won't be realistic for multisocket + + // add relevant TCC node to list. This replaces all TCPs and SQCs + if (((in_msg.Type == CoherenceRequestType:WriteThrough || + in_msg.Type == CoherenceRequestType:Atomic) && + in_msg.NoWriteConflict) || + CPUonly) { + } else if (noTCCdir) { + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + } else { + out_msg.Destination.add(mapAddressToRange(address, + MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + } + out_msg.Destination.remove(in_msg.Requestor); + tbe.NumPendingAcks := out_msg.Destination.count(); + if (tbe.NumPendingAcks == 0) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:AcksComplete; + } + } + DPRINTF(RubySlicc, "%s\n", out_msg); + APPEND_TRANSITION_COMMENT(" dc: Acks remaining: "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + tbe.ProbeRequestStartTime := curCycle(); + } + } + } + + action(sc_probeShrCoreData, "sc", desc="probe shared cores, return data") { + peek(requestNetwork_in, CPURequestMsg) { // not the right network? + enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbDowngrade; + out_msg.ReturnData := true; + out_msg.MessageSize := MessageSizeType:Control; + out_msg.Destination.broadcast(MachineType:CorePair); // won't be realistic for multisocket + // add relevant TCC node to the list. 
This replaces all TCPs and SQCs + if (noTCCdir || CPUonly) { + //Don't need to notify TCC about reads + } else { + out_msg.Destination.add(mapAddressToRange(address, + MachineType:TCCdir, + TCC_select_low_bit, TCC_select_num_bits)); + tbe.NumPendingAcks := tbe.NumPendingAcks + 1; + } + if (noTCCdir && !CPUonly) { + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + } + out_msg.Destination.remove(in_msg.Requestor); + tbe.NumPendingAcks := out_msg.Destination.count(); + if (tbe.NumPendingAcks == 0) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:AcksComplete; + } + } + DPRINTF(RubySlicc, "%s\n", (out_msg)); + APPEND_TRANSITION_COMMENT(" sc: Acks remaining: "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + tbe.ProbeRequestStartTime := curCycle(); + } + } + } + + action(ic_probeInvCore, "ic", desc="probe invalidate core, no return data needed") { + peek(requestNetwork_in, CPURequestMsg) { // not the right network? + enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbInv; + out_msg.ReturnData := false; + out_msg.MessageSize := MessageSizeType:Control; + out_msg.Destination.broadcast(MachineType:CorePair); // won't be realistic for multisocket + + // add relevant TCC node to the list. This replaces all TCPs and SQCs + if (noTCCdir && !CPUonly) { + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + } else { + if (!noTCCdir) { + out_msg.Destination.add(mapAddressToRange(address, + MachineType:TCCdir, + TCC_select_low_bit, + TCC_select_num_bits)); + } + } + out_msg.Destination.remove(in_msg.Requestor); + tbe.NumPendingAcks := out_msg.Destination.count(); + if (tbe.NumPendingAcks == 0) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:AcksComplete; + } + } + APPEND_TRANSITION_COMMENT(" ic: Acks remaining: "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + DPRINTF(RubySlicc, "%s\n", out_msg); + tbe.ProbeRequestStartTime := curCycle(); + } + } + } + + action(d_writeDataToMemory, "d", desc="Write data to memory") { + peek(responseNetwork_in, ResponseMsg) { + getDirectoryEntry(address).DataBlk := in_msg.DataBlk; + if (tbe.Dirty == false) { + // have to update the TBE, too, because of how this + // directory deals with functional writes + tbe.DataBlk := in_msg.DataBlk; + } + } + } + + action(t_allocateTBE, "t", desc="allocate TBE Entry") { + check_allocate(TBEs); + peek(requestNetwork_in, CPURequestMsg) { + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + if (in_msg.Type == CoherenceRequestType:WriteThrough) { + tbe.writeMask.clear(); + tbe.writeMask.orMask(in_msg.writeMask); + tbe.wtData := true; + tbe.WTRequestor := in_msg.WTRequestor; + tbe.LastSender := in_msg.Requestor; + } + if (in_msg.Type == CoherenceRequestType:Atomic) { + tbe.writeMask.clear(); + tbe.writeMask.orMask(in_msg.writeMask); + tbe.atomicData := true; + tbe.WTRequestor := in_msg.WTRequestor; + tbe.LastSender := in_msg.Requestor; + } + tbe.DataBlk := getDirectoryEntry(address).DataBlk; // Data only for WBs + tbe.Dirty := false; + if (in_msg.Type == CoherenceRequestType:WriteThrough) { + tbe.DataBlk.copyPartial(in_msg.DataBlk,in_msg.writeMask); + tbe.Dirty := true; + } + tbe.OriginalRequestor := in_msg.Requestor; + tbe.NumPendingAcks := 0; + tbe.Cached := in_msg.ForceShared; + tbe.InitialRequestTime := 
in_msg.InitialRequestTime; + } + } + + action(dt_deallocateTBE, "dt", desc="deallocate TBE Entry") { + if (tbe.Dirty == false) { + getDirectoryEntry(address).DataBlk := tbe.DataBlk; + } + TBEs.deallocate(address); + unset_tbe(); + } + + action(wd_writeBackData, "wd", desc="Write back data if needed") { + if (tbe.wtData) { + getDirectoryEntry(address).DataBlk.copyPartial(tbe.DataBlk, tbe.writeMask); + } else if (tbe.atomicData) { + tbe.DataBlk.atomicPartial(getDirectoryEntry(address).DataBlk,tbe.writeMask); + getDirectoryEntry(address).DataBlk := tbe.DataBlk; + } else if (tbe.Dirty == false) { + getDirectoryEntry(address).DataBlk := tbe.DataBlk; + } + } + + action(mt_writeMemDataToTBE, "mt", desc="write Mem data to TBE") { + peek(memQueue_in, MemoryMsg) { + if (tbe.wtData == true) { + // do nothing + } else if (tbe.Dirty == false) { + tbe.DataBlk := getDirectoryEntry(address).DataBlk; + } + tbe.MemData := true; + } + } + + action(y_writeProbeDataToTBE, "y", desc="write Probe Data to TBE") { + peek(responseNetwork_in, ResponseMsg) { + if (in_msg.Dirty) { + if (tbe.wtData) { + DataBlock tmp := in_msg.DataBlk; + tmp.copyPartial(tbe.DataBlk,tbe.writeMask); + tbe.DataBlk := tmp; + tbe.writeMask.fillMask(); + } else if (tbe.Dirty) { + if(tbe.atomicData == false && tbe.wtData == false) { + DPRINTF(RubySlicc, "Got double data for %s from %s\n", address, in_msg.Sender); + assert(tbe.DataBlk == in_msg.DataBlk); // in case of double data + } + } else { + tbe.DataBlk := in_msg.DataBlk; + tbe.Dirty := in_msg.Dirty; + tbe.LastSender := in_msg.Sender; + } + } + if (in_msg.Hit) { + tbe.Cached := true; + } + } + } + + action(mwc_markSinkWriteCancel, "mwc", desc="Mark to sink impending VicDirty") { + peek(responseNetwork_in, ResponseMsg) { + getDirectoryEntry(address).VicDirtyIgnore.add(in_msg.Sender); + APPEND_TRANSITION_COMMENT(" setting bit to sink VicDirty "); + } + } + + action(x_decrementAcks, "x", desc="decrement Acks pending") { + tbe.NumPendingAcks := tbe.NumPendingAcks - 1; + APPEND_TRANSITION_COMMENT(" Acks remaining: "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + } + + action(o_checkForCompletion, "o", desc="check for ack completion") { + if (tbe.NumPendingAcks == 0) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:AcksComplete; + } + } + APPEND_TRANSITION_COMMENT(" Check: Acks remaining: "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + } + + action(rv_removeVicDirtyIgnore, "rv", desc="Remove ignored core") { + peek(requestNetwork_in, CPURequestMsg) { + getDirectoryEntry(address).VicDirtyIgnore.remove(in_msg.Requestor); + } + } + + action(al_allocateL3Block, "al", desc="allocate the L3 block on WB") { + peek(responseNetwork_in, ResponseMsg) { + if (L3CacheMemory.isTagPresent(address)) { + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address)); + APPEND_TRANSITION_COMMENT(" al wrote data to L3 (hit) "); + entry.DataBlk := in_msg.DataBlk; + entry.LastSender := in_msg.Sender; + } else { + if (L3CacheMemory.cacheAvail(address) == false) { + Addr victim := L3CacheMemory.cacheProbe(address); + CacheEntry victim_entry := static_cast(CacheEntry, "pointer", + L3CacheMemory.lookup(victim)); + queueMemoryWrite(machineID, victim, to_memory_controller_latency, + victim_entry.DataBlk); + L3CacheMemory.deallocate(victim); + } + assert(L3CacheMemory.cacheAvail(address)); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.allocate(address, new CacheEntry)); + APPEND_TRANSITION_COMMENT(" al wrote 
data to L3 "); + entry.DataBlk := in_msg.DataBlk; + + entry.LastSender := in_msg.Sender; + } + } + } + + action(alwt_allocateL3BlockOnWT, "alwt", desc="allocate the L3 block on WT") { + if ((tbe.wtData || tbe.atomicData) && useL3OnWT) { + if (L3CacheMemory.isTagPresent(address)) { + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address)); + APPEND_TRANSITION_COMMENT(" al wrote data to L3 (hit) "); + entry.DataBlk := tbe.DataBlk; + entry.LastSender := tbe.LastSender; + } else { + if (L3CacheMemory.cacheAvail(address) == false) { + Addr victim := L3CacheMemory.cacheProbe(address); + CacheEntry victim_entry := static_cast(CacheEntry, "pointer", + L3CacheMemory.lookup(victim)); + queueMemoryWrite(machineID, victim, to_memory_controller_latency, + victim_entry.DataBlk); + L3CacheMemory.deallocate(victim); + } + assert(L3CacheMemory.cacheAvail(address)); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.allocate(address, new CacheEntry)); + APPEND_TRANSITION_COMMENT(" al wrote data to L3 "); + entry.DataBlk := tbe.DataBlk; + entry.LastSender := tbe.LastSender; + } + } + } + + action(sf_setForwardReqTime, "sf", desc="...") { + tbe.ForwardRequestTime := curCycle(); + } + + action(dl_deallocateL3, "dl", desc="deallocate the L3 block") { + L3CacheMemory.deallocate(address); + } + + action(p_popRequestQueue, "p", desc="pop request queue") { + requestNetwork_in.dequeue(clockEdge()); + } + + action(pr_popResponseQueue, "pr", desc="pop response queue") { + responseNetwork_in.dequeue(clockEdge()); + } + + action(pm_popMemQueue, "pm", desc="pop mem queue") { + memQueue_in.dequeue(clockEdge()); + } + + action(pt_popTriggerQueue, "pt", desc="pop trigger queue") { + triggerQueue_in.dequeue(clockEdge()); + } + + action(ptl_popTriggerQueue, "ptl", desc="pop L3 trigger queue") { + L3TriggerQueue_in.dequeue(clockEdge()); + } + + action(pu_popUnblockQueue, "pu", desc="pop unblock queue") { + unblockNetwork_in.dequeue(clockEdge()); + } + + action(zz_recycleRequestQueue, "zz", desc="recycle request queue") { + requestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(yy_recycleResponseQueue, "yy", desc="recycle response queue") { + responseNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(st_stallAndWaitRequest, "st", desc="Stall and wait on the address") { + stall_and_wait(requestNetwork_in, address); + } + + action(wa_wakeUpDependents, "wa", desc="Wake up any requests waiting for this address") { + wakeUpBuffers(address); + } + + action(wa_wakeUpAllDependents, "waa", desc="Wake up any requests waiting for this region") { + wakeUpAllBuffers(); + } + + action(z_stall, "z", desc="...") { + } + + // TRANSITIONS + transition({BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B}, {RdBlkS, RdBlkM, RdBlk, CtoD}) { + st_stallAndWaitRequest; + } + + // It may be possible to save multiple invalidations here! 
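+ // For now, atomics and write-throughs that find the block in any transient
+ // state are simply stalled like other requests and replayed via
+ // wakeUpBuffers once the outstanding transaction finishes.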
+ transition({BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B}, {Atomic, WriteThrough}) { + st_stallAndWaitRequest; + } + + + // transitions from U + transition(U, {RdBlkS}, BS_PM) {L3TagArrayRead} { + t_allocateTBE; + l_queueMemRdReq; + sc_probeShrCoreData; + p_popRequestQueue; + } + + transition(U, WriteThrough, BM_PM) {L3TagArrayRead, L3TagArrayWrite} { + t_allocateTBE; + w_sendResponseWBAck; + l_queueMemRdReq; + dc_probeInvCoreData; + p_popRequestQueue; + } + + transition(U, Atomic, BM_PM) {L3TagArrayRead, L3TagArrayWrite} { + t_allocateTBE; + l_queueMemRdReq; + dc_probeInvCoreData; + p_popRequestQueue; + } + + transition(U, {RdBlkM}, BM_PM) {L3TagArrayRead} { + t_allocateTBE; + l_queueMemRdReq; + dc_probeInvCoreData; + p_popRequestQueue; + } + + transition(U, RdBlk, B_PM) {L3TagArrayRead}{ + t_allocateTBE; + l_queueMemRdReq; + sc_probeShrCoreData; + p_popRequestQueue; + } + + transition(U, CtoD, BP) {L3TagArrayRead} { + t_allocateTBE; + ic_probeInvCore; + p_popRequestQueue; + } + + transition(U, VicDirty, BL) {L3TagArrayRead} { + t_allocateTBE; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(U, VicClean, BL) {L3TagArrayRead} { + t_allocateTBE; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(BL, {VicDirty, VicClean}) { + zz_recycleRequestQueue; + } + + transition(BL, CPUData, U) {L3TagArrayWrite, L3DataArrayWrite} { + d_writeDataToMemory; + al_allocateL3Block; + wa_wakeUpDependents; + dt_deallocateTBE; + pr_popResponseQueue; + } + + transition(BL, StaleWB, U) {L3TagArrayWrite} { + dt_deallocateTBE; + wa_wakeUpAllDependents; + pr_popResponseQueue; + } + + transition({B, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm}, {VicDirty, VicClean}) { + z_stall; + } + + transition({U, BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B}, WBAck) { + pm_popMemQueue; + } + + transition({U, BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B}, StaleVicDirty) { + rv_removeVicDirtyIgnore; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition({B}, CoreUnblock, U) { + wa_wakeUpDependents; + pu_popUnblockQueue; + } + + transition(B, UnblockWriteThrough, U) { + wa_wakeUpDependents; + pt_popTriggerQueue; + } + + transition(BS_PM, MemData, BS_Pm) {} { + mt_writeMemDataToTBE; + pm_popMemQueue; + } + + transition(BM_PM, MemData, BM_Pm){} { + mt_writeMemDataToTBE; + pm_popMemQueue; + } + + transition(B_PM, MemData, B_Pm){} { + mt_writeMemDataToTBE; + pm_popMemQueue; + } + + transition(BS_PM, L3Hit, BS_Pm) {} { + ptl_popTriggerQueue; + } + + transition(BM_PM, L3Hit, BM_Pm) {} { + ptl_popTriggerQueue; + } + + transition(B_PM, L3Hit, B_Pm) {} { + ptl_popTriggerQueue; + } + + transition(BS_M, MemData, B){L3TagArrayWrite, L3DataArrayWrite} { + mt_writeMemDataToTBE; + s_sendResponseS; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pm_popMemQueue; + } + + transition(BM_M, MemData, B){L3TagArrayWrite, L3DataArrayWrite} { + mt_writeMemDataToTBE; + m_sendResponseM; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pm_popMemQueue; + } + + transition(B_M, MemData, B){L3TagArrayWrite, L3DataArrayWrite} { + mt_writeMemDataToTBE; + es_sendResponseES; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pm_popMemQueue; + } + + transition(BS_M, L3Hit, B) {L3TagArrayWrite, L3DataArrayWrite} { + s_sendResponseS; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + ptl_popTriggerQueue; + } + + transition(BM_M, L3Hit, B) {L3DataArrayWrite, L3TagArrayWrite} 
{ + m_sendResponseM; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + ptl_popTriggerQueue; + } + + transition(B_M, L3Hit, B) {L3DataArrayWrite, L3TagArrayWrite} { + es_sendResponseES; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + ptl_popTriggerQueue; + } + + transition({BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, BP}, CPUPrbResp) { + y_writeProbeDataToTBE; + x_decrementAcks; + o_checkForCompletion; + pr_popResponseQueue; + } + + transition(BS_PM, ProbeAcksComplete, BS_M) {} { + sf_setForwardReqTime; + pt_popTriggerQueue; + } + + transition(BM_PM, ProbeAcksComplete, BM_M) {} { + sf_setForwardReqTime; + pt_popTriggerQueue; + } + + transition(B_PM, ProbeAcksComplete, B_M){} { + sf_setForwardReqTime; + pt_popTriggerQueue; + } + + transition(BS_Pm, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} { + sf_setForwardReqTime; + s_sendResponseS; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pt_popTriggerQueue; + } + + transition(BM_Pm, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} { + sf_setForwardReqTime; + m_sendResponseM; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pt_popTriggerQueue; + } + + transition(B_Pm, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} { + sf_setForwardReqTime; + es_sendResponseES; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pt_popTriggerQueue; + } + + transition(BP, ProbeAcksComplete, B){L3TagArrayWrite, L3TagArrayWrite} { + sf_setForwardReqTime; + c_sendResponseCtoD; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pt_popTriggerQueue; + } +} diff --git a/src/mem/protocol/MOESI_AMD_Base-msg.sm b/src/mem/protocol/MOESI_AMD_Base-msg.sm new file mode 100644 index 000000000..ff8842369 --- /dev/null +++ b/src/mem/protocol/MOESI_AMD_Base-msg.sm @@ -0,0 +1,362 @@ +/* + * Copyright (c) 2010-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Lisa Hsu + */ + + +enumeration(CoherenceRequestType, desc="Coherence Request Types") { + // CPU Request Types ONLY + RdBlk, desc="Read Blk"; + RdBlkM, desc="Read Blk Modified"; + RdBlkS, desc="Read Blk Shared"; + CtoD, desc="Change To Dirty"; + VicClean, desc="L2 clean eviction"; + VicDirty, desc="L2 dirty eviction"; + Atomic, desc="Upper level atomic"; + AtomicWriteBack, desc="Upper level atomic"; + WriteThrough, desc="Ordered WriteThrough w/Data"; + WriteThroughFifo, desc="WriteThrough with no data"; + WriteThroughDummy, desc="WriteThrough with no data for atomic operation"; + WriteFlush, desc="Release Flush"; + + WrCancel, desc="want to cancel WB to Memory"; // should this be here? + + WBApproval, desc="WB Approval"; + + // Messages between Dir and R-Dir + ForceInv, desc="Send invalide to the block"; + ForceDowngrade, desc="Send downgrade to the block"; + Unblock, desc="Used to let the dir know a message has been sunk"; + + // Messages between R-Dir and R-Buffer + PrivateNotify, desc="Let region buffer know it has private access"; + SharedNotify, desc="Let region buffer know it has shared access"; + WbNotify, desc="Let region buffer know it saw its wb request"; + Downgrade, desc="Force the region buffer to downgrade to shared"; + // Response to R-Dir (probably should be on a different network, but + // I need it to be ordered with respect to requests) + InvAck, desc="Let the R-Dir know when the inv has occured"; + + PrivateRequest, desc="R-buf wants the region in private"; + UpgradeRequest, desc="R-buf wants the region in private"; + SharedRequest, desc="R-buf wants the region in shared (could respond with private)"; + CleanWbRequest, desc="R-buf wants to deallocate clean region"; + + NA, desc="So we don't get segfaults"; +} + +enumeration(ProbeRequestType, desc="Probe Request Types") { + PrbDowngrade, desc="Probe for Status"; // EtoS, MtoO, StoS + PrbInv, desc="Probe to Invalidate"; + + // For regions + PrbRepl, desc="Force the cache to do a replacement"; + PrbRegDowngrade, desc="Probe for Status"; // EtoS, MtoO, StoS + PrbAtomic, desc="Forwarded Atomic Operation"; +} + + +enumeration(CoherenceResponseType, desc="Coherence Response Types") { + NBSysResp, desc="Northbridge response to CPU Rd request"; + NBSysWBAck, desc="Northbridge response ok to WB"; + TDSysResp, desc="TCCdirectory response to CPU Rd request"; + TDSysWBAck, desc="TCCdirectory response ok to WB"; + TDSysWBNack, desc="TCCdirectory response ok to drop"; + CPUPrbResp, desc="CPU Probe Response"; + CPUData, desc="CPU Data"; + StaleNotif, desc="Notification of Stale WBAck, No data to writeback"; + CPUCancelWB, desc="want to cancel WB to Memory"; + MemData, desc="Data from Memory"; + + // for regions + PrivateAck, desc="Ack that r-buf received private notify"; + RegionWbAck, desc="Writeback Ack that r-buf completed deallocation"; + DirReadyAck, desc="Directory (mem ctrl)<->region dir handshake"; +} + +enumeration(CoherenceState, default="CoherenceState_NA", desc="Coherence State") { + Modified, desc="Modified"; + Owned, desc="Owned state"; + Exclusive, desc="Exclusive"; + Shared, desc="Shared"; + NA, desc="NA"; +} + +structure(CPURequestMsg, desc="...", interface="Message") { + Addr addr, desc="Physical address for this request"; + Addr DemandAddress, desc="Physical block address for this request"; + CoherenceRequestType Type, desc="Type of request"; + DataBlock DataBlk, desc="data for the cache line"; // only for WB + bool Dirty, desc="whether WB data is dirty"; // only for WB + MachineID Requestor, 
desc="Node who initiated the request"; + NetDest Destination, desc="Multicast destination mask"; + bool Shared, desc="For CPU_WrVicBlk, vic is O not M. For CPU_ClVicBlk, vic is S"; + MessageSizeType MessageSize, desc="size category of the message"; + Cycles InitialRequestTime, desc="time the initial requests was sent from the L1Cache"; + Cycles ForwardRequestTime, desc="time the dir forwarded the request"; + Cycles ProbeRequestStartTime, desc="the time the dir started the probe request"; + bool DemandRequest, default="false", desc="For profiling purposes"; + + NetDest Sharers, desc="Caches that may have a valid copy of the data"; + bool ForceShared, desc="R-dir knows it is shared, pass on so it sends an S copy, not E"; + bool Private, default="false", desc="Requestor already has private permissions, no need for dir check"; + bool CtoDSinked, default="false", desc="This is true if the CtoD previously sent must have been sunk"; + + bool NoAckNeeded, default="false", desc="True if region buffer doesn't need to ack"; + int Acks, default="0", desc="Acks that the dir (mem ctrl) should expect to receive"; + CoherenceRequestType OriginalType, default="CoherenceRequestType_NA", desc="Type of request from core fwded through region buffer"; + WriteMask writeMask, desc="Write Through Data"; + MachineID WTRequestor, desc="Node who initiated the write through"; + HSAScope scope, default="HSAScope_SYSTEM", desc="Request Scope"; + int wfid, default="0", desc="wavefront id"; + bool NoWriteConflict, default="true", desc="write collided with CAB entry"; + int ProgramCounter, desc="PC that accesses to this block"; + + bool functionalRead(Packet *pkt) { + // Only PUTX messages contains the data block + if (Type == CoherenceRequestType:VicDirty) { + return testAndRead(addr, DataBlk, pkt); + } + + return false; + } + + bool functionalWrite(Packet *pkt) { + // No check on message type required since the protocol should + // read data from those messages that contain the block + return testAndWrite(addr, DataBlk, pkt); + } +} + +structure(NBProbeRequestMsg, desc="...", interface="Message") { + Addr addr, desc="Physical address for this request"; + ProbeRequestType Type, desc="NB_PrbNxtState signal"; + bool ReturnData, desc="Indicates CPU should return data"; + NetDest Destination, desc="Node to whom the data is sent"; + MessageSizeType MessageSize, desc="size category of the message"; + bool DemandRequest, default="false", desc="demand request, requesting 3-hop transfer"; + Addr DemandAddress, desc="Demand block address for a region request"; + MachineID Requestor, desc="Requestor id for 3-hop requests"; + bool NoAckNeeded, default="false", desc="For short circuting acks"; + int ProgramCounter, desc="PC that accesses to this block"; + + bool functionalRead(Packet *pkt) { + return false; + } + + bool functionalWrite(Packet *pkt) { + // No check on message type required since the protocol should + // read data from those messages that contain the block + return false; + } + +} + +structure(TDProbeRequestMsg, desc="...", interface="Message") { + Addr addr, desc="Physical address for this request"; + ProbeRequestType Type, desc="TD_PrbNxtState signal"; + bool ReturnData, desc="Indicates CPU should return data"; + bool localCtoD, desc="Indicates CtoD is within the GPU hierarchy (aka TCC subtree)"; + NetDest Destination, desc="Node to whom the data is sent"; + MessageSizeType MessageSize, desc="size category of the message"; + int Phase, desc="Synchronization Phase"; + int wfid, desc="wavefront id for Release"; + 
MachineID Requestor, desc="Node who initiated the request"; + + bool functionalRead(Packet *pkt) { + return false; + } + + bool functionalWrite(Packet *pkt) { + // No check on message type required since the protocol should + // read data from those messages that contain the block + return false; + } +} + +// Response Messages seemed to be easily munged into one type +structure(ResponseMsg, desc="...", interface="Message") { + Addr addr, desc="Physical address for this request"; + CoherenceResponseType Type, desc="NB Sys Resp or CPU Response to Probe"; + MachineID Sender, desc="Node who sent the data"; + NetDest Destination, desc="Node to whom the data is sent"; + // Begin Used Only By CPU Response + DataBlock DataBlk, desc="data for the cache line"; + bool Hit, desc="probe hit valid line"; + bool Shared, desc="True if S, or if NB Probe ReturnData==1 && O"; + bool Dirty, desc="Is the data dirty (different than memory)?"; + bool Ntsl, desc="indicates probed lin will be invalid after probe"; + bool UntransferredOwner, desc="pending confirmation of ownership change"; + // End Used Only By CPU Response + + // Begin NB Response Only + CoherenceState State, default=CoherenceState_NA, desc="What returned data from NB should be in"; + bool CtoD, desc="was the originator a CtoD?"; + // End NB Response Only + + // Normally if a block gets hit by a probe while waiting to be written back, + // you flip the NbReqShared signal (part of the CPURequest signal group). + // But since this is in packets and I don't want to send a separate packet, + // let's just send this signal back with the data instead + bool NbReqShared, desc="modification of Shared field from initial request, e.g. hit by shared probe"; + + MessageSizeType MessageSize, desc="size category of the message"; + Cycles InitialRequestTime, desc="time the initial requests was sent from the L1Cache"; + Cycles ForwardRequestTime, desc="time the dir forwarded the request"; + Cycles ProbeRequestStartTime, desc="the time the dir started the probe request"; + bool DemandRequest, default="false", desc="For profiling purposes"; + + bool L3Hit, default="false", desc="Did memory or L3 supply the data?"; + MachineID OriginalResponder, desc="Mach which wrote the data to the L3"; + MachineID WTRequestor, desc="Node who started the writethrough"; + + bool NotCached, default="false", desc="True when the Region buffer has already evicted the line"; + + bool NoAckNeeded, default="false", desc="For short circuting acks"; + bool isValid, default="false", desc="Is acked block valid"; + int wfid, default="0", desc="wavefront id"; + int Phase, desc="Synchronization Phase"; + + int ProgramCounter, desc="PC that issues this request"; + bool mispred, desc="tell TCP if the block should not be bypassed"; + + + bool functionalRead(Packet *pkt) { + // Only PUTX messages contains the data block + if (Type == CoherenceResponseType:CPUData || + Type == CoherenceResponseType:MemData) { + return testAndRead(addr, DataBlk, pkt); + } + + return false; + } + + bool functionalWrite(Packet *pkt) { + // No check on message type required since the protocol should + // read data from those messages that contain the block + return testAndWrite(addr, DataBlk, pkt); + } +} + +structure(UnblockMsg, desc="...", interface="Message") { + Addr addr, desc="Physical address for this request"; + NetDest Destination, desc="Destination (always directory)"; + MessageSizeType MessageSize, desc="size category of the message"; + MachineID Sender, desc="Node who sent the data"; + bool currentOwner, 
default="false", desc="Is the sender the current owner"; + bool DoneAck, default="false", desc="Is this a done ack?"; + bool Dirty, default="false", desc="Was block dirty when evicted"; + bool wasValid, default="false", desc="Was block valid when evicted"; + bool valid, default="false", desc="Is block valid"; + bool validToInvalid, default="false", desc="Was block valid when evicted"; + + bool functionalRead(Packet *pkt) { + return false; + } + + bool functionalWrite(Packet *pkt) { + // No check on message type required since the protocol should + // read data from those messages that contain the block + return false; + } +} + +enumeration(TriggerType, desc="Trigger Type") { + L2_to_L1, desc="L2 to L1 fill"; + AcksComplete, desc="NB received all needed Acks"; + + // For regions + InvNext, desc="Invalidate the next block"; + PrivateAck, desc="Loopback ack for machines with no Region Buffer"; + AllOutstanding, desc="All outstanding requests have finished"; + L3Hit, desc="L3 hit in dir"; + + // For region directory once the directory is blocked + InvRegion, desc="Invalidate region"; + DowngradeRegion, desc="downgrade region"; + //For writethrough + UnblockWriteThrough, desc="unblock"; + WriteData, desc="Write to full cacheblock data"; + WriteDone, desc="Sequencer says that write is done"; + AtomicDone, desc="Atomic is done"; +} + +enumeration(CacheId, desc="Which Cache in the Core") { + L1I, desc="L1 I-cache"; + L1D0, desc="L1 D-cache cluster 0"; + L1D1, desc="L1 D-cache cluster 1"; + NA, desc="Default"; +} + +structure(TriggerMsg, desc="...", interface="Message") { + Addr addr, desc="Address"; + TriggerType Type, desc="Type of trigger"; + CacheId Dest, default="CacheId_NA", desc="Cache to invalidate"; + int ProgramCounter, desc="PC that accesses to this block"; + + bool functionalRead(Packet *pkt) { + return false; + } + + bool functionalWrite(Packet *pkt) { + // No check on message type required since the protocol should + // read data from those messages that contain the block + return false; + } + +} + +enumeration(FifoType, desc="Fifo Type") { + WriteDummy, desc="Dummy Write for atomic operation"; + WriteThrough, desc="simple writethrough request"; + WriteFlush, desc="synchronization message"; +} + +structure(FifoMsg, desc="...", interface="Message") { + Addr addr, desc="Address"; + FifoType Type, desc="WriteThrough/WriteFlush"; + int wfid, default="0",desc="wavefront id"; + MachineID Requestor, desc="Flush Requestor"; + MachineID oRequestor, desc="original Flush Requestor"; + + bool functionalRead(Packet *pkt) { + return false; + } + + bool functionalWrite(Packet *pkt) { + // No check on message type required since the protocol should + // read data from those messages that contain the block + return false; + } + +} diff --git a/src/mem/protocol/MOESI_AMD_Base-probeFilter.sm b/src/mem/protocol/MOESI_AMD_Base-probeFilter.sm new file mode 100644 index 000000000..f545c2fa7 --- /dev/null +++ b/src/mem/protocol/MOESI_AMD_Base-probeFilter.sm @@ -0,0 +1,1408 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Lisa Hsu, Sooraj Puthoor + */ + +/* + * This file is based on MOESI_AMD_Base.sm + * Differences with AMD base protocol + * -- Uses a probe filter memory to track sharers. + * -- The probe filter can be inclusive or non-inclusive + * -- Only two sharers tracked. Sharers are a) GPU or/and b) CPU + * -- If sharer information available, the sharer is probed + * -- If sharer information not available, probes are broadcasted + */ + +machine(MachineType:Directory, "AMD Baseline protocol") +: DirectoryMemory * directory; + CacheMemory * L3CacheMemory; + CacheMemory * ProbeFilterMemory; + Cycles response_latency := 5; + Cycles l3_hit_latency := 50; + bool noTCCdir := "False"; + bool CAB_TCC := "False"; + int TCC_select_num_bits:=1; + bool useL3OnWT := "False"; + bool inclusiveDir := "True"; + Cycles to_memory_controller_latency := 1; + + // From the Cores + MessageBuffer * requestFromCores, network="From", virtual_network="0", ordered="false", vnet_type="request"; + MessageBuffer * responseFromCores, network="From", virtual_network="2", ordered="false", vnet_type="response"; + MessageBuffer * unblockFromCores, network="From", virtual_network="4", ordered="false", vnet_type="unblock"; + + MessageBuffer * probeToCore, network="To", virtual_network="0", ordered="false", vnet_type="request"; + MessageBuffer * responseToCore, network="To", virtual_network="2", ordered="false", vnet_type="response"; + + MessageBuffer * triggerQueue, ordered="true"; + MessageBuffer * L3triggerQueue, ordered="true"; + MessageBuffer * responseFromMemory; +{ + // STATES + state_declaration(State, desc="Directory states", default="Directory_State_U") { + U, AccessPermission:Backing_Store, desc="unblocked"; + BL, AccessPermission:Busy, desc="got L3 WB request"; + // BL is Busy because it is busy waiting for the data + // which is possibly in the network. 
The cache which evicted the data + // might have moved to some other state after doing the eviction + // BS==> Received a read request; has not requested ownership + // B==> Received a read request; has requested ownership + // BM==> Received a modification request + B_P, AccessPermission:Backing_Store, desc="Back invalidation, waiting for probes"; + BS_M, AccessPermission:Backing_Store, desc="blocked waiting for memory"; + BM_M, AccessPermission:Backing_Store, desc="blocked waiting for memory"; + B_M, AccessPermission:Backing_Store, desc="blocked waiting for memory"; + BP, AccessPermission:Backing_Store, desc="blocked waiting for probes, no need for memory"; + BS_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory"; + BM_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory"; + B_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory"; + BS_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + BM_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + B_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + B, AccessPermission:Backing_Store, desc="sent response, Blocked til ack"; + } + + // Events + enumeration(Event, desc="Directory events") { + // CPU requests + RdBlkS, desc="..."; + RdBlkM, desc="..."; + RdBlk, desc="..."; + CtoD, desc="..."; + WriteThrough, desc="WriteThrough Message"; + Atomic, desc="Atomic Message"; + + // writebacks + VicDirty, desc="..."; + VicClean, desc="..."; + CPUData, desc="WB data from CPU"; + StaleWB, desc="Notification that WB has been superceded by a probe"; + + // probe responses + CPUPrbResp, desc="Probe Response Msg"; + + ProbeAcksComplete, desc="Probe Acks Complete"; + + L3Hit, desc="Hit in L3 return data to core"; + + // Replacement + PF_Repl, desc="Replace address from probe filter"; + + // Memory Controller + MemData, desc="Fetched data from memory arrives"; + WBAck, desc="Writeback Ack from memory arrives"; + + CoreUnblock, desc="Core received data, unblock"; + UnblockWriteThrough, desc="Unblock because of writethrough request finishing"; + + StaleVicDirty, desc="Core invalidated before VicDirty processed"; + } + + enumeration(RequestType, desc="To communicate stats from transitions to recordStats") { + L3DataArrayRead, desc="Read the data array"; + L3DataArrayWrite, desc="Write the data array"; + L3TagArrayRead, desc="Read the data array"; + L3TagArrayWrite, desc="Write the data array"; + + PFTagArrayRead, desc="Read the data array"; + PFTagArrayWrite, desc="Write the data array"; + } + + // TYPES + + enumeration(ProbeFilterState, desc="") { + T, desc="Tracked"; + NT, desc="Not tracked"; + B, desc="Blocked, This entry is being replaced"; + } + + // DirectoryEntry + structure(Entry, desc="...", interface="AbstractEntry") { + State DirectoryState, desc="Directory state"; + DataBlock DataBlk, desc="data for the block"; + NetDest VicDirtyIgnore, desc="VicDirty coming from whom to ignore"; + } + + structure(CacheEntry, desc="...", interface="AbstractCacheEntry") { + DataBlock DataBlk, desc="data for the block"; + MachineID LastSender, desc="Mach which this block came from"; + ProbeFilterState pfState, desc="ProbeFilter state",default="Directory_ProbeFilterState_NT"; + bool isOnCPU, desc="Block valid in the CPU complex",default="false"; + bool isOnGPU, desc="Block valid in the GPU complex",default="false"; + } + + structure(TBE, desc="...") { + State TBEState, 
desc="Transient state"; + DataBlock DataBlk, desc="data for the block"; + bool Dirty, desc="Is the data dirty?"; + int NumPendingAcks, desc="num acks expected"; + MachineID OriginalRequestor, desc="Original Requestor"; + MachineID WTRequestor, desc="WT Requestor"; + bool Cached, desc="data hit in Cache"; + bool MemData, desc="Got MemData?",default="false"; + bool wtData, desc="Got write through data?",default="false"; + bool atomicData, desc="Got Atomic op?",default="false"; + Cycles InitialRequestTime, desc="..."; + Cycles ForwardRequestTime, desc="..."; + Cycles ProbeRequestStartTime, desc="..."; + MachineID LastSender, desc="Mach which this block came from"; + bool L3Hit, default="false", desc="Was this an L3 hit?"; + uint64_t probe_id, desc="probe id for lifetime profiling"; + WriteMask writeMask, desc="outstanding write through mask"; + Addr demandAddress, desc="Address of demand request which caused probe filter eviction"; + } + + structure(TBETable, external="yes") { + TBE lookup(Addr); + void allocate(Addr); + void deallocate(Addr); + bool isPresent(Addr); + } + + TBETable TBEs, template="<Directory_TBE>", constructor="m_number_of_TBEs"; + + int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()"; + + Tick clockEdge(); + Tick cyclesToTicks(Cycles c); + + void set_tbe(TBE a); + void unset_tbe(); + void wakeUpAllBuffers(); + void wakeUpBuffers(Addr a); + Cycles curCycle(); + + Entry getDirectoryEntry(Addr addr), return_by_pointer="yes" { + Entry dir_entry := static_cast(Entry, "pointer", directory.lookup(addr)); + + if (is_valid(dir_entry)) { + //DPRINTF(RubySlicc, "Getting entry %s: %s\n", addr, dir_entry.DataBlk); + return dir_entry; + } + + dir_entry := static_cast(Entry, "pointer", + directory.allocate(addr, new Entry)); + return dir_entry; + } + + DataBlock getDataBlock(Addr addr), return_by_ref="yes" { + TBE tbe := TBEs.lookup(addr); + if (is_valid(tbe) && tbe.MemData) { + DPRINTF(RubySlicc, "Returning DataBlk from TBE %s:%s\n", addr, tbe); + return tbe.DataBlk; + } + DPRINTF(RubySlicc, "Returning DataBlk from Dir %s:%s\n", addr, getDirectoryEntry(addr)); + return getDirectoryEntry(addr).DataBlk; + } + + State getState(TBE tbe, CacheEntry entry, Addr addr) { + CacheEntry probeFilterEntry := static_cast(CacheEntry, "pointer", ProbeFilterMemory.lookup(addr)); + if (inclusiveDir) { + if (is_valid(probeFilterEntry) && probeFilterEntry.pfState == ProbeFilterState:B) { + return State:B_P; + } + } + return getDirectoryEntry(addr).DirectoryState; + } + + void setState(TBE tbe, CacheEntry entry, Addr addr, State state) { + getDirectoryEntry(addr).DirectoryState := state; + } + + void functionalRead(Addr addr, Packet *pkt) { + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + testAndRead(addr, tbe.DataBlk, pkt); + } else { + functionalMemoryRead(pkt); + } + } + + int functionalWrite(Addr addr, Packet *pkt) { + int num_functional_writes := 0; + + TBE tbe := TBEs.lookup(addr); + if(is_valid(tbe)) { + num_functional_writes := num_functional_writes + + testAndWrite(addr, tbe.DataBlk, pkt); + } + + num_functional_writes := num_functional_writes + + functionalMemoryWrite(pkt); + return num_functional_writes; + } + + AccessPermission getAccessPermission(Addr addr) { + // For this Directory, all permissions are just tracked in Directory, since + // it's not possible to have something in TBE but not Dir, just keep track + // of state all in one place. 
+ if (directory.isPresent(addr)) { + return Directory_State_to_permission(getDirectoryEntry(addr).DirectoryState); + } + + return AccessPermission:NotPresent; + } + + void setAccessPermission(CacheEntry entry, Addr addr, State state) { + getDirectoryEntry(addr).changePermission(Directory_State_to_permission(state)); + } + + void recordRequestType(RequestType request_type, Addr addr) { + if (request_type == RequestType:L3DataArrayRead) { + L3CacheMemory.recordRequestType(CacheRequestType:DataArrayRead, addr); + } else if (request_type == RequestType:L3DataArrayWrite) { + L3CacheMemory.recordRequestType(CacheRequestType:DataArrayWrite, addr); + } else if (request_type == RequestType:L3TagArrayRead) { + L3CacheMemory.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:L3TagArrayWrite) { + L3CacheMemory.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } else if (request_type == RequestType:PFTagArrayRead) { + ProbeFilterMemory.recordRequestType(CacheRequestType:TagArrayRead, addr); + } else if (request_type == RequestType:PFTagArrayWrite) { + ProbeFilterMemory.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } + } + + bool checkResourceAvailable(RequestType request_type, Addr addr) { + if (request_type == RequestType:L3DataArrayRead) { + return L3CacheMemory.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L3DataArrayWrite) { + return L3CacheMemory.checkResourceAvailable(CacheResourceType:DataArray, addr); + } else if (request_type == RequestType:L3TagArrayRead) { + return L3CacheMemory.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:L3TagArrayWrite) { + return L3CacheMemory.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:PFTagArrayRead) { + return ProbeFilterMemory.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else if (request_type == RequestType:PFTagArrayWrite) { + return ProbeFilterMemory.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else { + error("Invalid RequestType type in checkResourceAvailable"); + return true; + } + } + + bool isNotPresentProbeFilter(Addr address) { + if (ProbeFilterMemory.isTagPresent(address) || + ProbeFilterMemory.cacheAvail(address)) { + return false; + } + return true; + } + + bool isGPUSharer(Addr address) { + assert(ProbeFilterMemory.isTagPresent(address)); + CacheEntry entry := static_cast(CacheEntry, "pointer", ProbeFilterMemory.lookup(address)); + if (entry.pfState == ProbeFilterState:NT) { + return true; + } else if (entry.isOnGPU){ + return true; + } + return false; + } + + bool isCPUSharer(Addr address) { + assert(ProbeFilterMemory.isTagPresent(address)); + CacheEntry entry := static_cast(CacheEntry, "pointer", ProbeFilterMemory.lookup(address)); + if (entry.pfState == ProbeFilterState:NT) { + return true; + } else if (entry.isOnCPU){ + return true; + } + return false; + } + + + // ** OUT_PORTS ** + out_port(probeNetwork_out, NBProbeRequestMsg, probeToCore); + out_port(responseNetwork_out, ResponseMsg, responseToCore); + + out_port(triggerQueue_out, TriggerMsg, triggerQueue); + out_port(L3TriggerQueue_out, TriggerMsg, L3triggerQueue); + + // ** IN_PORTS ** + + // Trigger Queue + in_port(triggerQueue_in, TriggerMsg, triggerQueue, rank=5) { + if (triggerQueue_in.isReady(clockEdge())) { + peek(triggerQueue_in, TriggerMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, 
"pointer", L3CacheMemory.lookup(in_msg.addr)); + if (in_msg.Type == TriggerType:AcksComplete) { + trigger(Event:ProbeAcksComplete, in_msg.addr, entry, tbe); + }else if (in_msg.Type == TriggerType:UnblockWriteThrough) { + trigger(Event:UnblockWriteThrough, in_msg.addr, entry, tbe); + } else { + error("Unknown trigger msg"); + } + } + } + } + + in_port(L3TriggerQueue_in, TriggerMsg, L3triggerQueue, rank=4) { + if (L3TriggerQueue_in.isReady(clockEdge())) { + peek(L3TriggerQueue_in, TriggerMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + if (in_msg.Type == TriggerType:L3Hit) { + trigger(Event:L3Hit, in_msg.addr, entry, tbe); + } else { + error("Unknown trigger msg"); + } + } + } + } + + // Unblock Network + in_port(unblockNetwork_in, UnblockMsg, unblockFromCores, rank=3) { + if (unblockNetwork_in.isReady(clockEdge())) { + peek(unblockNetwork_in, UnblockMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + trigger(Event:CoreUnblock, in_msg.addr, entry, tbe); + } + } + } + + // Core response network + in_port(responseNetwork_in, ResponseMsg, responseFromCores, rank=2) { + if (responseNetwork_in.isReady(clockEdge())) { + peek(responseNetwork_in, ResponseMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + if (in_msg.Type == CoherenceResponseType:CPUPrbResp) { + trigger(Event:CPUPrbResp, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceResponseType:CPUData) { + trigger(Event:CPUData, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceResponseType:StaleNotif) { + trigger(Event:StaleWB, in_msg.addr, entry, tbe); + } else { + error("Unexpected response type"); + } + } + } + } + + // off-chip memory request/response is done + in_port(memQueue_in, MemoryMsg, responseFromMemory, rank=1) { + if (memQueue_in.isReady(clockEdge())) { + peek(memQueue_in, MemoryMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + if (in_msg.Type == MemoryRequestType:MEMORY_READ) { + trigger(Event:MemData, in_msg.addr, entry, tbe); + DPRINTF(RubySlicc, "%s\n", in_msg); + } else if (in_msg.Type == MemoryRequestType:MEMORY_WB) { + trigger(Event:WBAck, in_msg.addr, entry, tbe); // ignore WBAcks, don't care about them. 
+ } else { + DPRINTF(RubySlicc, "%s\n", in_msg.Type); + error("Invalid message"); + } + } + } + } + + in_port(requestNetwork_in, CPURequestMsg, requestFromCores, rank=0) { + if (requestNetwork_in.isReady(clockEdge())) { + peek(requestNetwork_in, CPURequestMsg) { + TBE tbe := TBEs.lookup(in_msg.addr); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); + if (inclusiveDir && isNotPresentProbeFilter(in_msg.addr)) { + Addr victim := ProbeFilterMemory.cacheProbe(in_msg.addr); + tbe := TBEs.lookup(victim); + entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(victim)); + trigger(Event:PF_Repl, victim, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:RdBlk) { + trigger(Event:RdBlk, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:RdBlkS) { + trigger(Event:RdBlkS, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:RdBlkM) { + trigger(Event:RdBlkM, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:WriteThrough) { + trigger(Event:WriteThrough, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:Atomic) { + trigger(Event:Atomic, in_msg.addr, entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:VicDirty) { + if (getDirectoryEntry(in_msg.addr).VicDirtyIgnore.isElement(in_msg.Requestor)) { + DPRINTF(RubySlicc, "Dropping VicDirty for address %s\n", in_msg.addr); + trigger(Event:StaleVicDirty, in_msg.addr, entry, tbe); + } else { + DPRINTF(RubySlicc, "Got VicDirty from %s on %s\n", in_msg.Requestor, in_msg.addr); + trigger(Event:VicDirty, in_msg.addr, entry, tbe); + } + } else if (in_msg.Type == CoherenceRequestType:VicClean) { + if (getDirectoryEntry(in_msg.addr).VicDirtyIgnore.isElement(in_msg.Requestor)) { + DPRINTF(RubySlicc, "Dropping VicClean for address %s\n", in_msg.addr); + trigger(Event:StaleVicDirty, in_msg.addr, entry, tbe); + } else { + DPRINTF(RubySlicc, "Got VicClean from %s on %s\n", in_msg.Requestor, in_msg.addr); + trigger(Event:VicClean, in_msg.addr, entry, tbe); + } + } else { + error("Bad request message type"); + } + } + } + } + + // Actions + action(s_sendResponseS, "s", desc="send Shared response") { + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysResp; + if (tbe.L3Hit) { + out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0)); + } else { + out_msg.Sender := machineID; + } + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.DataBlk := tbe.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := false; + out_msg.State := CoherenceState:Shared; + out_msg.InitialRequestTime := tbe.InitialRequestTime; + out_msg.ForwardRequestTime := tbe.ForwardRequestTime; + out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime; + out_msg.OriginalResponder := tbe.LastSender; + out_msg.L3Hit := tbe.L3Hit; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(es_sendResponseES, "es", desc="send Exclusive or Shared response") { + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysResp; + if (tbe.L3Hit) { + out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0)); + } else { + out_msg.Sender := machineID; + } + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.DataBlk := tbe.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := tbe.Dirty; + if (tbe.Cached) { + 
out_msg.State := CoherenceState:Shared; + } else { + out_msg.State := CoherenceState:Exclusive; + } + out_msg.InitialRequestTime := tbe.InitialRequestTime; + out_msg.ForwardRequestTime := tbe.ForwardRequestTime; + out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime; + out_msg.OriginalResponder := tbe.LastSender; + out_msg.L3Hit := tbe.L3Hit; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + // write-through and atomics do not send an unblock ack back to the + // directory. Hence, directory has to generate a self unblocking + // message. Additionally, write through's does not require data + // in its response. Hence, write through is treated seperately from + // write-back and atomics + action(m_sendResponseM, "m", desc="send Modified response") { + if (tbe.wtData) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:UnblockWriteThrough; + } + }else{ + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysResp; + if (tbe.L3Hit) { + out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0)); + } else { + out_msg.Sender := machineID; + } + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.DataBlk := tbe.DataBlk; + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.Dirty := tbe.Dirty; + out_msg.State := CoherenceState:Modified; + out_msg.CtoD := false; + out_msg.InitialRequestTime := tbe.InitialRequestTime; + out_msg.ForwardRequestTime := tbe.ForwardRequestTime; + out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime; + out_msg.OriginalResponder := tbe.LastSender; + if(tbe.atomicData){ + out_msg.WTRequestor := tbe.WTRequestor; + } + out_msg.L3Hit := tbe.L3Hit; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + if (tbe.atomicData) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:UnblockWriteThrough; + } + } + } + } + + action(c_sendResponseCtoD, "c", desc="send CtoD Ack") { + enqueue(responseNetwork_out, ResponseMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysResp; + out_msg.Sender := machineID; + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.MessageSize := MessageSizeType:Response_Control; + out_msg.Dirty := false; + out_msg.State := CoherenceState:Modified; + out_msg.CtoD := true; + out_msg.InitialRequestTime := tbe.InitialRequestTime; + out_msg.ForwardRequestTime := curCycle(); + out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + + action(w_sendResponseWBAck, "w", desc="send WB Ack") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, 1) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysWBAck; + out_msg.Destination.add(in_msg.Requestor); + out_msg.WTRequestor := in_msg.WTRequestor; + out_msg.Sender := machineID; + out_msg.MessageSize := MessageSizeType:Writeback_Control; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := curCycle(); + out_msg.ProbeRequestStartTime := curCycle(); + } + } + } + + action(l_queueMemWBReq, "lq", desc="Write WB data to memory") { + peek(responseNetwork_in, ResponseMsg) { + queueMemoryWrite(machineID, address, to_memory_controller_latency, + in_msg.DataBlk); + } + } + + action(l_queueMemRdReq, "lr", desc="Read data from memory") { + peek(requestNetwork_in, CPURequestMsg) { + if (L3CacheMemory.isTagPresent(address)) { + 
enqueue(L3TriggerQueue_out, TriggerMsg, l3_hit_latency) { + out_msg.addr := address; + out_msg.Type := TriggerType:L3Hit; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address)); + tbe.DataBlk := entry.DataBlk; + tbe.LastSender := entry.LastSender; + tbe.L3Hit := true; + tbe.MemData := true; + L3CacheMemory.deallocate(address); + } else { + queueMemoryRead(machineID, address, to_memory_controller_latency); + } + } + } + + action(dc_probeInvCoreData, "dc", desc="probe inv cores, return data") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbInv; + out_msg.ReturnData := true; + out_msg.MessageSize := MessageSizeType:Control; + if(isCPUSharer(address)) { + out_msg.Destination.broadcast(MachineType:CorePair); // won't be realistic for multisocket + } + + // add relevant TCC node to list. This replaces all TCPs and SQCs + if(isGPUSharer(address)) { + if ((in_msg.Type == CoherenceRequestType:WriteThrough || + in_msg.Type == CoherenceRequestType:Atomic) && + in_msg.NoWriteConflict) { + // Don't Include TCCs unless there was write-CAB conflict in the TCC + } else if(noTCCdir) { + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + } else { + out_msg.Destination.add(map_Address_to_TCCdir(address)); + } + } + out_msg.Destination.remove(in_msg.Requestor); + tbe.NumPendingAcks := out_msg.Destination.count(); + if (tbe.NumPendingAcks == 0) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:AcksComplete; + } + } + DPRINTF(RubySlicc, "%s\n", out_msg); + APPEND_TRANSITION_COMMENT(" dc: Acks remaining: "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + tbe.ProbeRequestStartTime := curCycle(); + } + } + } + + action(bp_backProbe, "bp", desc="back probe") { + enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbInv; + out_msg.ReturnData := true; + out_msg.MessageSize := MessageSizeType:Control; + if(isCPUSharer(address)) { + // won't be realistic for multisocket + out_msg.Destination.broadcast(MachineType:CorePair); + } + // add relevant TCC node to the list. This replaces all TCPs and SQCs + if(isGPUSharer(address)) { + if (noTCCdir) { + //Don't need to notify TCC about reads + } else { + out_msg.Destination.add(map_Address_to_TCCdir(address)); + tbe.NumPendingAcks := tbe.NumPendingAcks + 1; + } + if (noTCCdir && CAB_TCC) { + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + } + } + tbe.NumPendingAcks := out_msg.Destination.count(); + if (tbe.NumPendingAcks == 0) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:AcksComplete; + } + } + DPRINTF(RubySlicc, "%s\n", (out_msg)); + APPEND_TRANSITION_COMMENT(" sc: Acks remaining: "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + APPEND_TRANSITION_COMMENT(" - back probe"); + tbe.ProbeRequestStartTime := curCycle(); + } + } + + action(sc_probeShrCoreData, "sc", desc="probe shared cores, return data") { + peek(requestNetwork_in, CPURequestMsg) { // not the right network? 
+ enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbDowngrade; + out_msg.ReturnData := true; + out_msg.MessageSize := MessageSizeType:Control; + if(isCPUSharer(address)) { + out_msg.Destination.broadcast(MachineType:CorePair); // won't be realistic for multisocket + } + // add relevant TCC node to the list. This replaces all TCPs and SQCs + if(isGPUSharer(address)) { + if (noTCCdir) { + //Don't need to notify TCC about reads + } else { + out_msg.Destination.add(map_Address_to_TCCdir(address)); + tbe.NumPendingAcks := tbe.NumPendingAcks + 1; + } + if (noTCCdir && CAB_TCC) { + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + } + } + out_msg.Destination.remove(in_msg.Requestor); + tbe.NumPendingAcks := out_msg.Destination.count(); + if (tbe.NumPendingAcks == 0) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:AcksComplete; + } + } + DPRINTF(RubySlicc, "%s\n", (out_msg)); + APPEND_TRANSITION_COMMENT(" sc: Acks remaining: "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + tbe.ProbeRequestStartTime := curCycle(); + } + } + } + + action(ic_probeInvCore, "ic", desc="probe invalidate core, no return data needed") { + peek(requestNetwork_in, CPURequestMsg) { // not the right network? + enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) { + out_msg.addr := address; + out_msg.Type := ProbeRequestType:PrbInv; + out_msg.ReturnData := false; + out_msg.MessageSize := MessageSizeType:Control; + if(isCPUSharer(address)) { + out_msg.Destination.broadcast(MachineType:CorePair); // won't be realistic for multisocket + } + + // add relevant TCC node to the list. 
This replaces all TCPs and SQCs + if(isGPUSharer(address)) { + if (noTCCdir) { + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + } else { + out_msg.Destination.add(map_Address_to_TCCdir(address)); + } + } + out_msg.Destination.remove(in_msg.Requestor); + tbe.NumPendingAcks := out_msg.Destination.count(); + if (tbe.NumPendingAcks == 0) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:AcksComplete; + } + } + APPEND_TRANSITION_COMMENT(" ic: Acks remaining: "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + DPRINTF(RubySlicc, "%s\n", out_msg); + tbe.ProbeRequestStartTime := curCycle(); + } + } + } + + action(sm_setMRU, "sm", desc="set probe filter entry as MRU") { + ProbeFilterMemory.setMRU(address); + } + + action(d_writeDataToMemory, "d", desc="Write data to memory") { + peek(responseNetwork_in, ResponseMsg) { + getDirectoryEntry(address).DataBlk := in_msg.DataBlk; + DPRINTF(RubySlicc, "Writing Data: %s to address %s\n", in_msg.DataBlk, + in_msg.addr); + } + } + + action(te_allocateTBEForEviction, "te", desc="allocate TBE Entry") { + check_allocate(TBEs); + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + tbe.writeMask.clear(); + tbe.wtData := false; + tbe.atomicData := false; + tbe.DataBlk := getDirectoryEntry(address).DataBlk; // Data only for WBs + tbe.Dirty := false; + tbe.NumPendingAcks := 0; + } + + action(t_allocateTBE, "t", desc="allocate TBE Entry") { + check_allocate(TBEs); + peek(requestNetwork_in, CPURequestMsg) { + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + if (in_msg.Type == CoherenceRequestType:WriteThrough) { + tbe.writeMask.clear(); + tbe.writeMask.orMask(in_msg.writeMask); + tbe.wtData := true; + tbe.WTRequestor := in_msg.WTRequestor; + tbe.LastSender := in_msg.Requestor; + } + if (in_msg.Type == CoherenceRequestType:Atomic) { + tbe.writeMask.clear(); + tbe.writeMask.orMask(in_msg.writeMask); + tbe.atomicData := true; + tbe.WTRequestor := in_msg.WTRequestor; + tbe.LastSender := in_msg.Requestor; + } + tbe.DataBlk := getDirectoryEntry(address).DataBlk; // Data only for WBs + tbe.Dirty := false; + if (in_msg.Type == CoherenceRequestType:WriteThrough) { + tbe.DataBlk.copyPartial(in_msg.DataBlk,tbe.writeMask); + tbe.Dirty := false; + } + tbe.OriginalRequestor := in_msg.Requestor; + tbe.NumPendingAcks := 0; + tbe.Cached := in_msg.ForceShared; + tbe.InitialRequestTime := in_msg.InitialRequestTime; + } + } + + action(dt_deallocateTBE, "dt", desc="deallocate TBE Entry") { + if (tbe.Dirty == false) { + getDirectoryEntry(address).DataBlk := tbe.DataBlk; + } + TBEs.deallocate(address); + unset_tbe(); + } + + action(wd_writeBackData, "wd", desc="Write back data if needed") { + if (tbe.wtData) { + DataBlock tmp := getDirectoryEntry(address).DataBlk; + tmp.copyPartial(tbe.DataBlk,tbe.writeMask); + tbe.DataBlk := tmp; + getDirectoryEntry(address).DataBlk := tbe.DataBlk; + } else if (tbe.atomicData) { + tbe.DataBlk.atomicPartial(getDirectoryEntry(address).DataBlk, + tbe.writeMask); + getDirectoryEntry(address).DataBlk := tbe.DataBlk; + } else if (tbe.Dirty == false) { + getDirectoryEntry(address).DataBlk := tbe.DataBlk; + } + } + + action(mt_writeMemDataToTBE, "mt", desc="write Mem data to TBE") { + peek(memQueue_in, MemoryMsg) { + if (tbe.wtData == true) { + // DO Nothing (already have the directory data) + } else if (tbe.Dirty == false) { + tbe.DataBlk := getDirectoryEntry(address).DataBlk; + } + tbe.MemData := true; + } + } + + 
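Editor's note on the probe fan-out above: dc_probeInvCoreData, sc_probeShrCoreData, and ic_probeInvCore all build their destination set the same way: broadcast into the CPU complex when the probe filter marks a CPU sharer, add the TCC or TCC directory when it marks a GPU sharer, drop the requestor, and seed tbe.NumPendingAcks from whatever remains, firing AcksComplete immediately when the set is empty. The C++ sketch below shows that fan-out under the simplifying assumption that each domain is just a set of node names; PfEntry and the string node IDs are illustrative stand-ins, not gem5 types, and an untracked (NT) entry is treated conservatively as "both domains may share".

    #include <set>
    #include <string>

    struct PfEntry {
        bool tracked = false;   // T vs. NT in the probe filter
        bool onCPU   = false;   // mirrors isOnCPU
        bool onGPU   = false;   // mirrors isOnGPU
    };

    // Build the probe destination set and return the number of acks to expect.
    // A return value of 0 corresponds to the immediate AcksComplete trigger.
    int buildInvProbeSet(const PfEntry &pf,
                         const std::set<std::string> &cpuCores,
                         const std::set<std::string> &gpuNodes,
                         const std::string &requestor,
                         std::set<std::string> &dest)
    {
        if (!pf.tracked || pf.onCPU)        // NT => assume the CPU may share
            dest.insert(cpuCores.begin(), cpuCores.end());
        if (!pf.tracked || pf.onGPU)        // NT => assume the GPU may share
            dest.insert(gpuNodes.begin(), gpuNodes.end());

        dest.erase(requestor);              // never probe the requestor itself
        return static_cast<int>(dest.size());
    }

As CPUPrbResp messages drain back, x_decrementAcks lowers the count and o_checkForCompletion raises ProbeAcksComplete once it reaches zero.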
action(y_writeProbeDataToTBE, "y", desc="write Probe Data to TBE") { + peek(responseNetwork_in, ResponseMsg) { + if (in_msg.Dirty) { + DPRINTF(RubySlicc, "Got dirty data for %s from %s\n", address, in_msg.Sender); + DPRINTF(RubySlicc, "Data is %s\n", in_msg.DataBlk); + if (tbe.wtData) { + DataBlock tmp := in_msg.DataBlk; + tmp.copyPartial(tbe.DataBlk,tbe.writeMask); + tbe.DataBlk := tmp; + } else if (tbe.Dirty) { + if(tbe.atomicData == false && tbe.wtData == false) { + DPRINTF(RubySlicc, "Got double data for %s from %s\n", address, in_msg.Sender); + assert(tbe.DataBlk == in_msg.DataBlk); // in case of double data + } + } else { + tbe.DataBlk := in_msg.DataBlk; + tbe.Dirty := in_msg.Dirty; + tbe.LastSender := in_msg.Sender; + } + } + if (in_msg.Hit) { + tbe.Cached := true; + } + } + } + + action(mwc_markSinkWriteCancel, "mwc", desc="Mark to sink impending VicDirty") { + peek(responseNetwork_in, ResponseMsg) { + DPRINTF(RubySlicc, "Write cancel bit set on address %s\n", address); + getDirectoryEntry(address).VicDirtyIgnore.add(in_msg.Sender); + APPEND_TRANSITION_COMMENT(" setting bit to sink VicDirty "); + } + } + + action(x_decrementAcks, "x", desc="decrement Acks pending") { + tbe.NumPendingAcks := tbe.NumPendingAcks - 1; + APPEND_TRANSITION_COMMENT(" Acks remaining: "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + } + + action(o_checkForCompletion, "o", desc="check for ack completion") { + if (tbe.NumPendingAcks == 0) { + enqueue(triggerQueue_out, TriggerMsg, 1) { + out_msg.addr := address; + out_msg.Type := TriggerType:AcksComplete; + } + } + APPEND_TRANSITION_COMMENT(" Check: Acks remaining: "); + APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); + } + + action(rv_removeVicDirtyIgnore, "rv", desc="Remove ignored core") { + peek(requestNetwork_in, CPURequestMsg) { + getDirectoryEntry(address).VicDirtyIgnore.remove(in_msg.Requestor); + } + } + + action(al_allocateL3Block, "al", desc="allocate the L3 block on WB") { + peek(responseNetwork_in, ResponseMsg) { + if (L3CacheMemory.isTagPresent(address)) { + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address)); + APPEND_TRANSITION_COMMENT(" al wrote data to L3 (hit) "); + entry.DataBlk := in_msg.DataBlk; + entry.LastSender := in_msg.Sender; + } else { + if (L3CacheMemory.cacheAvail(address) == false) { + Addr victim := L3CacheMemory.cacheProbe(address); + CacheEntry victim_entry := static_cast(CacheEntry, "pointer", + L3CacheMemory.lookup(victim)); + queueMemoryWrite(machineID, victim, to_memory_controller_latency, + victim_entry.DataBlk); + L3CacheMemory.deallocate(victim); + } + assert(L3CacheMemory.cacheAvail(address)); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.allocate(address, new CacheEntry)); + APPEND_TRANSITION_COMMENT(" al wrote data to L3 "); + entry.DataBlk := in_msg.DataBlk; + + entry.LastSender := in_msg.Sender; + } + } + } + + action(alwt_allocateL3BlockOnWT, "alwt", desc="allocate the L3 block on WT") { + if ((tbe.wtData || tbe.atomicData) && useL3OnWT) { + if (L3CacheMemory.isTagPresent(address)) { + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address)); + APPEND_TRANSITION_COMMENT(" al wrote data to L3 (hit) "); + entry.DataBlk := tbe.DataBlk; + entry.LastSender := tbe.LastSender; + } else { + if (L3CacheMemory.cacheAvail(address) == false) { + Addr victim := L3CacheMemory.cacheProbe(address); + CacheEntry victim_entry := static_cast(CacheEntry, "pointer", + L3CacheMemory.lookup(victim)); + queueMemoryWrite(machineID, victim, 
to_memory_controller_latency, + victim_entry.DataBlk); + L3CacheMemory.deallocate(victim); + } + assert(L3CacheMemory.cacheAvail(address)); + CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.allocate(address, new CacheEntry)); + APPEND_TRANSITION_COMMENT(" al wrote data to L3 "); + entry.DataBlk := tbe.DataBlk; + entry.LastSender := tbe.LastSender; + } + } + } + + action(apf_allocateProbeFilterEntry, "apf", desc="Allocate probe filte entry") { + if (!ProbeFilterMemory.isTagPresent(address)) { + if (inclusiveDir) { + assert(ProbeFilterMemory.cacheAvail(address)); + } else if (ProbeFilterMemory.cacheAvail(address) == false) { + Addr victim := ProbeFilterMemory.cacheProbe(address); + ProbeFilterMemory.deallocate(victim); + } + assert(ProbeFilterMemory.cacheAvail(address)); + CacheEntry entry := static_cast(CacheEntry, "pointer", ProbeFilterMemory.allocate(address, new CacheEntry)); + APPEND_TRANSITION_COMMENT(" allocating a new probe filter entry"); + entry.pfState := ProbeFilterState:NT; + if (inclusiveDir) { + entry.pfState := ProbeFilterState:T; + } + entry.isOnCPU := false; + entry.isOnGPU := false; + } + } + + action(mpfe_markPFEntryForEviction, "mpfe", desc="Mark this PF entry is being evicted") { + assert(ProbeFilterMemory.isTagPresent(address)); + CacheEntry entry := static_cast(CacheEntry, "pointer", ProbeFilterMemory.lookup(address)); + entry.pfState := ProbeFilterState:B; + peek(requestNetwork_in, CPURequestMsg) { + tbe.demandAddress := in_msg.addr; + } + } + + action(we_wakeUpEvictionDependents, "we", desc="Wake up requests waiting for demand address and victim address") { + wakeUpBuffers(address); + wakeUpBuffers(tbe.demandAddress); + } + + action(dpf_deallocateProbeFilter, "dpf", desc="deallocate PF entry") { + assert(ProbeFilterMemory.isTagPresent(address)); + ProbeFilterMemory.deallocate(address); + } + + action(upf_updateProbeFilter, "upf", desc="") { + peek(requestNetwork_in, CPURequestMsg) { + assert(ProbeFilterMemory.isTagPresent(address)); + CacheEntry entry := static_cast(CacheEntry, "pointer", ProbeFilterMemory.lookup(address)); + if (in_msg.Type == CoherenceRequestType:WriteThrough) { + entry.pfState := ProbeFilterState:T; + entry.isOnCPU := false; + entry.isOnGPU := false; + } else if (in_msg.Type == CoherenceRequestType:Atomic) { + entry.pfState := ProbeFilterState:T; + entry.isOnCPU := false; + entry.isOnGPU := false; + } else if (in_msg.Type == CoherenceRequestType:RdBlkM) { + entry.pfState := ProbeFilterState:T; + entry.isOnCPU := false; + entry.isOnGPU := false; + } else if (in_msg.Type == CoherenceRequestType:CtoD) { + entry.pfState := ProbeFilterState:T; + entry.isOnCPU := false; + entry.isOnGPU := false; + } + if(machineIDToMachineType(in_msg.Requestor) == MachineType:CorePair) { + entry.isOnCPU := true; + } else { + entry.isOnGPU := true; + } + } + } + + action(rmcd_removeSharerConditional, "rmcd", desc="remove sharer from probe Filter, conditional") { + peek(requestNetwork_in, CPURequestMsg) { + if (ProbeFilterMemory.isTagPresent(address)) { + CacheEntry entry := static_cast(CacheEntry, "pointer", ProbeFilterMemory.lookup(address)); + if(machineIDToMachineType(in_msg.Requestor) == MachineType:CorePair) {//CorePair has inclusive L2 + if (in_msg.Type == CoherenceRequestType:VicDirty) { + entry.isOnCPU := false; + } else if (in_msg.Type == CoherenceRequestType:VicClean) { + entry.isOnCPU := false; + } + } + } + } + } + + action(sf_setForwardReqTime, "sf", desc="...") { + tbe.ForwardRequestTime := curCycle(); + } + + action(dl_deallocateL3, 
"dl", desc="deallocate the L3 block") { + L3CacheMemory.deallocate(address); + } + + action(p_popRequestQueue, "p", desc="pop request queue") { + requestNetwork_in.dequeue(clockEdge()); + } + + action(pr_popResponseQueue, "pr", desc="pop response queue") { + responseNetwork_in.dequeue(clockEdge()); + } + + action(pm_popMemQueue, "pm", desc="pop mem queue") { + memQueue_in.dequeue(clockEdge()); + } + + action(pt_popTriggerQueue, "pt", desc="pop trigger queue") { + triggerQueue_in.dequeue(clockEdge()); + } + + action(ptl_popTriggerQueue, "ptl", desc="pop L3 trigger queue") { + L3TriggerQueue_in.dequeue(clockEdge()); + } + + action(pu_popUnblockQueue, "pu", desc="pop unblock queue") { + unblockNetwork_in.dequeue(clockEdge()); + } + + action(zz_recycleRequestQueue, "zz", desc="recycle request queue") { + requestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(yy_recycleResponseQueue, "yy", desc="recycle response queue") { + responseNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); + } + + action(st_stallAndWaitRequest, "st", desc="Stall and wait on the address") { + stall_and_wait(requestNetwork_in, address); + } + + action(wa_wakeUpDependents, "wa", desc="Wake up any requests waiting for this address") { + wakeUpBuffers(address); + } + + action(wa_wakeUpAllDependents, "waa", desc="Wake up any requests waiting for this region") { + wakeUpAllBuffers(); + } + + action(z_stall, "z", desc="...") { + } + + // TRANSITIONS + transition({BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B_P, B}, {RdBlkS, RdBlkM, RdBlk, CtoD}) { + st_stallAndWaitRequest; + } + + // It may be possible to save multiple invalidations here! + transition({BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B_P, B}, {Atomic, WriteThrough}) { + st_stallAndWaitRequest; + } + + + // transitions from U + transition(U, PF_Repl, B_P) {PFTagArrayRead, PFTagArrayWrite}{ + te_allocateTBEForEviction; + apf_allocateProbeFilterEntry; + bp_backProbe; + sm_setMRU; + mpfe_markPFEntryForEviction; + } + + transition(U, {RdBlkS}, BS_PM) {L3TagArrayRead, PFTagArrayRead, PFTagArrayWrite} { + t_allocateTBE; + apf_allocateProbeFilterEntry; + l_queueMemRdReq; + sc_probeShrCoreData; + sm_setMRU; + upf_updateProbeFilter; + p_popRequestQueue; + } + + transition(U, WriteThrough, BM_PM) {L3TagArrayRead, L3TagArrayWrite, PFTagArrayRead, PFTagArrayWrite} { + t_allocateTBE; + apf_allocateProbeFilterEntry; + w_sendResponseWBAck; + l_queueMemRdReq; + dc_probeInvCoreData; + sm_setMRU; + upf_updateProbeFilter; + p_popRequestQueue; + } + + transition(U, Atomic, BM_PM) {L3TagArrayRead, L3TagArrayWrite, PFTagArrayRead, PFTagArrayWrite} { + t_allocateTBE; + apf_allocateProbeFilterEntry; + l_queueMemRdReq; + dc_probeInvCoreData; + sm_setMRU; + upf_updateProbeFilter; + p_popRequestQueue; + } + + transition(U, {RdBlkM}, BM_PM) {L3TagArrayRead, PFTagArrayRead, PFTagArrayWrite} { + t_allocateTBE; + apf_allocateProbeFilterEntry; + l_queueMemRdReq; + dc_probeInvCoreData; + sm_setMRU; + upf_updateProbeFilter; + p_popRequestQueue; + } + + transition(U, RdBlk, B_PM) {L3TagArrayRead, PFTagArrayRead, PFTagArrayWrite}{ + t_allocateTBE; + apf_allocateProbeFilterEntry; + l_queueMemRdReq; + sc_probeShrCoreData; + sm_setMRU; + upf_updateProbeFilter; + p_popRequestQueue; + } + + transition(U, CtoD, BP) {L3TagArrayRead, PFTagArrayRead, PFTagArrayWrite} { + t_allocateTBE; + apf_allocateProbeFilterEntry; + ic_probeInvCore; + sm_setMRU; + upf_updateProbeFilter; + p_popRequestQueue; + } + + transition(U, 
VicDirty, BL) {L3TagArrayRead} { + t_allocateTBE; + w_sendResponseWBAck; + rmcd_removeSharerConditional; + p_popRequestQueue; + } + + transition(U, VicClean, BL) {L3TagArrayRead} { + t_allocateTBE; + w_sendResponseWBAck; + rmcd_removeSharerConditional; + p_popRequestQueue; + } + + transition(BL, {VicDirty, VicClean}) { + zz_recycleRequestQueue; + } + + transition(BL, CPUData, U) {L3TagArrayWrite, L3DataArrayWrite} { + d_writeDataToMemory; + al_allocateL3Block; + wa_wakeUpDependents; + dt_deallocateTBE; + //l_queueMemWBReq; // why need an ack? esp. with DRAMSim, just put it in queue no ack needed + pr_popResponseQueue; + } + + transition(BL, StaleWB, U) {L3TagArrayWrite} { + dt_deallocateTBE; + wa_wakeUpAllDependents; + pr_popResponseQueue; + } + + transition({B, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B_P}, {VicDirty, VicClean}) { + z_stall; + } + + transition({U, BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B_P, B}, WBAck) { + pm_popMemQueue; + } + + transition({BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B_P, B}, PF_Repl) { + zz_recycleRequestQueue; + } + + transition({U, BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B_P, B}, StaleVicDirty) { + rv_removeVicDirtyIgnore; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition({B}, CoreUnblock, U) { + wa_wakeUpDependents; + pu_popUnblockQueue; + } + + transition(B, UnblockWriteThrough, U) { + wa_wakeUpDependents; + pt_popTriggerQueue; + } + + transition(BS_PM, MemData, BS_Pm) {} { + mt_writeMemDataToTBE; + pm_popMemQueue; + } + + transition(BM_PM, MemData, BM_Pm){} { + mt_writeMemDataToTBE; + pm_popMemQueue; + } + + transition(B_PM, MemData, B_Pm){} { + mt_writeMemDataToTBE; + pm_popMemQueue; + } + + transition(BS_PM, L3Hit, BS_Pm) {} { + ptl_popTriggerQueue; + } + + transition(BM_PM, L3Hit, BM_Pm) {} { + ptl_popTriggerQueue; + } + + transition(B_PM, L3Hit, B_Pm) {} { + ptl_popTriggerQueue; + } + + transition(BS_M, MemData, B){L3TagArrayWrite, L3DataArrayWrite} { + mt_writeMemDataToTBE; + s_sendResponseS; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pm_popMemQueue; + } + + transition(BM_M, MemData, B){L3TagArrayWrite, L3DataArrayWrite} { + mt_writeMemDataToTBE; + m_sendResponseM; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pm_popMemQueue; + } + + transition(B_M, MemData, B){L3TagArrayWrite, L3DataArrayWrite} { + mt_writeMemDataToTBE; + es_sendResponseES; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pm_popMemQueue; + } + + transition(BS_M, L3Hit, B) {L3TagArrayWrite, L3DataArrayWrite} { + s_sendResponseS; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + ptl_popTriggerQueue; + } + + transition(BM_M, L3Hit, B) {L3DataArrayWrite, L3TagArrayWrite} { + m_sendResponseM; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + ptl_popTriggerQueue; + } + + transition(B_M, L3Hit, B) {L3DataArrayWrite, L3TagArrayWrite} { + es_sendResponseES; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + ptl_popTriggerQueue; + } + + transition({BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B_P, BP}, CPUPrbResp) { + y_writeProbeDataToTBE; + x_decrementAcks; + o_checkForCompletion; + pr_popResponseQueue; + } + + transition(BS_PM, ProbeAcksComplete, BS_M) {} { + sf_setForwardReqTime; + pt_popTriggerQueue; + } + + transition(BM_PM, ProbeAcksComplete, BM_M) {} { + sf_setForwardReqTime; + pt_popTriggerQueue; + } + + transition(B_PM, ProbeAcksComplete, B_M){} { + 
sf_setForwardReqTime; + pt_popTriggerQueue; + } + + transition(BS_Pm, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} { + sf_setForwardReqTime; + s_sendResponseS; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pt_popTriggerQueue; + } + + transition(BM_Pm, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} { + sf_setForwardReqTime; + m_sendResponseM; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pt_popTriggerQueue; + } + + transition(B_Pm, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} { + sf_setForwardReqTime; + es_sendResponseES; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pt_popTriggerQueue; + } + + transition(B_P, ProbeAcksComplete, U) { + wd_writeBackData; + alwt_allocateL3BlockOnWT; + we_wakeUpEvictionDependents; + dpf_deallocateProbeFilter; + dt_deallocateTBE; + pt_popTriggerQueue; + } + + transition(BP, ProbeAcksComplete, B){L3TagArrayWrite, L3TagArrayWrite} { + sf_setForwardReqTime; + c_sendResponseCtoD; + wd_writeBackData; + alwt_allocateL3BlockOnWT; + dt_deallocateTBE; + pt_popTriggerQueue; + } +} diff --git a/src/mem/protocol/MOESI_AMD_Base.slicc b/src/mem/protocol/MOESI_AMD_Base.slicc new file mode 100644 index 000000000..b38145246 --- /dev/null +++ b/src/mem/protocol/MOESI_AMD_Base.slicc @@ -0,0 +1,6 @@ +protocol "MOESI_AMD_Base"; +include "RubySlicc_interfaces.slicc"; +include "MOESI_AMD_Base-msg.sm"; +include "MOESI_AMD_Base-CorePair.sm"; +include "MOESI_AMD_Base-L3cache.sm"; +include "MOESI_AMD_Base-dir.sm"; diff --git a/src/mem/protocol/RubySlicc_ComponentMapping.sm b/src/mem/protocol/RubySlicc_ComponentMapping.sm index a72492b42..e1d7c4399 100644 --- a/src/mem/protocol/RubySlicc_ComponentMapping.sm +++ b/src/mem/protocol/RubySlicc_ComponentMapping.sm @@ -37,7 +37,10 @@ MachineID mapAddressToRange(Addr addr, MachineType type, NetDest broadcast(MachineType type); MachineID map_Address_to_DMA(Addr addr); MachineID map_Address_to_Directory(Addr addr); +MachineID map_Address_to_RegionDir(Addr addr); NodeID map_Address_to_DirectoryNode(Addr addr); +MachineID map_Address_to_TCCdir(Addr addr); +NodeID map_Address_to_TCCdirNode(Addr addr); NodeID machineIDToNodeID(MachineID machID); NodeID machineIDToVersion(MachineID machID); MachineType machineIDToMachineType(MachineID machID); diff --git a/src/mem/protocol/RubySlicc_Exports.sm b/src/mem/protocol/RubySlicc_Exports.sm index 5ee26d65c..c743ebe28 100644 --- a/src/mem/protocol/RubySlicc_Exports.sm +++ b/src/mem/protocol/RubySlicc_Exports.sm @@ -62,7 +62,7 @@ bool testAndWrite(Addr addr, DataBlock datablk, Packet *pkt); // AccessPermission // The following five states define the access permission of all memory blocks. -// These permissions have multiple uses. They coordinate locking and +// These permissions have multiple uses. They coordinate locking and // synchronization primitives, as well as enable functional accesses. // One should not need to add any additional permission values and it is very // risky to do so. @@ -73,7 +73,7 @@ enumeration(AccessPermission, desc="...", default="AccessPermission_NotPresent") Read_Write, desc="block is Read/Write"; // Possibly Invalid data - // The maybe stale permission indicates that accordingly to the protocol, + // The maybe stale permission indicates that accordingly to the protocol, // there is no guarantee the block contains valid data. However, functional // writes should update the block because a dataless PUT request may // revalidate the block's data. 
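For reference, a minimal C++ sketch of the pending-ack idiom that the CPUPrbResp and ProbeAcksComplete transitions above implement via x_decrementAcks and o_checkForCompletion; the names below are hypothetical and this code is not part of the patch.

#include <cassert>
#include <cstdio>
#include <functional>

// Hypothetical stand-in for the directory TBE's probe-ack bookkeeping.
struct ProbeAckTracker {
    int pendingAcks;                       // set when probes are sent out
    std::function<void()> onAcksComplete;  // e.g. enqueue an AcksComplete trigger

    // Called once per core probe response, mirroring x_decrementAcks
    // followed by o_checkForCompletion.
    void receiveAck() {
        assert(pendingAcks > 0);
        --pendingAcks;
        if (pendingAcks == 0)
            onAcksComplete();              // last ack arrived; fire the trigger
    }
};

int main() {
    ProbeAckTracker t{2, [] { std::puts("AcksComplete"); }};
    t.receiveAck();  // first responder
    t.receiveAck();  // last responder triggers completion
    return 0;
}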
@@ -227,6 +227,13 @@ enumeration(MachineType, desc="...", default="MachineType_NULL") { Collector, desc="Collector Mach"; L1Cache_wCC, desc="L1 Cache Mach to track cache-to-cache transfer (used for miss latency profile)"; L2Cache_wCC, desc="L2 Cache Mach to track cache-to-cache transfer (used for miss latency profile)"; + CorePair, desc="Cache Mach (2 cores, Private L1Ds, Shared L1I & L2)"; + TCP, desc="GPU L1 Data Cache (Texture Cache per Pipe)"; + TCC, desc="GPU L2 Shared Cache (Texture Cache per Channel)"; + TCCdir, desc="Directory at the GPU L2 Cache (TCC)"; + SQC, desc="GPU L1 Instr Cache (Sequencer Cache)"; + RegionDir, desc="Region-granular directory"; + RegionBuffer,desc="Region buffer for CPU and GPU"; NULL, desc="null mach type"; } diff --git a/src/mem/protocol/RubySlicc_Types.sm b/src/mem/protocol/RubySlicc_Types.sm index a6c57e1b0..b8d284725 100644 --- a/src/mem/protocol/RubySlicc_Types.sm +++ b/src/mem/protocol/RubySlicc_Types.sm @@ -31,8 +31,8 @@ // // **PLEASE NOTE!** When adding objects to this file you must also add a line -// in the src/mem/ruby/SConscript file. Otherwise the external object's .hh -// file will not be copied to the protocol directory and you will encounter a +// in the src/mem/ruby/SConscript file. Otherwise the external object's .hh +// file will not be copied to the protocol directory and you will encounter a // undefined declaration error. // @@ -95,6 +95,8 @@ structure (NetDest, external = "yes", non_obj="yes") { bool intersectionIsEmpty(Set); bool intersectionIsEmpty(NetDest); MachineID smallestElement(MachineType); + NetDest OR(NetDest); + NetDest AND(NetDest); } structure (Sequencer, external = "yes") { @@ -117,6 +119,44 @@ structure (Sequencer, external = "yes") { void invalidateSC(Addr); } +structure (GPUCoalescer, external = "yes") { + void readCallback(Addr, DataBlock); + void readCallback(Addr, MachineType, DataBlock); + void readCallback(Addr, MachineType, DataBlock, + Cycles, Cycles, Cycles); + void readCallback(Addr, MachineType, DataBlock, + Cycles, Cycles, Cycles, bool); + void writeCallback(Addr, DataBlock); + void writeCallback(Addr, MachineType, DataBlock); + void writeCallback(Addr, MachineType, DataBlock, + Cycles, Cycles, Cycles); + void writeCallback(Addr, MachineType, DataBlock, + Cycles, Cycles, Cycles, bool); + void checkCoherence(Addr); + void evictionCallback(Addr); + void recordCPReadCallBack(MachineID, MachineID); + void recordCPWriteCallBack(MachineID, MachineID); +} + +structure (VIPERCoalescer, external = "yes") { + void readCallback(Addr, DataBlock); + void readCallback(Addr, MachineType, DataBlock); + void readCallback(Addr, MachineType, DataBlock, + Cycles, Cycles, Cycles); + void readCallback(Addr, MachineType, DataBlock, + Cycles, Cycles, Cycles, bool); + void writeCallback(Addr, DataBlock); + void writeCallback(Addr, MachineType, DataBlock); + void writeCallback(Addr, MachineType, DataBlock, + Cycles, Cycles, Cycles); + void writeCallback(Addr, MachineType, DataBlock, + Cycles, Cycles, Cycles, bool); + void invCallback(Addr); + void wbCallback(Addr); + void checkCoherence(Addr); + void evictionCallback(Addr); +} + structure(RubyRequest, desc="...", interface="Message", external="yes") { Addr LineAddress, desc="Line address for this request"; Addr PhysicalAddress, desc="Physical address for this request"; @@ -161,6 +201,7 @@ structure (CacheMemory, external = "yes") { Cycles getTagLatency(); Cycles getDataLatency(); void setMRU(Addr); + void setMRU(Addr, int); void setMRU(AbstractCacheEntry); void 
recordRequestType(CacheRequestType, Addr); bool checkResourceAvailable(CacheResourceType, Addr); diff --git a/src/mem/protocol/SConsopts b/src/mem/protocol/SConsopts index ca432a73e..47b36e276 100644 --- a/src/mem/protocol/SConsopts +++ b/src/mem/protocol/SConsopts @@ -33,6 +33,11 @@ import os Import('*') all_protocols.extend([ + 'GPU_VIPER', + 'GPU_VIPER_Baseline', + 'GPU_VIPER_Region', + 'GPU_RfO', + 'MOESI_AMD_Base', 'MESI_Two_Level', 'MESI_Three_Level', 'MI_example', diff --git a/src/mem/ruby/SConscript b/src/mem/ruby/SConscript index 16e932432..82a16c9b0 100644 --- a/src/mem/ruby/SConscript +++ b/src/mem/ruby/SConscript @@ -124,13 +124,20 @@ MakeInclude('common/Set.hh') MakeInclude('common/WriteMask.hh') MakeInclude('filters/AbstractBloomFilter.hh') MakeInclude('network/MessageBuffer.hh') -MakeInclude('structures/Prefetcher.hh') MakeInclude('structures/CacheMemory.hh') -MakeInclude('system/DMASequencer.hh') MakeInclude('structures/DirectoryMemory.hh') -MakeInclude('structures/WireBuffer.hh') MakeInclude('structures/PerfectCacheMemory.hh') MakeInclude('structures/PersistentTable.hh') -MakeInclude('system/Sequencer.hh') +MakeInclude('structures/Prefetcher.hh') MakeInclude('structures/TBETable.hh') MakeInclude('structures/TimerTable.hh') +MakeInclude('structures/WireBuffer.hh') +MakeInclude('system/DMASequencer.hh') +MakeInclude('system/Sequencer.hh') + +# External types : Group "mem/protocol" : include "header.hh" to the bottom +# of this MakeIncludes if it is referenced as +# <# include "mem/protocol/header.hh"> in any file +# generated_dir = Dir('../protocol') +MakeInclude('system/GPUCoalescer.hh') +MakeInclude('system/VIPERCoalescer.hh') diff --git a/src/mem/ruby/profiler/Profiler.cc b/src/mem/ruby/profiler/Profiler.cc index b3b37e5a6..7d3f20982 100644 --- a/src/mem/ruby/profiler/Profiler.cc +++ b/src/mem/ruby/profiler/Profiler.cc @@ -269,7 +269,7 @@ Profiler::collateStats() it != m_ruby_system->m_abstract_controls[i].end(); ++it) { AbstractController *ctr = (*it).second; - Sequencer *seq = ctr->getSequencer(); + Sequencer *seq = ctr->getCPUSequencer(); if (seq != NULL) { m_outstandReqHist.add(seq->getOutstandReqHist()); } @@ -282,7 +282,7 @@ Profiler::collateStats() it != m_ruby_system->m_abstract_controls[i].end(); ++it) { AbstractController *ctr = (*it).second; - Sequencer *seq = ctr->getSequencer(); + Sequencer *seq = ctr->getCPUSequencer(); if (seq != NULL) { // add all the latencies m_latencyHist.add(seq->getLatencyHist()); diff --git a/src/mem/ruby/slicc_interface/AbstractCacheEntry.hh b/src/mem/ruby/slicc_interface/AbstractCacheEntry.hh index 926556781..cbd068c04 100644 --- a/src/mem/ruby/slicc_interface/AbstractCacheEntry.hh +++ b/src/mem/ruby/slicc_interface/AbstractCacheEntry.hh @@ -56,6 +56,12 @@ class AbstractCacheEntry : public AbstractEntry virtual DataBlock& getDataBlk() { panic("getDataBlk() not implemented!"); } + int validBlocks; + virtual int& getNumValidBlocks() + { + return validBlocks; + } + // Functions for locking and unlocking the cache entry. These are required // for supporting atomic memory accesses. 
void setLocked(int context); diff --git a/src/mem/ruby/slicc_interface/AbstractController.cc b/src/mem/ruby/slicc_interface/AbstractController.cc index 93fe50c88..458fde5bc 100644 --- a/src/mem/ruby/slicc_interface/AbstractController.cc +++ b/src/mem/ruby/slicc_interface/AbstractController.cc @@ -200,6 +200,12 @@ AbstractController::unblock(Addr addr) } } +bool +AbstractController::isBlocked(Addr addr) +{ + return (m_block_map.count(addr) > 0); +} + BaseMasterPort & AbstractController::getMasterPort(const std::string &if_name, PortID idx) diff --git a/src/mem/ruby/slicc_interface/AbstractController.hh b/src/mem/ruby/slicc_interface/AbstractController.hh index 383507eed..4488ee3f4 100644 --- a/src/mem/ruby/slicc_interface/AbstractController.hh +++ b/src/mem/ruby/slicc_interface/AbstractController.hh @@ -73,6 +73,7 @@ class AbstractController : public MemObject, public Consumer // return instance name void blockOnQueue(Addr, MessageBuffer*); void unblock(Addr); + bool isBlocked(Addr); virtual MessageBuffer* getMandatoryQueue() const = 0; virtual MessageBuffer* getMemoryQueue() const = 0; @@ -84,7 +85,7 @@ class AbstractController : public MemObject, public Consumer virtual void regStats(); virtual void recordCacheTrace(int cntrl, CacheRecorder* tr) = 0; - virtual Sequencer* getSequencer() const = 0; + virtual Sequencer* getCPUSequencer() const = 0; //! These functions are used by ruby system to read/write the data blocks //! that exist with in the controller. diff --git a/src/mem/ruby/slicc_interface/RubySlicc_ComponentMapping.hh b/src/mem/ruby/slicc_interface/RubySlicc_ComponentMapping.hh index 46071335e..cdedc2e14 100644 --- a/src/mem/ruby/slicc_interface/RubySlicc_ComponentMapping.hh +++ b/src/mem/ruby/slicc_interface/RubySlicc_ComponentMapping.hh @@ -43,6 +43,12 @@ map_Address_to_DirectoryNode(Addr addr) return DirectoryMemory::mapAddressToDirectoryVersion(addr); } +inline NodeID +map_Address_to_TCCdirNode(Addr addr) +{ + return DirectoryMemory::mapAddressToDirectoryVersion(addr); +} + // used to determine the home directory // returns a value between 0 and total_directories_within_the_system inline MachineID @@ -53,6 +59,22 @@ map_Address_to_Directory(Addr addr) return mach; } +inline MachineID +map_Address_to_RegionDir(Addr addr) +{ + MachineID mach = {MachineType_RegionDir, + map_Address_to_DirectoryNode(addr)}; + return mach; +} + +inline MachineID +map_Address_to_TCCdir(Addr addr) +{ + MachineID mach = + {MachineType_TCCdir, map_Address_to_TCCdirNode(addr)}; + return mach; +} + inline NetDest broadcast(MachineType type) { @@ -102,4 +124,11 @@ createMachineID(MachineType type, NodeID id) return mach; } +inline MachineID +MachineTypeAndNodeIDToMachineID(MachineType type, NodeID node) +{ + MachineID mach = {type, node}; + return mach; +} + #endif // __MEM_RUBY_SLICC_INTERFACE_COMPONENTMAPPINGS_HH__ diff --git a/src/mem/ruby/structures/CacheMemory.cc b/src/mem/ruby/structures/CacheMemory.cc index a8a3ba949..45fb85d05 100644 --- a/src/mem/ruby/structures/CacheMemory.cc +++ b/src/mem/ruby/structures/CacheMemory.cc @@ -35,6 +35,7 @@ #include "mem/protocol/AccessPermission.hh" #include "mem/ruby/structures/CacheMemory.hh" #include "mem/ruby/system/RubySystem.hh" +#include "mem/ruby/system/WeightedLRUPolicy.hh" using namespace std; @@ -66,29 +67,27 @@ CacheMemory::CacheMemory(const Params *p) m_start_index_bit = p->start_index_bit; m_is_instruction_only_cache = p->is_icache; m_resource_stalls = p->resourceStalls; + m_block_size = p->block_size; // may be 0 at this point. 
Updated in init() } void CacheMemory::init() { - m_cache_num_sets = (m_cache_size / m_cache_assoc) / - RubySystem::getBlockSizeBytes(); + if (m_block_size == 0) { + m_block_size = RubySystem::getBlockSizeBytes(); + } + m_cache_num_sets = (m_cache_size / m_cache_assoc) / m_block_size; assert(m_cache_num_sets > 1); m_cache_num_set_bits = floorLog2(m_cache_num_sets); assert(m_cache_num_set_bits > 0); - m_cache.resize(m_cache_num_sets); - for (int i = 0; i < m_cache_num_sets; i++) { - m_cache[i].resize(m_cache_assoc); - for (int j = 0; j < m_cache_assoc; j++) { - m_cache[i][j] = NULL; - } - } + m_cache.resize(m_cache_num_sets, + std::vector<AbstractCacheEntry*>(m_cache_assoc, nullptr)); } CacheMemory::~CacheMemory() { - if (m_replacementPolicy_ptr != NULL) + if (m_replacementPolicy_ptr) delete m_replacementPolicy_ptr; for (int i = 0; i < m_cache_num_sets; i++) { for (int j = 0; j < m_cache_assoc; j++) { @@ -359,6 +358,37 @@ CacheMemory::setMRU(const AbstractCacheEntry *e) } void +CacheMemory::setMRU(Addr address, int occupancy) +{ + int64_t cacheSet = addressToCacheSet(address); + int loc = findTagInSet(cacheSet, address); + + if(loc != -1) { + if (m_replacementPolicy_ptr->useOccupancy()) { + (static_cast<WeightedLRUPolicy*>(m_replacementPolicy_ptr))-> + touch(cacheSet, loc, curTick(), occupancy); + } else { + m_replacementPolicy_ptr-> + touch(cacheSet, loc, curTick()); + } + } +} + +int +CacheMemory::getReplacementWeight(int64_t set, int64_t loc) +{ + assert(set < m_cache_num_sets); + assert(loc < m_cache_assoc); + int ret = 0; + if(m_cache[set][loc] != NULL) { + ret = m_cache[set][loc]->getNumValidBlocks(); + assert(ret >= 0); + } + + return ret; +} + +void CacheMemory::recordCacheContents(int cntrl, CacheRecorder* tr) const { uint64_t warmedUpBlocks = 0; diff --git a/src/mem/ruby/structures/CacheMemory.hh b/src/mem/ruby/structures/CacheMemory.hh index 72805b32b..5b30505d3 100644 --- a/src/mem/ruby/structures/CacheMemory.hh +++ b/src/mem/ruby/structures/CacheMemory.hh @@ -106,7 +106,8 @@ class CacheMemory : public SimObject // Set this address to most recently used void setMRU(Addr address); - // Set this entry to most recently used + void setMRU(Addr addr, int occupancy); + int getReplacementWeight(int64_t set, int64_t loc); void setMRU(const AbstractCacheEntry *e); // Functions for locking and unlocking cache lines corresponding to the @@ -146,6 +147,7 @@ class CacheMemory : public SimObject Stats::Scalar numDataArrayStalls; int getCacheSize() const { return m_cache_size; } + int getCacheAssoc() const { return m_cache_assoc; } int getNumBlocks() const { return m_cache_num_sets * m_cache_assoc; } Addr getAddressAtIdx(int idx) const; @@ -182,6 +184,7 @@ class CacheMemory : public SimObject int m_cache_assoc; int m_start_index_bit; bool m_resource_stalls; + int m_block_size; }; std::ostream& operator<<(std::ostream& out, const CacheMemory& obj); diff --git a/src/mem/ruby/structures/RubyCache.py b/src/mem/ruby/structures/RubyCache.py index 4eb87ac74..9fc4726b0 100644 --- a/src/mem/ruby/structures/RubyCache.py +++ b/src/mem/ruby/structures/RubyCache.py @@ -42,6 +42,7 @@ class RubyCache(SimObject): "") start_index_bit = Param.Int(6, "index start, default 6 for 64-byte line"); is_icache = Param.Bool(False, "is instruction only cache"); + block_size = Param.MemorySize("0B", "block size in bytes. 
0 means default RubyBlockSize") dataArrayBanks = Param.Int(1, "Number of banks for the data array") tagArrayBanks = Param.Int(1, "Number of banks for the tag array") diff --git a/src/mem/ruby/system/GPUCoalescer.cc b/src/mem/ruby/system/GPUCoalescer.cc new file mode 100644 index 000000000..db279bd3a --- /dev/null +++ b/src/mem/ruby/system/GPUCoalescer.cc @@ -0,0 +1,1397 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Sooraj Puthoor + */ + +#include "base/misc.hh" +#include "base/str.hh" +#include "config/the_isa.hh" + +#if THE_ISA == X86_ISA +#include "arch/x86/insts/microldstop.hh" + +#endif // X86_ISA +#include "mem/ruby/system/GPUCoalescer.hh" + +#include "cpu/testers/rubytest/RubyTester.hh" +#include "debug/GPUCoalescer.hh" +#include "debug/MemoryAccess.hh" +#include "debug/ProtocolTrace.hh" +#include "debug/RubyPort.hh" +#include "debug/RubyStats.hh" +#include "gpu-compute/shader.hh" +#include "mem/packet.hh" +#include "mem/ruby/common/DataBlock.hh" +#include "mem/ruby/common/SubBlock.hh" +#include "mem/ruby/network/MessageBuffer.hh" +#include "mem/ruby/profiler/Profiler.hh" +#include "mem/ruby/slicc_interface/AbstractController.hh" +#include "mem/ruby/slicc_interface/RubyRequest.hh" +#include "mem/ruby/structures/CacheMemory.hh" +#include "mem/ruby/system/RubySystem.hh" +#include "params/RubyGPUCoalescer.hh" + +using namespace std; + +GPUCoalescer * +RubyGPUCoalescerParams::create() +{ + return new GPUCoalescer(this); +} + +HSAScope +reqScopeToHSAScope(Request* req) +{ + HSAScope accessScope = HSAScope_UNSPECIFIED; + if (req->isScoped()) { + if (req->isWavefrontScope()) { + accessScope = HSAScope_WAVEFRONT; + } else if (req->isWorkgroupScope()) { + accessScope = HSAScope_WORKGROUP; + } else if (req->isDeviceScope()) { + accessScope = HSAScope_DEVICE; + } else if (req->isSystemScope()) { + accessScope = HSAScope_SYSTEM; + } else { + fatal("Bad scope type"); + } + } + return accessScope; +} + +HSASegment +reqSegmentToHSASegment(Request* req) +{ + HSASegment accessSegment = HSASegment_GLOBAL; + + if (req->isGlobalSegment()) { + accessSegment = HSASegment_GLOBAL; + } else if (req->isGroupSegment()) { + accessSegment = HSASegment_GROUP; + } else if (req->isPrivateSegment()) { + accessSegment = HSASegment_PRIVATE; + } else if (req->isKernargSegment()) { + accessSegment = HSASegment_KERNARG; + } else if (req->isReadonlySegment()) { + accessSegment = HSASegment_READONLY; + } else if (req->isSpillSegment()) { + accessSegment = HSASegment_SPILL; + } else if (req->isArgSegment()) { + accessSegment = HSASegment_ARG; + } else { + fatal("Bad segment type"); + } + + return accessSegment; +} + +GPUCoalescer::GPUCoalescer(const Params *p) + : RubyPort(p), issueEvent(this), deadlockCheckEvent(this) +{ + m_store_waiting_on_load_cycles = 0; + m_store_waiting_on_store_cycles = 0; + m_load_waiting_on_store_cycles = 0; + m_load_waiting_on_load_cycles = 0; + + m_outstanding_count = 0; + + m_max_outstanding_requests = 0; + m_deadlock_threshold = 0; + m_instCache_ptr = nullptr; + m_dataCache_ptr = nullptr; + + m_instCache_ptr = p->icache; + m_dataCache_ptr = p->dcache; + m_max_outstanding_requests = p->max_outstanding_requests; + m_deadlock_threshold = p->deadlock_threshold; + + assert(m_max_outstanding_requests > 0); + assert(m_deadlock_threshold > 0); + assert(m_instCache_ptr); + assert(m_dataCache_ptr); + + m_data_cache_hit_latency = p->dcache_hit_latency; + + m_usingNetworkTester = p->using_network_tester; + assumingRfOCoherence = p->assume_rfo; +} + +GPUCoalescer::~GPUCoalescer() +{ +} + +void +GPUCoalescer::wakeup() +{ + // Check for deadlock of any of the requests + Cycles current_time = curCycle(); + + // Check across all outstanding requests + int total_outstanding = 0; + + RequestTable::iterator read = m_readRequestTable.begin(); + RequestTable::iterator read_end = m_readRequestTable.end(); + for (; read != read_end; ++read) { + GPUCoalescerRequest* request = read->second; + if (current_time - 
request->issue_time < m_deadlock_threshold) + continue; + + panic("Possible Deadlock detected. Aborting!\n" + "version: %d request.paddr: 0x%x m_readRequestTable: %d " + "current time: %u issue_time: %d difference: %d\n", m_version, + request->pkt->getAddr(), m_readRequestTable.size(), + current_time * clockPeriod(), request->issue_time * clockPeriod(), + (current_time - request->issue_time)*clockPeriod()); + } + + RequestTable::iterator write = m_writeRequestTable.begin(); + RequestTable::iterator write_end = m_writeRequestTable.end(); + for (; write != write_end; ++write) { + GPUCoalescerRequest* request = write->second; + if (current_time - request->issue_time < m_deadlock_threshold) + continue; + + panic("Possible Deadlock detected. Aborting!\n" + "version: %d request.paddr: 0x%x m_writeRequestTable: %d " + "current time: %u issue_time: %d difference: %d\n", m_version, + request->pkt->getAddr(), m_writeRequestTable.size(), + current_time * clockPeriod(), request->issue_time * clockPeriod(), + (current_time - request->issue_time) * clockPeriod()); + } + + total_outstanding += m_writeRequestTable.size(); + total_outstanding += m_readRequestTable.size(); + + assert(m_outstanding_count == total_outstanding); + + if (m_outstanding_count > 0) { + // If there are still outstanding requests, keep checking + schedule(deadlockCheckEvent, + m_deadlock_threshold * clockPeriod() + + curTick()); + } +} + +void +GPUCoalescer::resetStats() +{ + m_latencyHist.reset(); + m_missLatencyHist.reset(); + for (int i = 0; i < RubyRequestType_NUM; i++) { + m_typeLatencyHist[i]->reset(); + m_missTypeLatencyHist[i]->reset(); + for (int j = 0; j < MachineType_NUM; j++) { + m_missTypeMachLatencyHist[i][j]->reset(); + } + } + + for (int i = 0; i < MachineType_NUM; i++) { + m_missMachLatencyHist[i]->reset(); + + m_IssueToInitialDelayHist[i]->reset(); + m_InitialToForwardDelayHist[i]->reset(); + m_ForwardToFirstResponseDelayHist[i]->reset(); + m_FirstResponseToCompletionDelayHist[i]->reset(); + } +} + +void +GPUCoalescer::printProgress(ostream& out) const +{ +} + +RequestStatus +GPUCoalescer::getRequestStatus(PacketPtr pkt, RubyRequestType request_type) +{ + Addr line_addr = makeLineAddress(pkt->getAddr()); + + if (!m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge())) { + return RequestStatus_BufferFull; + } + + if(m_controller->isBlocked(line_addr) && + request_type != RubyRequestType_Locked_RMW_Write) { + return RequestStatus_Aliased; + } + + if ((request_type == RubyRequestType_ST) || + (request_type == RubyRequestType_ATOMIC) || + (request_type == RubyRequestType_ATOMIC_RETURN) || + (request_type == RubyRequestType_ATOMIC_NO_RETURN) || + (request_type == RubyRequestType_RMW_Read) || + (request_type == RubyRequestType_RMW_Write) || + (request_type == RubyRequestType_Load_Linked) || + (request_type == RubyRequestType_Store_Conditional) || + (request_type == RubyRequestType_Locked_RMW_Read) || + (request_type == RubyRequestType_Locked_RMW_Write) || + (request_type == RubyRequestType_FLUSH)) { + + // Check if there is any outstanding read request for the same + // cache line. + if (m_readRequestTable.count(line_addr) > 0) { + m_store_waiting_on_load_cycles++; + return RequestStatus_Aliased; + } + + if (m_writeRequestTable.count(line_addr) > 0) { + // There is an outstanding write request for the cache line + m_store_waiting_on_store_cycles++; + return RequestStatus_Aliased; + } + } else { + // Check if there is any outstanding write request for the same + // cache line. 
+ if (m_writeRequestTable.count(line_addr) > 0) { + m_load_waiting_on_store_cycles++; + return RequestStatus_Aliased; + } + + if (m_readRequestTable.count(line_addr) > 0) { + // There is an outstanding read request for the cache line + m_load_waiting_on_load_cycles++; + return RequestStatus_Aliased; + } + } + + return RequestStatus_Ready; + +} + + + +// sets the kernelEndList +void +GPUCoalescer::insertKernel(int wavefront_id, PacketPtr pkt) +{ + // Don't know if this will happen or is possible + // but I just want to be careful and not have it become + // simulator hang in the future + DPRINTF(GPUCoalescer, "inserting wf: %d to kernelEndlist\n", wavefront_id); + assert(kernelEndList.count(wavefront_id) == 0); + + kernelEndList[wavefront_id] = pkt; + DPRINTF(GPUCoalescer, "kernelEndList->size() = %d\n", + kernelEndList.size()); +} + + +// Insert the request on the correct request table. Return true if +// the entry was already present. +bool +GPUCoalescer::insertRequest(PacketPtr pkt, RubyRequestType request_type) +{ + assert(getRequestStatus(pkt, request_type) == RequestStatus_Ready || + pkt->req->isLockedRMW() || + !m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge())); + + int total_outstanding M5_VAR_USED = + m_writeRequestTable.size() + m_readRequestTable.size(); + + assert(m_outstanding_count == total_outstanding); + + // See if we should schedule a deadlock check + if (deadlockCheckEvent.scheduled() == false) { + schedule(deadlockCheckEvent, m_deadlock_threshold + curTick()); + } + + Addr line_addr = makeLineAddress(pkt->getAddr()); + if ((request_type == RubyRequestType_ST) || + (request_type == RubyRequestType_ATOMIC) || + (request_type == RubyRequestType_ATOMIC_RETURN) || + (request_type == RubyRequestType_ATOMIC_NO_RETURN) || + (request_type == RubyRequestType_RMW_Read) || + (request_type == RubyRequestType_RMW_Write) || + (request_type == RubyRequestType_Load_Linked) || + (request_type == RubyRequestType_Store_Conditional) || + (request_type == RubyRequestType_Locked_RMW_Read) || + (request_type == RubyRequestType_Locked_RMW_Write) || + (request_type == RubyRequestType_FLUSH)) { + + pair<RequestTable::iterator, bool> r = + m_writeRequestTable.insert(RequestTable::value_type(line_addr, + (GPUCoalescerRequest*) NULL)); + if (r.second) { + RequestTable::iterator i = r.first; + i->second = new GPUCoalescerRequest(pkt, request_type, + curCycle()); + DPRINTF(GPUCoalescer, + "Inserting write request for paddr %#x for type %d\n", + pkt->req->getPaddr(), i->second->m_type); + m_outstanding_count++; + } else { + return true; + } + } else { + pair<RequestTable::iterator, bool> r = + m_readRequestTable.insert(RequestTable::value_type(line_addr, + (GPUCoalescerRequest*) NULL)); + + if (r.second) { + RequestTable::iterator i = r.first; + i->second = new GPUCoalescerRequest(pkt, request_type, + curCycle()); + DPRINTF(GPUCoalescer, + "Inserting read request for paddr %#x for type %d\n", + pkt->req->getPaddr(), i->second->m_type); + m_outstanding_count++; + } else { + return true; + } + } + + m_outstandReqHist.sample(m_outstanding_count); + + total_outstanding = m_writeRequestTable.size() + m_readRequestTable.size(); + assert(m_outstanding_count == total_outstanding); + + return false; +} + +void +GPUCoalescer::markRemoved() +{ + m_outstanding_count--; + assert(m_outstanding_count == + m_writeRequestTable.size() + m_readRequestTable.size()); +} + +void +GPUCoalescer::removeRequest(GPUCoalescerRequest* srequest) +{ + assert(m_outstanding_count == + m_writeRequestTable.size() + 
m_readRequestTable.size()); + + Addr line_addr = makeLineAddress(srequest->pkt->getAddr()); + if ((srequest->m_type == RubyRequestType_ST) || + (srequest->m_type == RubyRequestType_RMW_Read) || + (srequest->m_type == RubyRequestType_RMW_Write) || + (srequest->m_type == RubyRequestType_Load_Linked) || + (srequest->m_type == RubyRequestType_Store_Conditional) || + (srequest->m_type == RubyRequestType_Locked_RMW_Read) || + (srequest->m_type == RubyRequestType_Locked_RMW_Write)) { + m_writeRequestTable.erase(line_addr); + } else { + m_readRequestTable.erase(line_addr); + } + + markRemoved(); +} + +bool +GPUCoalescer::handleLlsc(Addr address, GPUCoalescerRequest* request) +{ + // + // The success flag indicates whether the LLSC operation was successful. + // LL ops will always succeed, but SC may fail if the cache line is no + // longer locked. + // + bool success = true; + if (request->m_type == RubyRequestType_Store_Conditional) { + if (!m_dataCache_ptr->isLocked(address, m_version)) { + // + // For failed SC requests, indicate the failure to the cpu by + // setting the extra data to zero. + // + request->pkt->req->setExtraData(0); + success = false; + } else { + // + // For successful SC requests, indicate the success to the cpu by + // setting the extra data to one. + // + request->pkt->req->setExtraData(1); + } + // + // Independent of success, all SC operations must clear the lock + // + m_dataCache_ptr->clearLocked(address); + } else if (request->m_type == RubyRequestType_Load_Linked) { + // + // Note: To fully follow Alpha LLSC semantics, should the LL clear any + // previously locked cache lines? + // + m_dataCache_ptr->setLocked(address, m_version); + } else if ((m_dataCache_ptr->isTagPresent(address)) && + (m_dataCache_ptr->isLocked(address, m_version))) { + // + // Normal writes should clear the locked address + // + m_dataCache_ptr->clearLocked(address); + } + return success; +} + +void +GPUCoalescer::writeCallback(Addr address, DataBlock& data) +{ + writeCallback(address, MachineType_NULL, data); +} + +void +GPUCoalescer::writeCallback(Addr address, + MachineType mach, + DataBlock& data) +{ + writeCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0)); +} + +void +GPUCoalescer::writeCallback(Addr address, + MachineType mach, + DataBlock& data, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime) +{ + writeCallback(address, mach, data, + initialRequestTime, forwardRequestTime, firstResponseTime, + false); +} + +void +GPUCoalescer::writeCallback(Addr address, + MachineType mach, + DataBlock& data, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime, + bool isRegion) +{ + assert(address == makeLineAddress(address)); + + DPRINTF(GPUCoalescer, "write callback for address %#x\n", address); + assert(m_writeRequestTable.count(makeLineAddress(address))); + + RequestTable::iterator i = m_writeRequestTable.find(address); + assert(i != m_writeRequestTable.end()); + GPUCoalescerRequest* request = i->second; + + m_writeRequestTable.erase(i); + markRemoved(); + + assert((request->m_type == RubyRequestType_ST) || + (request->m_type == RubyRequestType_ATOMIC) || + (request->m_type == RubyRequestType_ATOMIC_RETURN) || + (request->m_type == RubyRequestType_ATOMIC_NO_RETURN) || + (request->m_type == RubyRequestType_RMW_Read) || + (request->m_type == RubyRequestType_RMW_Write) || + (request->m_type == RubyRequestType_Load_Linked) || + (request->m_type == RubyRequestType_Store_Conditional) || + (request->m_type == 
RubyRequestType_Locked_RMW_Read) || + (request->m_type == RubyRequestType_Locked_RMW_Write) || + (request->m_type == RubyRequestType_FLUSH)); + + + // + // For Alpha, properly handle LL, SC, and write requests with respect to + // locked cache blocks. + // + // Not valid for Network_test protocl + // + bool success = true; + if(!m_usingNetworkTester) + success = handleLlsc(address, request); + + if (request->m_type == RubyRequestType_Locked_RMW_Read) { + m_controller->blockOnQueue(address, m_mandatory_q_ptr); + } else if (request->m_type == RubyRequestType_Locked_RMW_Write) { + m_controller->unblock(address); + } + + hitCallback(request, mach, data, success, + request->issue_time, forwardRequestTime, firstResponseTime, + isRegion); +} + +void +GPUCoalescer::readCallback(Addr address, DataBlock& data) +{ + readCallback(address, MachineType_NULL, data); +} + +void +GPUCoalescer::readCallback(Addr address, + MachineType mach, + DataBlock& data) +{ + readCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0)); +} + +void +GPUCoalescer::readCallback(Addr address, + MachineType mach, + DataBlock& data, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime) +{ + + readCallback(address, mach, data, + initialRequestTime, forwardRequestTime, firstResponseTime, + false); +} + +void +GPUCoalescer::readCallback(Addr address, + MachineType mach, + DataBlock& data, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime, + bool isRegion) +{ + assert(address == makeLineAddress(address)); + assert(m_readRequestTable.count(makeLineAddress(address))); + + DPRINTF(GPUCoalescer, "read callback for address %#x\n", address); + RequestTable::iterator i = m_readRequestTable.find(address); + assert(i != m_readRequestTable.end()); + GPUCoalescerRequest* request = i->second; + + m_readRequestTable.erase(i); + markRemoved(); + + assert((request->m_type == RubyRequestType_LD) || + (request->m_type == RubyRequestType_IFETCH)); + + hitCallback(request, mach, data, true, + request->issue_time, forwardRequestTime, firstResponseTime, + isRegion); +} + +void +GPUCoalescer::hitCallback(GPUCoalescerRequest* srequest, + MachineType mach, + DataBlock& data, + bool success, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime, + bool isRegion) +{ + PacketPtr pkt = srequest->pkt; + Addr request_address = pkt->getAddr(); + Addr request_line_address = makeLineAddress(request_address); + + RubyRequestType type = srequest->m_type; + + // Set this cache entry to the most recently used + if (type == RubyRequestType_IFETCH) { + if (m_instCache_ptr->isTagPresent(request_line_address)) + m_instCache_ptr->setMRU(request_line_address); + } else { + if (m_dataCache_ptr->isTagPresent(request_line_address)) + m_dataCache_ptr->setMRU(request_line_address); + } + + recordMissLatency(srequest, mach, + initialRequestTime, + forwardRequestTime, + firstResponseTime, + success, isRegion); + // update the data + // + // MUST AD DOING THIS FOR EACH REQUEST IN COALESCER + int len = reqCoalescer[request_line_address].size(); + std::vector<PacketPtr> mylist; + for (int i = 0; i < len; ++i) { + PacketPtr pkt = reqCoalescer[request_line_address][i].first; + assert(type == + reqCoalescer[request_line_address][i].second[PrimaryType]); + request_address = pkt->getAddr(); + request_line_address = makeLineAddress(pkt->getAddr()); + if (pkt->getPtr<uint8_t>()) { + if ((type == RubyRequestType_LD) || + (type == RubyRequestType_ATOMIC) || + (type == 
RubyRequestType_ATOMIC_RETURN) || + (type == RubyRequestType_IFETCH) || + (type == RubyRequestType_RMW_Read) || + (type == RubyRequestType_Locked_RMW_Read) || + (type == RubyRequestType_Load_Linked)) { + memcpy(pkt->getPtr<uint8_t>(), + data.getData(getOffset(request_address), + pkt->getSize()), + pkt->getSize()); + } else { + data.setData(pkt->getPtr<uint8_t>(), + getOffset(request_address), pkt->getSize()); + } + } else { + DPRINTF(MemoryAccess, + "WARNING. Data not transfered from Ruby to M5 for type " \ + "%s\n", + RubyRequestType_to_string(type)); + } + + // If using the RubyTester, update the RubyTester sender state's + // subBlock with the recieved data. The tester will later access + // this state. + // Note: RubyPort will access it's sender state before the + // RubyTester. + if (m_usingRubyTester) { + RubyPort::SenderState *requestSenderState = + safe_cast<RubyPort::SenderState*>(pkt->senderState); + RubyTester::SenderState* testerSenderState = + safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor); + testerSenderState->subBlock.mergeFrom(data); + } + + mylist.push_back(pkt); + } + delete srequest; + reqCoalescer.erase(request_line_address); + assert(!reqCoalescer.count(request_line_address)); + + + + completeHitCallback(mylist, len); +} + +bool +GPUCoalescer::empty() const +{ + return m_writeRequestTable.empty() && m_readRequestTable.empty(); +} + +// Analyzes the packet to see if this request can be coalesced. +// If request can be coalesced, this request is added to the reqCoalescer table +// and makeRequest returns RequestStatus_Issued; +// If this is the first request to a cacheline, request is added to both +// newRequests queue and to the reqCoalescer table; makeRequest +// returns RequestStatus_Issued. +// If there is a pending request to this cacheline and this request +// can't be coalesced, RequestStatus_Aliased is returned and +// the packet needs to be reissued. +RequestStatus +GPUCoalescer::makeRequest(PacketPtr pkt) +{ + // Check for GPU Barrier Kernel End or Kernel Begin + // Leave these to be handled by the child class + // Kernel End/Barrier = isFlush + isRelease + // Kernel Begin = isFlush + isAcquire + if (pkt->req->isKernel()) { + if (pkt->req->isAcquire()){ + // This is a Kernel Begin leave handling to + // virtual xCoalescer::makeRequest + return RequestStatus_Issued; + }else if(pkt->req->isRelease()) { + // This is a Kernel End leave handling to + // virtual xCoalescer::makeRequest + // If we are here then we didn't call + // a virtual version of this function + // so we will also schedule the callback + int wf_id = 0; + if (pkt->req->hasContextId()) { + wf_id = pkt->req->contextId(); + } + insertKernel(wf_id, pkt); + newKernelEnds.push_back(wf_id); + if (!issueEvent.scheduled()) { + schedule(issueEvent, curTick()); + } + return RequestStatus_Issued; + } + } + + // If number of outstanding requests greater than the max allowed, + // return RequestStatus_BufferFull. This logic can be extended to + // support proper backpressure. + if (m_outstanding_count >= m_max_outstanding_requests) { + return RequestStatus_BufferFull; + } + + RubyRequestType primary_type = RubyRequestType_NULL; + RubyRequestType secondary_type = RubyRequestType_NULL; + + if (pkt->isLLSC()) { + // + // Alpha LL/SC instructions need to be handled carefully by the cache + // coherence protocol to ensure they follow the proper semantics. 
In + // particular, by identifying the operations as atomic, the protocol + // should understand that migratory sharing optimizations should not + // be performed (i.e. a load between the LL and SC should not steal + // away exclusive permission). + // + if (pkt->isWrite()) { + primary_type = RubyRequestType_Store_Conditional; + } else { + assert(pkt->isRead()); + primary_type = RubyRequestType_Load_Linked; + } + secondary_type = RubyRequestType_ATOMIC; + } else if (pkt->req->isLockedRMW()) { + // + // x86 locked instructions are translated to store cache coherence + // requests because these requests should always be treated as read + // exclusive operations and should leverage any migratory sharing + // optimization built into the protocol. + // + if (pkt->isWrite()) { + primary_type = RubyRequestType_Locked_RMW_Write; + } else { + assert(pkt->isRead()); + primary_type = RubyRequestType_Locked_RMW_Read; + } + secondary_type = RubyRequestType_ST; + } else if (pkt->isAtomicOp()) { + // + // GPU Atomic Operation + // + primary_type = RubyRequestType_ATOMIC; + secondary_type = RubyRequestType_ATOMIC; + } else { + if (pkt->isRead()) { + if (pkt->req->isInstFetch()) { + primary_type = secondary_type = RubyRequestType_IFETCH; + } else { +#if THE_ISA == X86_ISA + uint32_t flags = pkt->req->getFlags(); + bool storeCheck = flags & + (TheISA::StoreCheck << TheISA::FlagShift); +#else + bool storeCheck = false; +#endif // X86_ISA + if (storeCheck) { + primary_type = RubyRequestType_RMW_Read; + secondary_type = RubyRequestType_ST; + } else { + primary_type = secondary_type = RubyRequestType_LD; + } + } + } else if (pkt->isWrite()) { + // + // Note: M5 packets do not differentiate ST from RMW_Write + // + primary_type = secondary_type = RubyRequestType_ST; + } else if (pkt->isFlush()) { + primary_type = secondary_type = RubyRequestType_FLUSH; + } else if (pkt->req->isRelease() || pkt->req->isAcquire()) { + if (assumingRfOCoherence) { + // If we reached here, this request must be a memFence + // and the protocol implements RfO, the coalescer can + // assume sequentially consistency and schedule the callback + // immediately. + // Currently the code implements fence callbacks + // by reusing the mechanism for kernel completions. + // This should be fixed. + int wf_id = 0; + if (pkt->req->hasContextId()) { + wf_id = pkt->req->contextId(); + } + insertKernel(wf_id, pkt); + newKernelEnds.push_back(wf_id); + if (!issueEvent.scheduled()) { + schedule(issueEvent, curTick()); + } + return RequestStatus_Issued; + } else { + // If not RfO, return issued here and let the child coalescer + // take care of it. + return RequestStatus_Issued; + } + } else { + panic("Unsupported ruby packet type\n"); + } + } + + // Check if there is any pending request to this cache line from + // previous cycles. + // If there is a pending request, return aliased. Since coalescing + // across time is not permitted, aliased requests are not coalesced. + // If a request for this address has already been issued, we must block + RequestStatus status = getRequestStatus(pkt, primary_type); + if (status != RequestStatus_Ready) + return status; + + Addr line_addr = makeLineAddress(pkt->getAddr()); + + // Check if this request can be coalesced with previous + // requests from this cycle. + if (!reqCoalescer.count(line_addr)) { + // This is the first access to this cache line. 
+ // A new request to the memory subsystem has to be + // made in the next cycle for this cache line, so + // add this line addr to the "newRequests" queue + newRequests.push_back(line_addr); + + // There was a request to this cache line in this cycle, + // let us see if we can coalesce this request with the previous + // requests from this cycle + } else if (primary_type != + reqCoalescer[line_addr][0].second[PrimaryType]) { + // can't coalesce loads, stores and atomics! + return RequestStatus_Aliased; + } else if (pkt->req->isLockedRMW() || + reqCoalescer[line_addr][0].first->req->isLockedRMW()) { + // can't coalesce locked accesses, but can coalesce atomics! + return RequestStatus_Aliased; + } else if (pkt->req->hasContextId() && pkt->req->isRelease() && + pkt->req->contextId() != + reqCoalescer[line_addr][0].first->req->contextId()) { + // can't coalesce releases from different wavefronts + return RequestStatus_Aliased; + } + + // in addition to the packet, we need to save both request types + reqCoalescer[line_addr].push_back( + RequestDesc(pkt, std::vector<RubyRequestType>()) ); + reqCoalescer[line_addr].back().second.push_back(primary_type); + reqCoalescer[line_addr].back().second.push_back(secondary_type); + if (!issueEvent.scheduled()) + schedule(issueEvent, curTick()); + // TODO: issue hardware prefetches here + return RequestStatus_Issued; +} + +void +GPUCoalescer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type) +{ + + int proc_id = -1; + if (pkt != NULL && pkt->req->hasContextId()) { + proc_id = pkt->req->contextId(); + } + + // If valid, copy the pc to the ruby request + Addr pc = 0; + if (pkt->req->hasPC()) { + pc = pkt->req->getPC(); + } + + // At the moment setting scopes only counts + // for GPU spill space accesses + // which is pkt->req->isStack() + // this scope is REPLACE since it + // does not need to be flushed at the end + // of a kernel Private and local may need + // to be visible at the end of the kernel + HSASegment accessSegment = reqSegmentToHSASegment(pkt->req); + HSAScope accessScope = reqScopeToHSAScope(pkt->req); + + Addr line_addr = makeLineAddress(pkt->getAddr()); + + // Creating WriteMask that records written bytes + // and atomic operations. 
This enables partial writes + // and partial reads of those writes + DataBlock dataBlock; + dataBlock.clear(); + uint32_t blockSize = RubySystem::getBlockSizeBytes(); + std::vector<bool> accessMask(blockSize,false); + std::vector< std::pair<int,AtomicOpFunctor*> > atomicOps; + uint32_t tableSize = reqCoalescer[line_addr].size(); + for (int i = 0; i < tableSize; i++) { + PacketPtr tmpPkt = reqCoalescer[line_addr][i].first; + uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr; + uint32_t tmpSize = tmpPkt->getSize(); + if (tmpPkt->isAtomicOp()) { + std::pair<int,AtomicOpFunctor *> tmpAtomicOp(tmpOffset, + tmpPkt->getAtomicOp()); + atomicOps.push_back(tmpAtomicOp); + } else if(tmpPkt->isWrite()) { + dataBlock.setData(tmpPkt->getPtr<uint8_t>(), + tmpOffset, tmpSize); + } + for (int j = 0; j < tmpSize; j++) { + accessMask[tmpOffset + j] = true; + } + } + std::shared_ptr<RubyRequest> msg; + if (pkt->isAtomicOp()) { + msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(), + pkt->getPtr<uint8_t>(), + pkt->getSize(), pc, secondary_type, + RubyAccessMode_Supervisor, pkt, + PrefetchBit_No, proc_id, 100, + blockSize, accessMask, + dataBlock, atomicOps, + accessScope, accessSegment); + } else { + msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(), + pkt->getPtr<uint8_t>(), + pkt->getSize(), pc, secondary_type, + RubyAccessMode_Supervisor, pkt, + PrefetchBit_No, proc_id, 100, + blockSize, accessMask, + dataBlock, + accessScope, accessSegment); + } + DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n", + curTick(), m_version, "Coal", "Begin", "", "", + printAddress(msg->getPhysicalAddress()), + RubyRequestType_to_string(secondary_type)); + + fatal_if(secondary_type == RubyRequestType_IFETCH, + "there should not be any I-Fetch requests in the GPU Coalescer"); + + // Send the message to the cache controller + fatal_if(m_data_cache_hit_latency == 0, + "should not have a latency of zero"); + + assert(m_mandatory_q_ptr); + m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency); +} + +template <class KEY, class VALUE> +std::ostream & +operator<<(ostream &out, const std::unordered_map<KEY, VALUE> &map) +{ + out << "["; + for (auto i = map.begin(); i != map.end(); ++i) + out << " " << i->first << "=" << i->second; + out << " ]"; + + return out; +} + +void +GPUCoalescer::print(ostream& out) const +{ + out << "[GPUCoalescer: " << m_version + << ", outstanding requests: " << m_outstanding_count + << ", read request table: " << m_readRequestTable + << ", write request table: " << m_writeRequestTable + << "]"; +} + +// this can be called from setState whenever coherence permissions are +// upgraded when invoked, coherence violations will be checked for the +// given block +void +GPUCoalescer::checkCoherence(Addr addr) +{ +#ifdef CHECK_COHERENCE + m_ruby_system->checkGlobalCoherenceInvariant(addr); +#endif +} + +void +GPUCoalescer::recordRequestType(SequencerRequestType requestType) { + DPRINTF(RubyStats, "Recorded statistic: %s\n", + SequencerRequestType_to_string(requestType)); +} + +GPUCoalescer::IssueEvent::IssueEvent(GPUCoalescer* _seq) + : Event(Progress_Event_Pri), seq(_seq) +{ +} + + +void +GPUCoalescer::completeIssue() +{ + // newRequests has the cacheline addresses of all the + // requests which need to be issued to the memory subsystem + // in this cycle + int len = newRequests.size(); + DPRINTF(GPUCoalescer, "Completing issue for %d new requests.\n", len); + for (int i = 0; i < len; ++i) { + // Get the requests from reqCoalescer table. 
Get only the + // first request for each cacheline, the remaining requests + // can be coalesced with the first request. So, only + // one request is issued per cacheline. + RequestDesc info = reqCoalescer[newRequests[i]][0]; + PacketPtr pkt = info.first; + DPRINTF(GPUCoalescer, "Completing for newReq %d: paddr %#x\n", + i, pkt->req->getPaddr()); + // Insert this request to the read/writeRequestTables. These tables + // are used to track aliased requests in makeRequest subroutine + bool found = insertRequest(pkt, info.second[PrimaryType]); + + if (found) { + panic("GPUCoalescer::makeRequest should never be called if the " + "request is already outstanding\n"); + } + + // Issue request to ruby subsystem + issueRequest(pkt, info.second[SecondaryType]); + } + newRequests.clear(); + + // have Kernel End releases been issued this cycle + len = newKernelEnds.size(); + for (int i = 0; i < len; i++) { + kernelCallback(newKernelEnds[i]); + } + newKernelEnds.clear(); +} + +void +GPUCoalescer::IssueEvent::process() +{ + seq->completeIssue(); +} + +const char * +GPUCoalescer::IssueEvent::description() const +{ + return "Issue coalesced request"; +} + +void +GPUCoalescer::evictionCallback(Addr address) +{ + ruby_eviction_callback(address); +} + +void +GPUCoalescer::kernelCallback(int wavefront_id) +{ + assert(kernelEndList.count(wavefront_id)); + + ruby_hit_callback(kernelEndList[wavefront_id]); + + kernelEndList.erase(wavefront_id); +} + +void +GPUCoalescer::atomicCallback(Addr address, + MachineType mach, + const DataBlock& data) +{ + assert(address == makeLineAddress(address)); + + DPRINTF(GPUCoalescer, "atomic callback for address %#x\n", address); + assert(m_writeRequestTable.count(makeLineAddress(address))); + + RequestTable::iterator i = m_writeRequestTable.find(address); + assert(i != m_writeRequestTable.end()); + GPUCoalescerRequest* srequest = i->second; + + m_writeRequestTable.erase(i); + markRemoved(); + + assert((srequest->m_type == RubyRequestType_ATOMIC) || + (srequest->m_type == RubyRequestType_ATOMIC_RETURN) || + (srequest->m_type == RubyRequestType_ATOMIC_NO_RETURN)); + + + // Atomics don't write to cache, so there is no MRU update... + + recordMissLatency(srequest, mach, + srequest->issue_time, Cycles(0), Cycles(0), true, false); + + PacketPtr pkt = srequest->pkt; + Addr request_address = pkt->getAddr(); + Addr request_line_address = makeLineAddress(pkt->getAddr()); + + int len = reqCoalescer[request_line_address].size(); + std::vector<PacketPtr> mylist; + for (int i = 0; i < len; ++i) { + PacketPtr pkt = reqCoalescer[request_line_address][i].first; + assert(srequest->m_type == + reqCoalescer[request_line_address][i].second[PrimaryType]); + request_address = (pkt->getAddr()); + request_line_address = makeLineAddress(request_address); + if (pkt->getPtr<uint8_t>() && + srequest->m_type != RubyRequestType_ATOMIC_NO_RETURN) { + /* atomics are done in memory, and return the data *before* the atomic op... */ + memcpy(pkt->getPtr<uint8_t>(), + data.getData(getOffset(request_address), + pkt->getSize()), + pkt->getSize()); + } else { + DPRINTF(MemoryAccess, + "WARNING. Data not transfered from Ruby to M5 for type " \ + "%s\n", + RubyRequestType_to_string(srequest->m_type)); + } + + // If using the RubyTester, update the RubyTester sender state's + // subBlock with the recieved data. The tester will later access + // this state. + // Note: RubyPort will access it's sender state before the + // RubyTester. 
+ if (m_usingRubyTester) { + RubyPort::SenderState *requestSenderState = + safe_cast<RubyPort::SenderState*>(pkt->senderState); + RubyTester::SenderState* testerSenderState = + safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor); + testerSenderState->subBlock.mergeFrom(data); + } + + mylist.push_back(pkt); + } + delete srequest; + reqCoalescer.erase(request_line_address); + assert(!reqCoalescer.count(request_line_address)); + + completeHitCallback(mylist, len); +} + +void +GPUCoalescer::recordCPReadCallBack(MachineID myMachID, MachineID senderMachID) +{ + if(myMachID == senderMachID) { + CP_TCPLdHits++; + } else if(machineIDToMachineType(senderMachID) == MachineType_TCP) { + CP_TCPLdTransfers++; + } else if(machineIDToMachineType(senderMachID) == MachineType_TCC) { + CP_TCCLdHits++; + } else { + CP_LdMiss++; + } +} + +void +GPUCoalescer::recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID) +{ + if(myMachID == senderMachID) { + CP_TCPStHits++; + } else if(machineIDToMachineType(senderMachID) == MachineType_TCP) { + CP_TCPStTransfers++; + } else if(machineIDToMachineType(senderMachID) == MachineType_TCC) { + CP_TCCStHits++; + } else { + CP_StMiss++; + } +} + +void +GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist, int len) +{ + for (int i = 0; i < len; ++i) { + RubyPort::SenderState *ss = + safe_cast<RubyPort::SenderState *>(mylist[i]->senderState); + MemSlavePort *port = ss->port; + assert(port != NULL); + + mylist[i]->senderState = ss->predecessor; + delete ss; + port->hitCallback(mylist[i]); + trySendRetries(); + } + + testDrainComplete(); +} + +PacketPtr +GPUCoalescer::mapAddrToPkt(Addr address) +{ + RequestTable::iterator i = m_readRequestTable.find(address); + assert(i != m_readRequestTable.end()); + GPUCoalescerRequest* request = i->second; + return request->pkt; +} + +void +GPUCoalescer::recordMissLatency(GPUCoalescerRequest* srequest, + MachineType mach, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime, + bool success, bool isRegion) +{ + RubyRequestType type = srequest->m_type; + Cycles issued_time = srequest->issue_time; + Cycles completion_time = curCycle(); + assert(completion_time >= issued_time); + Cycles total_lat = completion_time - issued_time; + + // cache stats (valid for RfO protocol only) + if (mach == MachineType_TCP) { + if (type == RubyRequestType_LD) { + GPU_TCPLdHits++; + } else { + GPU_TCPStHits++; + } + } else if (mach == MachineType_L1Cache_wCC) { + if (type == RubyRequestType_LD) { + GPU_TCPLdTransfers++; + } else { + GPU_TCPStTransfers++; + } + } else if (mach == MachineType_TCC) { + if (type == RubyRequestType_LD) { + GPU_TCCLdHits++; + } else { + GPU_TCCStHits++; + } + } else { + if (type == RubyRequestType_LD) { + GPU_LdMiss++; + } else { + GPU_StMiss++; + } + } + + // Profile all access latency, even zero latency accesses + m_latencyHist.sample(total_lat); + m_typeLatencyHist[type]->sample(total_lat); + + // Profile the miss latency for all non-zero demand misses + if (total_lat != Cycles(0)) { + m_missLatencyHist.sample(total_lat); + m_missTypeLatencyHist[type]->sample(total_lat); + + if (mach != MachineType_NUM) { + m_missMachLatencyHist[mach]->sample(total_lat); + m_missTypeMachLatencyHist[type][mach]->sample(total_lat); + + if ((issued_time <= initialRequestTime) && + (initialRequestTime <= forwardRequestTime) && + (forwardRequestTime <= firstResponseTime) && + (firstResponseTime <= completion_time)) { + + m_IssueToInitialDelayHist[mach]->sample( + initialRequestTime - 
issued_time); + m_InitialToForwardDelayHist[mach]->sample( + forwardRequestTime - initialRequestTime); + m_ForwardToFirstResponseDelayHist[mach]->sample( + firstResponseTime - forwardRequestTime); + m_FirstResponseToCompletionDelayHist[mach]->sample( + completion_time - firstResponseTime); + } + } + + } + + DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %d cycles\n", + curTick(), m_version, "Coal", + success ? "Done" : "SC_Failed", "", "", + printAddress(srequest->pkt->getAddr()), total_lat); +} + +void +GPUCoalescer::regStats() +{ + // These statistical variables are not for display. + // The profiler will collate these across different + // coalescers and display those collated statistics. + m_outstandReqHist.init(10); + m_latencyHist.init(10); + m_missLatencyHist.init(10); + + for (int i = 0; i < RubyRequestType_NUM; i++) { + m_typeLatencyHist.push_back(new Stats::Histogram()); + m_typeLatencyHist[i]->init(10); + + m_missTypeLatencyHist.push_back(new Stats::Histogram()); + m_missTypeLatencyHist[i]->init(10); + } + + for (int i = 0; i < MachineType_NUM; i++) { + m_missMachLatencyHist.push_back(new Stats::Histogram()); + m_missMachLatencyHist[i]->init(10); + + m_IssueToInitialDelayHist.push_back(new Stats::Histogram()); + m_IssueToInitialDelayHist[i]->init(10); + + m_InitialToForwardDelayHist.push_back(new Stats::Histogram()); + m_InitialToForwardDelayHist[i]->init(10); + + m_ForwardToFirstResponseDelayHist.push_back(new Stats::Histogram()); + m_ForwardToFirstResponseDelayHist[i]->init(10); + + m_FirstResponseToCompletionDelayHist.push_back(new Stats::Histogram()); + m_FirstResponseToCompletionDelayHist[i]->init(10); + } + + for (int i = 0; i < RubyRequestType_NUM; i++) { + m_missTypeMachLatencyHist.push_back(std::vector<Stats::Histogram *>()); + + for (int j = 0; j < MachineType_NUM; j++) { + m_missTypeMachLatencyHist[i].push_back(new Stats::Histogram()); + m_missTypeMachLatencyHist[i][j]->init(10); + } + } + + // GPU cache stats + GPU_TCPLdHits + .name(name() + ".gpu_tcp_ld_hits") + .desc("loads that hit in the TCP") + ; + GPU_TCPLdTransfers + .name(name() + ".gpu_tcp_ld_transfers") + .desc("TCP to TCP load transfers") + ; + GPU_TCCLdHits + .name(name() + ".gpu_tcc_ld_hits") + .desc("loads that hit in the TCC") + ; + GPU_LdMiss + .name(name() + ".gpu_ld_misses") + .desc("loads that miss in the GPU") + ; + + GPU_TCPStHits + .name(name() + ".gpu_tcp_st_hits") + .desc("stores that hit in the TCP") + ; + GPU_TCPStTransfers + .name(name() + ".gpu_tcp_st_transfers") + .desc("TCP to TCP store transfers") + ; + GPU_TCCStHits + .name(name() + ".gpu_tcc_st_hits") + .desc("stores that hit in the TCC") + ; + GPU_StMiss + .name(name() + ".gpu_st_misses") + .desc("stores that miss in the GPU") + ; + + // CP cache stats + CP_TCPLdHits + .name(name() + ".cp_tcp_ld_hits") + .desc("loads that hit in the TCP") + ; + CP_TCPLdTransfers + .name(name() + ".cp_tcp_ld_transfers") + .desc("TCP to TCP load transfers") + ; + CP_TCCLdHits + .name(name() + ".cp_tcc_ld_hits") + .desc("loads that hit in the TCC") + ; + CP_LdMiss + .name(name() + ".cp_ld_misses") + .desc("loads that miss in the GPU") + ; + + CP_TCPStHits + .name(name() + ".cp_tcp_st_hits") + .desc("stores that hit in the TCP") + ; + CP_TCPStTransfers + .name(name() + ".cp_tcp_st_transfers") + .desc("TCP to TCP store transfers") + ; + CP_TCCStHits + .name(name() + ".cp_tcc_st_hits") + .desc("stores that hit in the TCC") + ; + CP_StMiss + .name(name() + ".cp_st_misses") + .desc("stores that miss in the GPU") + ; +} diff --git 
a/src/mem/ruby/system/GPUCoalescer.hh b/src/mem/ruby/system/GPUCoalescer.hh new file mode 100644 index 000000000..dbd47059c --- /dev/null +++ b/src/mem/ruby/system/GPUCoalescer.hh @@ -0,0 +1,368 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Sooraj Puthoor + */ + +#ifndef __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__ +#define __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__ + +#include <iostream> +#include <unordered_map> + +#include "base/statistics.hh" +#include "mem/protocol/HSAScope.hh" +#include "mem/protocol/HSASegment.hh" +#include "mem/protocol/PrefetchBit.hh" +#include "mem/protocol/RubyAccessMode.hh" +#include "mem/protocol/RubyRequestType.hh" +#include "mem/protocol/SequencerRequestType.hh" +#include "mem/request.hh" +#include "mem/ruby/common/Address.hh" +#include "mem/ruby/common/Consumer.hh" +#include "mem/ruby/system/RubyPort.hh" + +class DataBlock; +class CacheMsg; +class MachineID; +class CacheMemory; + +class RubyGPUCoalescerParams; + +HSAScope reqScopeToHSAScope(Request* req); +HSASegment reqSegmentToHSASegment(Request* req); + +struct GPUCoalescerRequest +{ + PacketPtr pkt; + RubyRequestType m_type; + Cycles issue_time; + + GPUCoalescerRequest(PacketPtr _pkt, RubyRequestType _m_type, + Cycles _issue_time) + : pkt(_pkt), m_type(_m_type), issue_time(_issue_time) + {} +}; + +std::ostream& operator<<(std::ostream& out, const GPUCoalescerRequest& obj); + +class GPUCoalescer : public RubyPort +{ + public: + typedef RubyGPUCoalescerParams Params; + GPUCoalescer(const Params *); + ~GPUCoalescer(); + + // Public Methods + void wakeup(); // Used only for deadlock detection + + void printProgress(std::ostream& out) const; + void resetStats(); + void collateStats(); + void regStats(); + + void writeCallback(Addr address, DataBlock& data); + + void writeCallback(Addr address, + MachineType mach, + DataBlock& data); + + void writeCallback(Addr address, + MachineType mach, + DataBlock& data, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime, + bool isRegion); + + void writeCallback(Addr address, + MachineType mach, + DataBlock& data, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime); + + void readCallback(Addr address, DataBlock& data); + + void readCallback(Addr address, + MachineType mach, + DataBlock& data); + + void readCallback(Addr address, + MachineType mach, + DataBlock& data, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime); + + void readCallback(Addr address, + MachineType mach, + DataBlock& data, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime, + bool isRegion); + /* atomics need their own callback because the data + might be const coming from SLICC */ + void atomicCallback(Addr address, + MachineType mach, + const DataBlock& data); + + void recordCPReadCallBack(MachineID myMachID, MachineID senderMachID); + void recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID); + + // Alternate implementations in VIPER Coalescer + virtual RequestStatus makeRequest(PacketPtr pkt); + + int outstandingCount() const { return m_outstanding_count; } + + bool + isDeadlockEventScheduled() const + { + return deadlockCheckEvent.scheduled(); + } + + void + descheduleDeadlockEvent() + { + deschedule(deadlockCheckEvent); + } + + bool empty() const; + + void print(std::ostream& out) const; + void checkCoherence(Addr address); + + void markRemoved(); + void removeRequest(GPUCoalescerRequest* request); + void evictionCallback(Addr address); + void completeIssue(); + + void insertKernel(int wavefront_id, PacketPtr pkt); + + void recordRequestType(SequencerRequestType requestType); + Stats::Histogram& getOutstandReqHist() { return m_outstandReqHist; } + + Stats::Histogram& getLatencyHist() 
{ return m_latencyHist; } + Stats::Histogram& getTypeLatencyHist(uint32_t t) + { return *m_typeLatencyHist[t]; } + + Stats::Histogram& getMissLatencyHist() + { return m_missLatencyHist; } + Stats::Histogram& getMissTypeLatencyHist(uint32_t t) + { return *m_missTypeLatencyHist[t]; } + + Stats::Histogram& getMissMachLatencyHist(uint32_t t) const + { return *m_missMachLatencyHist[t]; } + + Stats::Histogram& + getMissTypeMachLatencyHist(uint32_t r, uint32_t t) const + { return *m_missTypeMachLatencyHist[r][t]; } + + Stats::Histogram& getIssueToInitialDelayHist(uint32_t t) const + { return *m_IssueToInitialDelayHist[t]; } + + Stats::Histogram& + getInitialToForwardDelayHist(const MachineType t) const + { return *m_InitialToForwardDelayHist[t]; } + + Stats::Histogram& + getForwardRequestToFirstResponseHist(const MachineType t) const + { return *m_ForwardToFirstResponseDelayHist[t]; } + + Stats::Histogram& + getFirstResponseToCompletionDelayHist(const MachineType t) const + { return *m_FirstResponseToCompletionDelayHist[t]; } + + // Changed to protected to enable inheritance by VIPER Coalescer + protected: + bool tryCacheAccess(Addr addr, RubyRequestType type, + Addr pc, RubyAccessMode access_mode, + int size, DataBlock*& data_ptr); + // Alternate implementations in VIPER Coalescer + virtual void issueRequest(PacketPtr pkt, RubyRequestType type); + + void kernelCallback(int wavfront_id); + + void hitCallback(GPUCoalescerRequest* request, + MachineType mach, + DataBlock& data, + bool success, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime, + bool isRegion); + void recordMissLatency(GPUCoalescerRequest* request, + MachineType mach, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime, + bool success, bool isRegion); + void completeHitCallback(std::vector<PacketPtr> & mylist, int len); + PacketPtr mapAddrToPkt(Addr address); + + + RequestStatus getRequestStatus(PacketPtr pkt, + RubyRequestType request_type); + bool insertRequest(PacketPtr pkt, RubyRequestType request_type); + + bool handleLlsc(Addr address, GPUCoalescerRequest* request); + + // Private copy constructor and assignment operator + GPUCoalescer(const GPUCoalescer& obj); + GPUCoalescer& operator=(const GPUCoalescer& obj); + + class IssueEvent : public Event + { + private: + GPUCoalescer *seq; + public: + IssueEvent(GPUCoalescer *_seq); + void process(); + const char *description() const; + }; + + IssueEvent issueEvent; + + + // Changed to protected to enable inheritance by VIPER Coalescer + protected: + int m_max_outstanding_requests; + int m_deadlock_threshold; + + CacheMemory* m_dataCache_ptr; + CacheMemory* m_instCache_ptr; + + // The cache access latency for this GPU data cache. This is assessed at the + // beginning of each access. This should be very similar to the + // implementation in Sequencer() as this is very much like a Sequencer + Cycles m_data_cache_hit_latency; + + // We need to track both the primary and secondary request types. + // The secondary request type comprises a subset of RubyRequestTypes that + // are understood by the L1 Controller. A primary request type can be any + // RubyRequestType. 
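+ // For illustration (hypothetical packets pkt0/pkt1): a cache line with
+ // two coalesced stores would be tracked roughly as
+ //     reqCoalescer[line_addr] = {
+ //         { pkt0, { RubyRequestType_ST, RubyRequestType_ST } },  // {Primary, Secondary}
+ //         { pkt1, { RubyRequestType_ST, RubyRequestType_ST } }
+ //     };
+ // using the RequestDesc and CoalescingTable typedefs declared just below.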
+ enum {PrimaryType, SecondaryType}; + typedef std::pair<PacketPtr, std::vector<RubyRequestType> > RequestDesc; + typedef std::unordered_map<Addr, std::vector<RequestDesc> > CoalescingTable; + CoalescingTable reqCoalescer; + std::vector<Addr> newRequests; + + typedef std::unordered_map<Addr, GPUCoalescerRequest*> RequestTable; + RequestTable m_writeRequestTable; + RequestTable m_readRequestTable; + // Global outstanding request count, across all request tables + int m_outstanding_count; + bool m_deadlock_check_scheduled; + std::unordered_map<int, PacketPtr> kernelEndList; + std::vector<int> newKernelEnds; + + int m_store_waiting_on_load_cycles; + int m_store_waiting_on_store_cycles; + int m_load_waiting_on_store_cycles; + int m_load_waiting_on_load_cycles; + + bool m_usingNetworkTester; + + class GPUCoalescerWakeupEvent : public Event + { + private: + GPUCoalescer *m_GPUCoalescer_ptr; + + public: + GPUCoalescerWakeupEvent(GPUCoalescer *_seq) : + m_GPUCoalescer_ptr(_seq) {} + void process() { m_GPUCoalescer_ptr->wakeup(); } + const char *description() const + { + return "GPUCoalescer deadlock check"; + } + }; + + GPUCoalescerWakeupEvent deadlockCheckEvent; + bool assumingRfOCoherence; + + // m5 style stats for TCP hit/miss counts + Stats::Scalar GPU_TCPLdHits; + Stats::Scalar GPU_TCPLdTransfers; + Stats::Scalar GPU_TCCLdHits; + Stats::Scalar GPU_LdMiss; + + Stats::Scalar GPU_TCPStHits; + Stats::Scalar GPU_TCPStTransfers; + Stats::Scalar GPU_TCCStHits; + Stats::Scalar GPU_StMiss; + + Stats::Scalar CP_TCPLdHits; + Stats::Scalar CP_TCPLdTransfers; + Stats::Scalar CP_TCCLdHits; + Stats::Scalar CP_LdMiss; + + Stats::Scalar CP_TCPStHits; + Stats::Scalar CP_TCPStTransfers; + Stats::Scalar CP_TCCStHits; + Stats::Scalar CP_StMiss; + + //! Histogram for number of outstanding requests per cycle. + Stats::Histogram m_outstandReqHist; + + //! Histogram for holding latency profile of all requests. + Stats::Histogram m_latencyHist; + std::vector<Stats::Histogram *> m_typeLatencyHist; + + //! Histogram for holding latency profile of all requests that + //! miss in the controller connected to this sequencer. + Stats::Histogram m_missLatencyHist; + std::vector<Stats::Histogram *> m_missTypeLatencyHist; + + //! Histograms for profiling the latencies for requests that + //! required external messages. + std::vector<Stats::Histogram *> m_missMachLatencyHist; + std::vector< std::vector<Stats::Histogram *> > m_missTypeMachLatencyHist; + + //! Histograms for recording the breakdown of miss latency + std::vector<Stats::Histogram *> m_IssueToInitialDelayHist; + std::vector<Stats::Histogram *> m_InitialToForwardDelayHist; + std::vector<Stats::Histogram *> m_ForwardToFirstResponseDelayHist; + std::vector<Stats::Histogram *> m_FirstResponseToCompletionDelayHist; +}; + +inline std::ostream& +operator<<(std::ostream& out, const GPUCoalescer& obj) +{ + obj.print(out); + out << std::flush; + return out; +} + +#endif // __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__ + diff --git a/src/mem/ruby/system/GPUCoalescer.py b/src/mem/ruby/system/GPUCoalescer.py new file mode 100644 index 000000000..0c19f875d --- /dev/null +++ b/src/mem/ruby/system/GPUCoalescer.py @@ -0,0 +1,48 @@ +# Copyright (c) 2015 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. 
Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +# Authors: Steve Reinhardt +# Brad Beckmann + +from m5.params import * +from m5.proxy import * +from Sequencer import * + +class RubyGPUCoalescer(RubySequencer): + type = 'RubyGPUCoalescer' + cxx_class = 'GPUCoalescer' + cxx_header = "mem/ruby/system/GPUCoalescer.hh" + + # max_outstanding_requests = (wave front slots) x (wave front size) + max_outstanding_requests = Param.Int(40*64, + "max requests (incl. prefetches) outstanding") + assume_rfo = Param.Bool(True, "assume protocol implementes Read for " + "Ownership coherence"); diff --git a/src/mem/ruby/system/RubyPort.cc b/src/mem/ruby/system/RubyPort.cc index 5a5f528bb..bf4002126 100644 --- a/src/mem/ruby/system/RubyPort.cc +++ b/src/mem/ruby/system/RubyPort.cc @@ -60,7 +60,8 @@ RubyPort::RubyPort(const Params *p) memSlavePort(csprintf("%s-mem-slave-port", name()), this, p->ruby_system->getAccessBackingStore(), -1, p->no_retry_on_stall), - gotAddrRanges(p->port_master_connection_count) + gotAddrRanges(p->port_master_connection_count), + m_isCPUSequencer(p->is_cpu_sequencer) { assert(m_version != -1); diff --git a/src/mem/ruby/system/RubyPort.hh b/src/mem/ruby/system/RubyPort.hh index 07e0fde5a..6bd92b654 100644 --- a/src/mem/ruby/system/RubyPort.hh +++ b/src/mem/ruby/system/RubyPort.hh @@ -167,6 +167,8 @@ class RubyPort : public MemObject uint32_t getId() { return m_version; } DrainState drain() override; + bool isCPUSequencer() { return m_isCPUSequencer; } + protected: void trySendRetries(); void ruby_hit_callback(PacketPtr pkt); @@ -218,6 +220,8 @@ class RubyPort : public MemObject // that should be called when the Sequencer becomes available after a stall. 
// std::vector<MemSlavePort *> retryList; + + bool m_isCPUSequencer; }; #endif // __MEM_RUBY_SYSTEM_RUBYPORT_HH__ diff --git a/src/mem/ruby/system/RubySystem.cc b/src/mem/ruby/system/RubySystem.cc index 1ecd2e098..e1717e519 100644 --- a/src/mem/ruby/system/RubySystem.cc +++ b/src/mem/ruby/system/RubySystem.cc @@ -107,7 +107,7 @@ RubySystem::makeCacheRecorder(uint8_t *uncompressed_trace, Sequencer* sequencer_ptr = NULL; for (int cntrl = 0; cntrl < m_abs_cntrl_vec.size(); cntrl++) { - sequencer_map.push_back(m_abs_cntrl_vec[cntrl]->getSequencer()); + sequencer_map.push_back(m_abs_cntrl_vec[cntrl]->getCPUSequencer()); if (sequencer_ptr == NULL) { sequencer_ptr = sequencer_map[cntrl]; } diff --git a/src/mem/ruby/system/SConscript b/src/mem/ruby/system/SConscript index 8c5077362..b67311bca 100644 --- a/src/mem/ruby/system/SConscript +++ b/src/mem/ruby/system/SConscript @@ -33,12 +33,22 @@ Import('*') if env['PROTOCOL'] == 'None': Return() +if env['BUILD_GPU']: + SimObject('GPUCoalescer.py') SimObject('RubySystem.py') SimObject('Sequencer.py') +SimObject('WeightedLRUReplacementPolicy.py') +if env['BUILD_GPU']: + SimObject('VIPERCoalescer.py') Source('CacheRecorder.cc') Source('DMASequencer.cc') +if env['BUILD_GPU']: + Source('GPUCoalescer.cc') Source('RubyPort.cc') Source('RubyPortProxy.cc') Source('RubySystem.cc') Source('Sequencer.cc') +if env['BUILD_GPU']: + Source('VIPERCoalescer.cc') +Source('WeightedLRUPolicy.cc') diff --git a/src/mem/ruby/system/Sequencer.cc b/src/mem/ruby/system/Sequencer.cc index 50418c700..c2727b41d 100644 --- a/src/mem/ruby/system/Sequencer.cc +++ b/src/mem/ruby/system/Sequencer.cc @@ -63,6 +63,7 @@ Sequencer::Sequencer(const Params *p) m_max_outstanding_requests = p->max_outstanding_requests; m_deadlock_threshold = p->deadlock_threshold; + m_coreId = p->coreid; // for tracking the two CorePair sequencers assert(m_max_outstanding_requests > 0); assert(m_deadlock_threshold > 0); assert(m_instCache_ptr != NULL); @@ -593,6 +594,8 @@ Sequencer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type) ContextID proc_id = pkt->req->hasContextId() ? pkt->req->contextId() : InvalidContextID; + ContextID core_id = coreId(); + // If valid, copy the pc to the ruby request Addr pc = 0; if (pkt->req->hasPC()) { @@ -607,7 +610,7 @@ Sequencer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type) nullptr : pkt->getPtr<uint8_t>(), pkt->getSize(), pc, secondary_type, RubyAccessMode_Supervisor, pkt, - PrefetchBit_No, proc_id); + PrefetchBit_No, proc_id, core_id); DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %#x %s\n", curTick(), m_version, "Seq", "Begin", "", "", diff --git a/src/mem/ruby/system/Sequencer.hh b/src/mem/ruby/system/Sequencer.hh index 47af7ea1e..2a2f49587 100644 --- a/src/mem/ruby/system/Sequencer.hh +++ b/src/mem/ruby/system/Sequencer.hh @@ -99,6 +99,7 @@ class Sequencer : public RubyPort void markRemoved(); void evictionCallback(Addr address); void invalidateSC(Addr address); + int coreId() const { return m_coreId; } void recordRequestType(SequencerRequestType requestType); Stats::Histogram& getOutstandReqHist() { return m_outstandReqHist; } @@ -198,6 +199,8 @@ class Sequencer : public RubyPort Stats::Scalar m_load_waiting_on_store; Stats::Scalar m_load_waiting_on_load; + int m_coreId; + bool m_usingNetworkTester; //! Histogram for number of outstanding requests per cycle. 
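With the RubyPort and RubySystem changes above, the per-controller accessor becomes getCPUSequencer() (see the StateMachine.py hunk further below), and it yields NULL whenever the attached port is not flagged as a CPU sequencer via the new is_cpu_sequencer parameter. A minimal caller-side sketch of that pattern, assuming the accessor is visible on the controller base class; cpu_seqs and the loop itself are illustrative, while Sequencer and m_abs_cntrl_vec are taken from the RubySystem.cc hunk above:

// Collect only the sequencers that actually front CPUs; GPU coalescers and
// other non-CPU ports return NULL from getCPUSequencer() and are skipped.
std::vector<Sequencer *> cpu_seqs;
for (size_t c = 0; c < m_abs_cntrl_vec.size(); ++c) {
    if (Sequencer *seq = m_abs_cntrl_vec[c]->getCPUSequencer()) {
        cpu_seqs.push_back(seq);
    }
}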
diff --git a/src/mem/ruby/system/Sequencer.py b/src/mem/ruby/system/Sequencer.py index 7c90eb29c..d6ee0aa2f 100644 --- a/src/mem/ruby/system/Sequencer.py +++ b/src/mem/ruby/system/Sequencer.py @@ -32,54 +32,58 @@ from m5.proxy import * from MemObject import MemObject class RubyPort(MemObject): - type = 'RubyPort' - abstract = True - cxx_header = "mem/ruby/system/RubyPort.hh" - version = Param.Int(0, "") + type = 'RubyPort' + abstract = True + cxx_header = "mem/ruby/system/RubyPort.hh" + version = Param.Int(0, "") - slave = VectorSlavePort("CPU slave port") - master = VectorMasterPort("CPU master port") - pio_master_port = MasterPort("Ruby mem master port") - mem_master_port = MasterPort("Ruby mem master port") - pio_slave_port = SlavePort("Ruby pio slave port") - mem_slave_port = SlavePort("Ruby memory port") + slave = VectorSlavePort("CPU slave port") + master = VectorMasterPort("CPU master port") + pio_master_port = MasterPort("Ruby mem master port") + mem_master_port = MasterPort("Ruby mem master port") + pio_slave_port = SlavePort("Ruby pio slave port") + mem_slave_port = SlavePort("Ruby memory port") - using_ruby_tester = Param.Bool(False, "") - no_retry_on_stall = Param.Bool(False, "") - ruby_system = Param.RubySystem(Parent.any, "") - system = Param.System(Parent.any, "system object") - support_data_reqs = Param.Bool(True, "data cache requests supported") - support_inst_reqs = Param.Bool(True, "inst cache requests supported") + using_ruby_tester = Param.Bool(False, "") + no_retry_on_stall = Param.Bool(False, "") + ruby_system = Param.RubySystem(Parent.any, "") + system = Param.System(Parent.any, "system object") + support_data_reqs = Param.Bool(True, "data cache requests supported") + support_inst_reqs = Param.Bool(True, "inst cache requests supported") + is_cpu_sequencer = Param.Bool(True, "connected to a cpu") class RubyPortProxy(RubyPort): - type = 'RubyPortProxy' - cxx_header = "mem/ruby/system/RubyPortProxy.hh" + type = 'RubyPortProxy' + cxx_header = "mem/ruby/system/RubyPortProxy.hh" class RubySequencer(RubyPort): - type = 'RubySequencer' - cxx_class = 'Sequencer' - cxx_header = "mem/ruby/system/Sequencer.hh" + type = 'RubySequencer' + cxx_class = 'Sequencer' + cxx_header = "mem/ruby/system/Sequencer.hh" - icache = Param.RubyCache("") - dcache = Param.RubyCache("") - # Cache latencies currently assessed at the beginning of each access - # NOTE: Setting these values to a value greater than one will result in - # O3 CPU pipeline bubbles and negatively impact performance - # TODO: Latencies should be migrated into each top-level cache controller - icache_hit_latency = Param.Cycles(1, "Inst cache hit latency") - dcache_hit_latency = Param.Cycles(1, "Data cache hit latency") - max_outstanding_requests = Param.Int(16, - "max requests (incl. prefetches) outstanding") - deadlock_threshold = Param.Cycles(500000, - "max outstanding cycles for a request before deadlock/livelock declared") - using_network_tester = Param.Bool(False, "") + icache = Param.RubyCache("") + dcache = Param.RubyCache("") + # Cache latencies currently assessed at the beginning of each access + # NOTE: Setting these values to a value greater than one will result in + # O3 CPU pipeline bubbles and negatively impact performance + # TODO: Latencies should be migrated into each top-level cache controller + icache_hit_latency = Param.Cycles(1, "Inst cache hit latency") + dcache_hit_latency = Param.Cycles(1, "Data cache hit latency") + max_outstanding_requests = Param.Int(16, + "max requests (incl. 
prefetches) outstanding") + deadlock_threshold = Param.Cycles(500000, + "max outstanding cycles for a request before deadlock/livelock declared") + using_network_tester = Param.Bool(False, "") + # id used by protocols that support multiple sequencers per controller + # 99 is the dummy default value + coreid = Param.Int(99, "CorePair core id") class DMASequencer(MemObject): - type = 'DMASequencer' - cxx_header = "mem/ruby/system/DMASequencer.hh" + type = 'DMASequencer' + cxx_header = "mem/ruby/system/DMASequencer.hh" - version = Param.Int(0, "") - slave = SlavePort("Device slave port") - using_ruby_tester = Param.Bool(False, "") - ruby_system = Param.RubySystem(Parent.any, "") - system = Param.System(Parent.any, "system object") + version = Param.Int(0, "") + slave = SlavePort("Device slave port") + using_ruby_tester = Param.Bool(False, "") + ruby_system = Param.RubySystem(Parent.any, "") + system = Param.System(Parent.any, "system object") diff --git a/src/mem/ruby/system/VIPERCoalescer.cc b/src/mem/ruby/system/VIPERCoalescer.cc new file mode 100644 index 000000000..ca91f2723 --- /dev/null +++ b/src/mem/ruby/system/VIPERCoalescer.cc @@ -0,0 +1,287 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Sooraj Puthoor */ + +#include "base/misc.hh" +#include "base/str.hh" +#include "config/the_isa.hh" + +#if THE_ISA == X86_ISA +#include "arch/x86/insts/microldstop.hh" + +#endif // X86_ISA +#include "mem/ruby/system/VIPERCoalescer.hh" + +#include "cpu/testers/rubytest/RubyTester.hh" +#include "debug/GPUCoalescer.hh" +#include "debug/MemoryAccess.hh" +#include "mem/packet.hh" +#include "mem/ruby/common/SubBlock.hh" +#include "mem/ruby/network/MessageBuffer.hh" +#include "mem/ruby/profiler/Profiler.hh" +#include "mem/ruby/slicc_interface/AbstractController.hh" +#include "mem/ruby/slicc_interface/RubyRequest.hh" +#include "mem/ruby/structures/CacheMemory.hh" +#include "mem/ruby/system/GPUCoalescer.hh" +#include "mem/ruby/system/RubySystem.hh" +#include "params/VIPERCoalescer.hh" + +using namespace std; + +VIPERCoalescer * +VIPERCoalescerParams::create() +{ + return new VIPERCoalescer(this); +} + +VIPERCoalescer::VIPERCoalescer(const Params *p) + : GPUCoalescer(p) +{ + m_max_wb_per_cycle = p->max_wb_per_cycle; + m_max_inv_per_cycle = p->max_inv_per_cycle; + m_outstanding_inv = 0; + m_outstanding_wb = 0; +} + +VIPERCoalescer::~VIPERCoalescer() +{ +} + +// Analyzes the packet to see if this request can be coalesced. +// If the request can be coalesced, it is added to the reqCoalescer table +// and makeRequest returns RequestStatus_Issued; +// If this is the first request to a cacheline, the request is added to both +// the newRequests queue and the reqCoalescer table; makeRequest +// returns RequestStatus_Issued. +// If there is a pending request to this cacheline and this request +// can't be coalesced, RequestStatus_Aliased is returned and +// the packet needs to be reissued. +RequestStatus +VIPERCoalescer::makeRequest(PacketPtr pkt) +{ + if (m_outstanding_wb | m_outstanding_inv) { + DPRINTF(GPUCoalescer, + "There are %d Writebacks and %d Invalidations\n", + m_outstanding_wb, m_outstanding_inv); + } + // Are we in the middle of a release? + if ((m_outstanding_wb) > 0) { + if (pkt->req->isKernel()) { + // Everything is fine + // Barriers and Kernel Ends can coalesce + // If it is a Kernel Begin, flush the cache + if (pkt->req->isAcquire() && (m_outstanding_inv == 0)) { + invL1(); + } + + if (pkt->req->isRelease()) { + insertKernel(pkt->req->contextId(), pkt); + } + + return RequestStatus_Issued; + } +// return RequestStatus_Aliased; + } else if (pkt->req->isKernel() && pkt->req->isRelease()) { + // Flush Dirty Data on Kernel End + // isKernel + isRelease + insertKernel(pkt->req->contextId(), pkt); + wbL1(); + if (m_outstanding_wb == 0) { + for (auto it = kernelEndList.begin(); it != kernelEndList.end(); it++) { + newKernelEnds.push_back(it->first); + } + completeIssue(); + } + return RequestStatus_Issued; + } + RequestStatus requestStatus = GPUCoalescer::makeRequest(pkt); + if (requestStatus != RequestStatus_Issued) { + // Request not issued + // enqueue Retry + DPRINTF(GPUCoalescer, "Request not issued by GPUCoalescer\n"); + return requestStatus; + } else if (pkt->req->isKernel() && pkt->req->isAcquire()) { + // Invalidate clean Data on Kernel Begin + // isKernel + isAcquire + invL1(); + } else if (pkt->req->isAcquire() && pkt->req->isRelease()) { + // Deschedule the AtomicAcqRel and + // Flush and Invalidate the L1 cache + invwbL1(); + if (m_outstanding_wb > 0 && issueEvent.scheduled()) { + DPRINTF(GPUCoalescer, "issueEvent Descheduled\n"); + deschedule(issueEvent); + } + } else if (pkt->req->isRelease()) { + // Deschedule the StoreRel and + // Flush the L1 cache + wbL1(); + if
(m_outstanding_wb > 0 && issueEvent.scheduled()) { + DPRINTF(GPUCoalescer, "issueEvent Descheduled\n"); + deschedule(issueEvent); + } + } else if (pkt->req->isAcquire()) { + // LoadAcq or AtomicAcq + // Invalidate the L1 cache + invL1(); + } + // Request was successful + if (m_outstanding_wb == 0) { + if (!issueEvent.scheduled()) { + DPRINTF(GPUCoalescer, "issueEvent Rescheduled\n"); + schedule(issueEvent, curTick()); + } + } + return RequestStatus_Issued; +} + +void +VIPERCoalescer::wbCallback(Addr addr) +{ + m_outstanding_wb--; + // if L1 Flush Complete + // attempt to schedule issueEvent + assert(((int) m_outstanding_wb) >= 0); + if (m_outstanding_wb == 0) { + for (auto it = kernelEndList.begin(); it != kernelEndList.end(); it++) { + newKernelEnds.push_back(it->first); + } + completeIssue(); + } + trySendRetries(); +} + +void +VIPERCoalescer::invCallback(Addr addr) +{ + m_outstanding_inv--; + // if L1 Flush Complete + // attempt to schedule issueEvent + // This probably won't happen, since + // we don't wait on cache invalidations + if (m_outstanding_wb == 0) { + for (auto it = kernelEndList.begin(); it != kernelEndList.end(); it++) { + newKernelEnds.push_back(it->first); + } + completeIssue(); + } + trySendRetries(); +} + +/** + * Invalidate L1 cache (Acquire) + */ +void +VIPERCoalescer::invL1() +{ + int size = m_dataCache_ptr->getNumBlocks(); + DPRINTF(GPUCoalescer, + "There are %d Invalidations outstanding before Cache Walk\n", + m_outstanding_inv); + // Walk the cache + for (int i = 0; i < size; i++) { + Addr addr = m_dataCache_ptr->getAddressAtIdx(i); + // Evict Read-only data + std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>( + clockEdge(), addr, (uint8_t*) 0, 0, 0, + RubyRequestType_REPLACEMENT, RubyAccessMode_Supervisor, + nullptr); + assert(m_mandatory_q_ptr != NULL); + m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency); + m_outstanding_inv++; + } + DPRINTF(GPUCoalescer, + "There are %d Invalidations outstanding after Cache Walk\n", + m_outstanding_inv); +} + +/** + * Writeback L1 cache (Release) + */ +void +VIPERCoalescer::wbL1() +{ + int size = m_dataCache_ptr->getNumBlocks(); + DPRINTF(GPUCoalescer, + "There are %d Writebacks outstanding before Cache Walk\n", + m_outstanding_wb); + // Walk the cache + for (int i = 0; i < size; i++) { + Addr addr = m_dataCache_ptr->getAddressAtIdx(i); + // Write dirty data back + std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>( + clockEdge(), addr, (uint8_t*) 0, 0, 0, + RubyRequestType_FLUSH, RubyAccessMode_Supervisor, + nullptr); + assert(m_mandatory_q_ptr != NULL); + m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency); + m_outstanding_wb++; + } + DPRINTF(GPUCoalescer, + "There are %d Writebacks outstanding after Cache Walk\n", + m_outstanding_wb); +} + +/** + * Invalidate and Writeback L1 cache (Acquire&Release) + */ +void +VIPERCoalescer::invwbL1() +{ + int size = m_dataCache_ptr->getNumBlocks(); + // Walk the cache + for (int i = 0; i < size; i++) { + Addr addr = m_dataCache_ptr->getAddressAtIdx(i); + // Evict Read-only data + std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>( + clockEdge(), addr, (uint8_t*) 0, 0, 0, + RubyRequestType_REPLACEMENT, RubyAccessMode_Supervisor, + nullptr); + assert(m_mandatory_q_ptr != NULL); + m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency); + m_outstanding_inv++; + } + // Walk the cache + for (int i = 0; i < size; i++) { + Addr addr = m_dataCache_ptr->getAddressAtIdx(i); + // Write dirty data back + 
std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>( + clockEdge(), addr, (uint8_t*) 0, 0, 0, + RubyRequestType_FLUSH, RubyAccessMode_Supervisor, + nullptr); + assert(m_mandatory_q_ptr != NULL); + m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency); + m_outstanding_wb++; + } +} diff --git a/src/mem/ruby/system/VIPERCoalescer.hh b/src/mem/ruby/system/VIPERCoalescer.hh new file mode 100644 index 000000000..af6e44e7f --- /dev/null +++ b/src/mem/ruby/system/VIPERCoalescer.hh @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor + */ + +#ifndef __MEM_RUBY_SYSTEM_VI_COALESCER_HH__ +#define __MEM_RUBY_SYSTEM_VI_COALESCER_HH__ + +#include <iostream> + +#include "mem/protocol/PrefetchBit.hh" +#include "mem/protocol/RubyAccessMode.hh" +#include "mem/protocol/RubyRequestType.hh" +#include "mem/ruby/common/Address.hh" +#include "mem/ruby/common/Consumer.hh" +#include "mem/ruby/system/GPUCoalescer.hh" +#include "mem/ruby/system/RubyPort.hh" + +class DataBlock; +class CacheMsg; +class MachineID; +class CacheMemory; + +class VIPERCoalescerParams; + +class VIPERCoalescer : public GPUCoalescer +{ + public: + typedef VIPERCoalescerParams Params; + VIPERCoalescer(const Params *); + ~VIPERCoalescer(); + void wbCallback(Addr address); + void invCallback(Addr address); + RequestStatus makeRequest(PacketPtr pkt); + private: + void invL1(); + void wbL1(); + void invwbL1(); + uint64_t m_outstanding_inv; + uint64_t m_outstanding_wb; + uint64_t m_max_inv_per_cycle; + uint64_t m_max_wb_per_cycle; +}; +#endif // __MEM_RUBY_SYSTEM_VI_COALESCER_HH__ + diff --git a/src/mem/ruby/system/VIPERCoalescer.py b/src/mem/ruby/system/VIPERCoalescer.py new file mode 100644 index 000000000..05c74386f --- /dev/null +++ b/src/mem/ruby/system/VIPERCoalescer.py @@ -0,0 +1,45 @@ +# Copyright (c) 2015 Advanced Micro Devices, Inc. +# All rights reserved. 
+# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +# Authors: Steve Reinhardt +# Brad Beckmann + +from m5.params import * +from m5.proxy import * +from GPUCoalescer import * + +class VIPERCoalescer(RubyGPUCoalescer): + type = 'VIPERCoalescer' + cxx_class = 'VIPERCoalescer' + cxx_header = "mem/ruby/system/VIPERCoalescer.hh" + max_inv_per_cycle = Param.Int(32, "max invalidations per cycle") + max_wb_per_cycle = Param.Int(32, "max writebacks per cycle") + assume_rfo = False diff --git a/src/mem/ruby/system/WeightedLRUPolicy.cc b/src/mem/ruby/system/WeightedLRUPolicy.cc new file mode 100644 index 000000000..5baa4d9a5 --- /dev/null +++ b/src/mem/ruby/system/WeightedLRUPolicy.cc @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Derek Hower + */ + +#include "mem/ruby/system/WeightedLRUPolicy.hh" + +WeightedLRUPolicy::WeightedLRUPolicy(const Params* p) + : AbstractReplacementPolicy(p), m_cache(p->cache) +{ + m_last_occ_ptr = new int*[m_num_sets]; + for(unsigned i = 0; i < m_num_sets; i++){ + m_last_occ_ptr[i] = new int[m_assoc]; + for(unsigned j = 0; j < m_assoc; j++){ + m_last_occ_ptr[i][j] = 0; + } + } +} + +WeightedLRUPolicy * +WeightedLRUReplacementPolicyParams::create() +{ + return new WeightedLRUPolicy(this); +} + +WeightedLRUPolicy::~WeightedLRUPolicy() +{ + if (m_last_occ_ptr != NULL){ + for (unsigned i = 0; i < m_num_sets; i++){ + if (m_last_occ_ptr[i] != NULL){ + delete[] m_last_occ_ptr[i]; + } + } + delete[] m_last_occ_ptr; + } +} + +void +WeightedLRUPolicy::touch(int64_t set, int64_t index, Tick time) +{ + assert(index >= 0 && index < m_assoc); + assert(set >= 0 && set < m_num_sets); + + m_last_ref_ptr[set][index] = time; +} + +void +WeightedLRUPolicy::touch(int64_t set, int64_t index, Tick time, int occupancy) +{ + assert(index >= 0 && index < m_assoc); + assert(set >= 0 && set < m_num_sets); + + m_last_ref_ptr[set][index] = time; + m_last_occ_ptr[set][index] = occupancy; +} + +int64_t +WeightedLRUPolicy::getVictim(int64_t set) const +{ + Tick time, smallest_time; + int64_t smallest_index; + + smallest_index = 0; + smallest_time = m_last_ref_ptr[set][0]; + int smallest_weight = m_last_ref_ptr[set][0]; + + for (unsigned i = 1; i < m_assoc; i++) { + + int weight = m_last_occ_ptr[set][i]; + if (weight < smallest_weight) { + smallest_weight = weight; + smallest_index = i; + smallest_time = m_last_ref_ptr[set][i]; + } else if (weight == smallest_weight) { + time = m_last_ref_ptr[set][i]; + if (time < smallest_time) { + smallest_index = i; + smallest_time = time; + } + } + } + return smallest_index; +} diff --git a/src/mem/ruby/system/WeightedLRUPolicy.hh b/src/mem/ruby/system/WeightedLRUPolicy.hh new file mode 100644 index 000000000..3150779b2 --- /dev/null +++ b/src/mem/ruby/system/WeightedLRUPolicy.hh @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor + */ + +#ifndef __MEM_RUBY_SYSTEM_WEIGHTEDLRUPOLICY_HH__ +#define __MEM_RUBY_SYSTEM_WEIGHTEDLRUPOLICY_HH__ + +#include "mem/ruby/structures/AbstractReplacementPolicy.hh" +#include "mem/ruby/structures/CacheMemory.hh" +#include "params/WeightedLRUReplacementPolicy.hh" + +/* Simple true LRU replacement policy */ + +class WeightedLRUPolicy : public AbstractReplacementPolicy +{ + public: + typedef WeightedLRUReplacementPolicyParams Params; + WeightedLRUPolicy(const Params* p); + ~WeightedLRUPolicy(); + + void touch(int64_t set, int64_t way, Tick time); + void touch(int64_t set, int64_t way, Tick time, int occupancy); + int64_t getVictim(int64_t set) const override; + + bool useOccupancy() const { return true; } + + CacheMemory * m_cache; + int **m_last_occ_ptr; +}; + +#endif // __MEM_RUBY_SYSTEM_WeightedLRUPolicy_HH__ diff --git a/src/mem/ruby/system/WeightedLRUReplacementPolicy.py b/src/mem/ruby/system/WeightedLRUReplacementPolicy.py new file mode 100644 index 000000000..e7de33496 --- /dev/null +++ b/src/mem/ruby/system/WeightedLRUReplacementPolicy.py @@ -0,0 +1,45 @@ +# +# Copyright (c) 2013-2015 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
+# +# Author: Derek Hower +# + +from m5.params import * +from m5.proxy import * +from MemObject import MemObject +from ReplacementPolicy import ReplacementPolicy + +class WeightedLRUReplacementPolicy(ReplacementPolicy): + type = "WeightedLRUReplacementPolicy" + cxx_class = "WeightedLRUPolicy" + cxx_header = "mem/ruby/system/WeightedLRUPolicy.hh" + cache = Param.RubyCache("") diff --git a/src/mem/slicc/symbols/StateMachine.py b/src/mem/slicc/symbols/StateMachine.py index a530307ee..fc3f32c3d 100644 --- a/src/mem/slicc/symbols/StateMachine.py +++ b/src/mem/slicc/symbols/StateMachine.py @@ -35,13 +35,17 @@ import re python_class_map = { "int": "Int", + "NodeID": "Int", "uint32_t" : "UInt32", "std::string": "String", "bool": "Bool", "CacheMemory": "RubyCache", "WireBuffer": "RubyWireBuffer", "Sequencer": "RubySequencer", + "GPUCoalescer" : "RubyGPUCoalescer", + "VIPERCoalescer" : "VIPERCoalescer", "DirectoryMemory": "RubyDirectoryMemory", + "PerfectCacheMemory": "RubyPerfectCacheMemory", "MemoryControl": "MemoryControl", "MessageBuffer": "MessageBuffer", "DMASequencer": "DMASequencer", @@ -305,7 +309,7 @@ class $c_ident : public AbstractController void collateStats(); void recordCacheTrace(int cntrl, CacheRecorder* tr); - Sequencer* getSequencer() const; + Sequencer* getCPUSequencer() const; int functionalWriteBuffers(PacketPtr&); @@ -527,8 +531,14 @@ $c_ident::$c_ident(const Params *p) else: code('m_${{param.ident}} = p->${{param.ident}};') - if re.compile("sequencer").search(param.ident): - code('m_${{param.ident}}_ptr->setController(this);') + if re.compile("sequencer").search(param.ident) or \ + param.type_ast.type.c_ident == "GPUCoalescer" or \ + param.type_ast.type.c_ident == "VIPERCoalescer": + code(''' +if (m_${{param.ident}}_ptr != NULL) { + m_${{param.ident}}_ptr->setController(this); +} +''') code(''' @@ -670,6 +680,28 @@ $c_ident::init() assert(param.pointer) seq_ident = "m_%s_ptr" % param.ident + if seq_ident != "NULL": + code(''' +Sequencer* +$c_ident::getCPUSequencer() const +{ + if (NULL != $seq_ident && $seq_ident->isCPUSequencer()) { + return $seq_ident; + } else { + return NULL; + } +} +''') + else: + code(''' + +Sequencer* +$c_ident::getCPUSequencer() const +{ + return NULL; +} +''') + code(''' void @@ -796,12 +828,6 @@ $c_ident::getMemoryQueue() const return $memq_ident; } -Sequencer* -$c_ident::getSequencer() const -{ - return $seq_ident; -} - void $c_ident::print(ostream& out) const { |
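A final illustrative sketch for the WeightedLRUPolicy introduced above; the helper function and the touch() values are hypothetical. Note that getVictim() seeds its starting weight from m_last_ref_ptr[set][0] rather than m_last_occ_ptr[set][0], which looks like an oversight, but for the values below the outcome is the same: the way with the smaller recorded occupancy is evicted first, and equal occupancies fall back to the older reference time.

#include <cassert>

#include "mem/ruby/system/WeightedLRUPolicy.hh"

// Hypothetical helper showing the occupancy-aware touch() overload added by
// this patch feeding the victim selection.
void
examplePickVictim(WeightedLRUPolicy &policy)
{
    policy.touch(/*set=*/0, /*way=*/0, /*time=*/Tick(100), /*occupancy=*/4);
    policy.touch(/*set=*/0, /*way=*/1, /*time=*/Tick(200), /*occupancy=*/2);
    // Way 1 carries the smaller occupancy, so it is chosen even though it was
    // referenced more recently; a tie would favor the older timestamp.
    assert(policy.getVictim(/*set=*/0) == 1);
}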